def transform_one(self, d):
    transformed = []
    if isinstance(d, dict):
        text = d['content']
    else:
        text = d
        d = {}
    # toks = twokenize.tokenizeRawTweetText(text)
    gettokens = Tokenizer()
    toks = gettokens.tokenize(text)
    for tok in toks:
        if self.re_url.match(tok):
            transformed.append('_url_')
        elif tok.startswith('@'):
            transformed.append('@mention')
        else:
            transformed.append(tok)
    if not self.ignore_topics_:
        topic = d.get('topic')
        text = u' '.join(transformed)
        if topic:
            start = 0
            end = len(text)
            i = text.lower().find(topic.lower())
            if i > -1:
                matches = [
                    m.end() for m in self.re_punctuation.finditer(text[:i])
                ]
                if matches:
                    start = matches[-1]
                m = self.re_punctuation.search(text[(i + len(topic)):])
                if m:
                    end = m.start() + i + len(topic)
            transformed = [u'topic=' + topic] + text[start:end].split()
    return transformed
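# A hedged harness for trying transform_one above. The host class's re_url,
# re_punctuation, and ignore_topics_ attributes are assumptions here; the
# real class defines its own patterns.
import re
from tweetokenize import Tokenizer

class DemoTransformer(object):
    re_url = re.compile(r'https?://\S+')    # assumed URL pattern
    re_punctuation = re.compile(r'[.!?]')   # assumed sentence punctuation
    ignore_topics_ = False
    transform_one = transform_one           # reuse the function defined above

doc = {'content': 'Big day! Apple shipped a phone. Details later.',
       'topic': 'Apple'}
print(DemoTransformer().transform_one(doc))
# roughly: ['topic=Apple', 'apple', 'shipped', 'a', 'phone']
# (the default Tokenizer lowercases, and the window runs between the
# punctuation marks on either side of the topic mention)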
def clean_tweet(tweet):
    """ Simple tweet preprocessing """
    gettokens = Tokenizer()
    tweet = " ".join(gettokens.tokenize(tweet))
    tweet = tweet.lower()
    # tweet = re.sub(tweet_reg, "", tweet)
    # tweet = re.sub("\d+", "", tweet)
    # tweet = tweet.lower().strip()
    tweet = [word for word in tweet.split() if word not in stopWords]
    tweet = [word for word in tweet if word not in string.punctuation]
    return tweet
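# Usage sketch, assuming stopWords is a set of English stopwords (e.g. from
# NLTK, which is an assumption here) and the standard-library string module
# is imported:
import string
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))  # assumed source of stopWords

print(clean_tweet("Check THIS out http://example.com @friend #cool!!!"))
# approximately: ['check', 'url', 'username', '#cool']
# (the default Tokenizer substitutes URL/USERNAME, then stopword and
# punctuation tokens are dropped)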
def Clean(text):
    gettokens = Tokenizer(usernames="", urls="", numbers="")
    t = gettokens.tokenize(text)
    tt = []
    for i in range(len(t)):
        if t[i] in UNICODE_EMOJI:
            t[i] = ''
            # t[i] = REV_UNICODE_EMOJI[t[i]]
    for i in range(len(t)):
        if isEnglish(t[i]):
            tt.append(t[i].lower())
    return ' '.join(tt).encode('utf-8')
def load_semeval_text_only(fname, delimiter=u'\t'):
    gettokens = Tokenizer()
    assert os.path.isfile(fname)
    rvdata = codecs.open(fname, encoding='utf-8').readlines()
    assert type(rvdata[0]) == unicode
    rvdata = [s.strip() for s in rvdata]
    assert len(rvdata) > 0
    numberOfColumns = len(rvdata[0].split(delimiter))
    ids = [s.split(delimiter)[0] for s in rvdata]
    if numberOfColumns == 3:
        y = [s.split(delimiter)[1] for s in rvdata]
        x = [s.split(delimiter)[2] for s in rvdata]
    elif numberOfColumns == 4:
        y = [s.split(delimiter)[2] for s in rvdata]
        x = [s.split(delimiter)[3] for s in rvdata]
    else:
        xstartindex = [int(s.split(delimiter)[2]) for s in rvdata]
        xendindex = [int(s.split(delimiter)[3]) for s in rvdata]
        y = [s.split(delimiter)[4] for s in rvdata]
        x = [s.split(delimiter)[5] for s in rvdata]
        print rvdata[0]
        x = [
            u' '.join(s.split(u' ')[xs:xe + 1])
            for s, xs, xe in zip(x, xstartindex, xendindex)
        ]
    print 'total', len(rvdata), '@', fname
    print ids[0], y[0], x[0]
    return ids, y, x
def tweet_tokens(tweet):
    '''
    Takes a tweet and replaces mentions, hashtags, urls, times, and
    numbers with a generic label
    INPUT: string
    OUTPUT: string
    '''
    gettokens = Tokenizer(usernames='USER', urls='URL', hashtags='HASHTAG',
                          times='TIME', numbers='NUMBER',
                          allcapskeep=True, lowercase=False)
    tokens = gettokens.tokenize(tweet)
    tweet = ' '.join(tokens)
    return tweet
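# Usage sketch, consistent with the substitution behavior exercised by the
# unit tests later in this file (exact token boundaries depend on the
# tweetokenize version):
print(tweet_tokens('@bob check http://t.co/x at 12:14pm #cool'))
# roughly: 'USER check URL at TIME HASHTAG'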
class SocioLinguisticClassifier:

    def __init__(self):
        self.socling = SocioLinguistic()
        self.features_list = []
        self.features = {}
        self.gettokens = Tokenizer()

    def label_file_to_dict(self, filename):
        dict_x = {}
        for line in open(filename, "r"):
            # print line
            temp = line.split("||")
            name = temp[0].strip()
            label = temp[1].strip()
            dict_x[name] = label
        return dict_x

    def get_features(self, line, demographic):
        self.socling.sent = line
        self.socling.sent_list = self.gettokens.tokenize(line.upper())
        if demographic == "gend":
            self.socling.single_exclam()
            self.socling.pumping()
            self.socling.agreement()
            self.socling.affection()
            self.socling.emoticons()
            self.socling.excitement()
            self.socling.ellipses()
            self.socling.possessive_bigrams(self.features_list)
            self.socling.laugh()
            self.socling.shout()
            self.socling.exasperation()
            self.socling.honorifics()
            self.socling.slang()
            self.socling.pronouns()

    def initialize(self, demographic):
        self.features_list = set(self.socling.file_to_list(
            "feature_files/feature_names_" + demographic))

    def reset_dictionary(self):
        self.features = {}
        for feature in self.features_list:
            self.features[feature] = 0

    def stacked_socling_init(self, demographic):
        self.features_list = set(self.socling.file_to_list(
            "feature_files/feature_names_" + demographic))
        self.reset_dictionary()
        self.socling.features_dict = self.features
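# The label file format implied by label_file_to_dict is one "name || label"
# pair per line. A sketch with an assumed file path and contents (requires
# the SocioLinguistic dependency to be importable):
#
#   alice || F
#   bob || M
#
clf = SocioLinguisticClassifier()
labels = clf.label_file_to_dict("labels.txt")  # hypothetical path
# labels == {'alice': 'F', 'bob': 'M'}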
def load_semeval_text_only(fname, delimiter=u'\t'):
    '''
    :param fname: file name
    :param delimiter: delimiter
    :return: ids: all ids
             y: training labels
             x: training text, lower cased and filtered
    '''
    gettokens = Tokenizer()
    assert os.path.isfile(fname)
    rvdata = codecs.open(fname, encoding='utf-8').readlines()
    assert type(rvdata[0]) == unicode
    rvdata = [s.strip() for s in rvdata]
    assert len(rvdata) > 0
    numberOfColumns = len(rvdata[0].split(delimiter))
    ids = [s.split(delimiter)[0] for s in rvdata]
    if numberOfColumns == 3:
        y = [s.split(delimiter)[1] for s in rvdata]
        x = [s.split(delimiter)[2] for s in rvdata]
    elif numberOfColumns == 4:
        y = [s.split(delimiter)[2] for s in rvdata]
        x = [s.split(delimiter)[3] for s in rvdata]
    else:
        xstartindex = [int(s.split(delimiter)[2]) for s in rvdata]
        xendindex = [int(s.split(delimiter)[3]) for s in rvdata]
        y = [s.split(delimiter)[4] for s in rvdata]
        x = [s.split(delimiter)[5] for s in rvdata]
        print rvdata[0]
        x = [
            u' '.join(s.split(u' ')[xs:xe + 1])
            for s, xs, xe in zip(x, xstartindex, xendindex)
        ]
    print 'total', len(rvdata), '@', fname
    print ids[0], y[0], x[0]
    x = map(filterlineEmoji, x)
    # x = map(filterline, x)
    x = map(unicode.lower, x)
    return ids, y, x
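# Tab-separated layouts accepted by load_semeval_text_only, inferred from the
# column checks above ('?' marks a column the loader ignores; illustrative):
#
#   3 columns: id <TAB> label <TAB> text
#   4 columns: id <TAB> ?     <TAB> label <TAB> text
#   6 columns: id <TAB> ?     <TAB> start <TAB> end <TAB> label <TAB> text
#              (start/end select an inclusive token span within the text)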
# Initialization
print("Initializing...")
t0 = time()
with open(stopwords_file_path) as f:
    stopwords_list = f.readlines()
with open(english_stopwords_file_path) as f:
    english_stopwords_list = f.readlines()
stopwords = [word.strip() for word in stopwords_list if word.strip()] + [
    word.strip() for word in english_stopwords_list if word.strip()
] + tweet_stopwords
politicians_sorted = sorted(list(politicians_info.keys()))
politician_tweets = defaultdict(list)
tweet_list = []
tokenizer = Tokenizer()
print("done in {:0.4f}s".format(time() - t0))

# Collect tweets from JSON
print("Collecting tweets...")
t0 = time()
tweets_so_far = 0
only_jsons = [
    f for f in listdir(TWEETS_DIRECTORY)
    if isfile(join(TWEETS_DIRECTORY, f)) and f.endswith('.json')
    and not f == basename(POLITICIANS_INFO_FILE_PATH)
]
for tweet_file in only_jsons:
    with open(join(TWEETS_DIRECTORY, tweet_file)) as tf:
        tweets = json.load(tf)
    for tweet in tweets:
        tweet_list.append(unidecode.unidecode(tweet['text']))
# function to replace other broken encoding
def fix_other(text):
    for i in range(len(text)):
        if text[i] == u'\u2014' or text[i] == u'\u2013':
            text[i] = "-"
        # str.find() returns -1 when absent (truthy), so the original
        # truthiness test was wrong; check for the ellipsis explicitly
        if u'\u2026' in text[i]:
            text[i] = text[i].replace(u"\u2026", "...")
    return text

# configure tweetokenize Tokenizer
tknzr = Tokenizer(lowercase=False, allcapskeep=True, normalize=False,
                  usernames='USERNAME', urls='URL', hashtags='HASHTAG',
                  ignorequotes=False, ignorestopwords=False)
tknzr.emoticons(filename="emoticons.txt")

# input and output filepaths
# pretoken_filepath = 'practice-data/tweet_tweet.csv'
pretoken_filepath = 'project-data/twitter_tweet.csv'
posttoken_filepath = 'preprocessed-data/tweet_tweet_pp.csv'
text_index = 3

# read from input, tokenize, write to output
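# With the corrected ellipsis check, fix_other behaves deterministically on
# a token list:
print(fix_other([u'\u2014', u'wait\u2026', u'ok']))
# -> ['-', 'wait...', 'ok']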
from tweetokenize import Tokenizer
from nltk.tokenize.moses import MosesDetokenizer
import json
import time

detokenizer = MosesDetokenizer()

expanded_words = {
    "tbh": "to be honest",
    "lgtm": "looks good to me",
    "r+": "Review",
    "wc": "Welcome",
    "btw": "by the way"
}

with open("/Users/hiteshsapkota/Desktop/ICSETrust/Data/shortcodeemoji.json") as outfile:
    shortcodeemoji = json.load(outfile)

gettokens = Tokenizer()


def expandwords(comment):
    words = []
    keys = list(expanded_words.keys())
    for word in comment.split():
        present = False
        for key in keys:
            # note: substring match, so e.g. "wc" also fires inside longer words
            if key in word.lower():
                present = True
                words.append(expanded_words[key])
        if present is False:
            words.append(word)
    return ' '.join(words)
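# Usage sketch for expandwords; the output follows directly from the mapping
# above (note the substring-matching caveat for short keys):
print(expandwords("tbh the patch lgtm btw"))
# -> 'to be honest the patch looks good to me by the way'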
def setUp(self):
    self.tokenizer = Tokenizer(lowercase=True)
class TokenizeTests(unittest.TestCase):

    def setUp(self):
        self.tokenizer = Tokenizer(lowercase=True)

    def test_general_1(self):
        self.tokenizer.normalize = 2
        msg = ('omg wow < & > >.< >.< :):)'
               'i CANT believe thatttt haha lol!!1')
        tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)', 'i',
               'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_general_2(self):
        msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
        tks = ["i'm", 'wanting', 'to', 'jump', 'up', 'and', 'down', 'but',
               "wouldn't", 'if', 'i', "couldn't", '...']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_1(self):
        msg = "hey bro chec'k out http://shitstorm.com its f*****g sick"
        tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'f*****g', 'sick']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_2(self):
        msg = 'also see this crazy stuff https://shitstorm.com'
        tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_3(self):
        msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
        tks = ['hiii', 'URL', 'and', 'other', 'URL', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_usernames_1(self):
        msg = '@justinbeiber yo man!! ! i love you in a totally straight way <3:p:D'
        tks = ['USERNAME', 'yo', 'man', '!', '!', '!', 'i', 'love', 'you',
               'in', 'a', 'totally', 'straight', 'way', '<3', ':p', ':D']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_usernames_2(self):
        msg = '@heyheymango: what did you SAYYY??? or did you just.. NotHING?'
        tks = ['USERNAME', ':', 'what', 'did', 'you', 'SAYYY', '?', '?', '?',
               'or', 'did', 'you', 'just', '...', 'nothing', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_1(self):
        self.tokenizer.numbers = None
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', '-2.42', 'in', 'my',
               'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you', 'have',
               'mucho', '+88e44', 'and', 'its', 'about', '1000%', 'more',
               'than', '$400', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_2(self):
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', 'NUMBER', 'in', 'my',
               'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you', 'have',
               'mucho', 'NUMBER', 'and', 'its', 'about', 'NUMBER', 'more',
               'than', 'NUMBER', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_3(self):
        self.tokenizer.lowercase = False  # keep cases the same everywhere
        msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
               ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
               'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
        tks = ['I', 'JUST', 'want', 'To', 'Test', 'FRACTIONZZZ', 'NUMBER',
               'also', 'lowercase', 'etc', '.', 'etc', '.', 'etc', '.',
               'hope', 'that', 'last', 'part', "doesn't", 'parse', 'as', 'a',
               'url', 'i', 'would', 'be', 'kinda', 'sad', 'PANda', '!',
               'zsss', '...', '...', '.', '...', 'NUMBER', 'NUMBER',
               'NUMBER', 'NUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_time_1(self):
        msg = 'is the time now 12:14pm? or is it like 2:42AM??'
        tks = ['is', 'the', 'time', 'now', 'TIME', '?', 'or', 'is', 'it',
               'like', 'TIME', '?', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_time_2(self):
        msg = 'new time is 2:42:09 PM!!'
        tks = ['new', 'time', 'is', 'TIME', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_phonenumber_1(self):
        msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
               ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
        tks = ['my', 'number', 'is', 'PHONENUMBER', 'and', 'PHONENUMBER',
               'also', 'PHONENUMBER', 'and', 'PHONENUMBER', 'or',
               'PHONENUMBER', 'is', 'PHONENUMBER', 'a', 'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_phonenumber_2(self):
        msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
        tks = ['numbers', 'with', 'extension', ':', 'PHONENUMBER', 'or',
               'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_1(self):
        self.tokenizer.ignorequotes = True
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_2(self):
        self.tokenizer.ignorequotes = False
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
               'said', 'something', 'funny', '"', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_3(self):
        self.tokenizer.ignorequotes = True
        msg = 'some stuff but he said “yea i know its crazy”other stuff...!!! '
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_4(self):
        self.tokenizer.ignorequotes = True
        msg = 'some stuff but he said “yea i know its crazy”other stuff...!!! '
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_5(self):
        self.tokenizer.ignorequotes = False
        msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
        tks = ['heyy', 'buddyyy', 'boy', "'", 'do', 'you', 'the', "lady's",
               'kitty', 'like', 'that', '?', '?', "'"]
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_hashtags_1(self):
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
               '#things', '#so', '#f*****g', '#much', '!', '!', '!',
               '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_hashtags_2(self):
        self.tokenizer.hashtags = 'HASHTAG'
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG',
               'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!',
               '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoticons_1(self):
        msg = 'heyyyyyy:):):(>.<<v.vwhats up man LOL T.T tomcat.tomcat :$;).!!!'
        tks = ['heyyy', ':)', ':)', ':(', '>.<', '<', 'v.v', 'whats', 'up',
               'man', 'LOL', 'T.T', 'tomcat', '.', 'tomcat', ':$', ';)', '.',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_1(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '#90s', '#ilove90s', '#allthat', '#amandashow', 'URL',
               '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_2(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        self.tokenizer.hashtags = ""  # or hashtags
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', 'URL', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_3(self):
        self.tokenizer.usernames = False  # keep usernames
        self.tokenizer.urls = ""  # URLs should be removed
        self.tokenizer.hashtags = "$$$"  # hashtags should be $$$
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '@arnold', '@nickelodeon', '$$$', '$$$', '$$$', '$$$',
               '@rocko', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoji_1(self):
        msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙 '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...', 'USERNAME',
               '\U0001f1e8\U0001f1f3', 'ONE', '+', 'BRO', '#love',
               '\U0001f618', '\U0001f635', '\U0001f49a', '\U0001f49b',
               '\U0001f49c', '\U0001f499', '\U0001f48b', '\U0001f602',
               '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoji_2(self):
        msg = ('hey mate!:):3.....@andONE+🇬🇧 BRO#love😘😵💚💛💜💙 '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...', 'USERNAME', '+',
               '\U0001f1ec\U0001f1e7', 'BRO', '#love', '😘', '😵',
               '\U0001f49a', '\U0001f49b', '\U0001f49c', '\U0001f499', '💋',
               '\U0001f602', '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def _test_emoji_3(self):  # disabled test
        msg = ('🚀=)</3O_O:$D:<:-@\xf0\x9f\x98\xb7🔥💩💅 outdated:💽 ancient:💾 '
               '#getwiththecloud:💻 and it looks like 💭')
        tks = ['\U0001f680', '=)', '</3', 'O_O', ':$', 'D:<', ':-@',
               '\U0001f637', '\U0001f525', '\U0001f4a9', '\U0001f485',
               'outdated', ':', '\U0001f4bd', 'ancient', ':', '\U0001f4be',
               '#getwiththecloud', ':', '\U0001f4bb', 'and', 'it', 'looks',
               'like', '\U0001f4ad']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_accent_1(self):
        msg = 'hola! cómo estás?'
        tks = ['hola', '!', 'cómo', 'estás', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_accent_2(self):
        self.tokenizer.ignoreaccents = True
        msg = 'hola! cómo estás? ANDRÉS'
        tks = ['hola', '!', 'como', 'estas', '?', 'ANDRES']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def _test_email_1(self):  # disabled test
        self.tokenizer.mail = 'MAIL'
        msg = 'write me to [email protected]'
        tks = ['write', 'me', 'to', 'MAIL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_split_hashtag_1(self):
        self.tokenizer.splithashtag = True
        self.tokenizer.lowercase = False
        msg = 'hola! #EstoEsUnSaludo'
        tks = ['hola', '!', '#', 'EstoEsUnSaludo']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_error_1(self):
        msg = []
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)

    def test_error_2(self):
        msg = lambda x: x
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)

    def test_actual_tweets_1(self):
        """Number as part of name"""
        msg = '@LoganTillman not 2pac and floyd mayweather'
        tks = ['USERNAME', 'not', '2pac', 'and', 'floyd', 'mayweather']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_actual_tweets_2(self):
        """Colon no space in hashtag"""
        msg = '#MentionSomeoneYoureGladYouMet: @LarryWorld_Wide of course.'
        tks = ['#MentionSomeoneYoureGladYouMet', ':', 'USERNAME', 'of',
               'course', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_stopwords_1(self):
        self.tokenizer.ignorestopwords = True
        msg = 'i like myself and my so not much and our something he:)'
        tks = ['like', 'much', 'something', ':)']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
class TokenizeTests(unittest.TestCase):

    def setUp(self):
        self.tokenizer = Tokenizer(lowercase=True)

    def test_general_1(self):
        self.tokenizer.normalize = 2
        msg = ('omg wow < & > >.< >.< :):)'
               'i CANT believe thatttt haha lol!!1')
        tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)', 'i',
               'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_general_2(self):
        msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
        tks = ["i'm", 'wanting', 'to', 'jump', 'up', 'and', 'down', 'but',
               "wouldn't", 'if', 'i', "couldn't", '...']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_1(self):
        msg = "hey bro chec'k out http://shitstorm.com its f*****g sick"
        tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'f*****g', 'sick']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_2(self):
        msg = 'also see this crazy stuff https://shitstorm.com'
        tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_urls_3(self):
        msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
        tks = ['hiii', 'URL', 'and', 'other', 'URL', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_usernames_1(self):
        msg = ('@justinbeiber yo man!! ! i love you in a totally '
               'straight way <3:p:D')
        tks = ['USERNAME', 'yo', 'man', '!', '!', '!', 'i', 'love', 'you',
               'in', 'a', 'totally', 'straight', 'way', '<3', ':p', ':D']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_usernames_2(self):
        msg = '@heyheymango: what did you SAYYY??? or did you just.. NotHING?'
        tks = ['USERNAME', ':', 'what', 'did', 'you', 'SAYYY', '?', '?', '?',
               'or', 'did', 'you', 'just', '...', 'nothing', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_1(self):
        self.tokenizer.numbers = None
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', '-2.42', 'in', 'my',
               'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you', 'have',
               'mucho', '+88e44', 'and', 'its', 'about', '1000%', 'more',
               'than', '$400', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_2(self):
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', 'NUMBER', 'in', 'my',
               'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you', 'have',
               'mucho', 'NUMBER', 'and', 'its', 'about', 'NUMBER', 'more',
               'than', 'NUMBER', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_numbers_3(self):
        self.tokenizer.lowercase = False  # keep cases the same everywhere
        msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
               ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
               'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
        tks = ['I', 'JUST', 'want', 'To', 'Test', 'FRACTIONZZZ', 'NUMBER',
               'also', 'lowercase', 'etc', '.', 'etc', '.', 'etc', '.',
               'hope', 'that', 'last', 'part', "doesn't", 'parse', 'as', 'a',
               'url', 'i', 'would', 'be', 'kinda', 'sad', 'PANda', '!',
               'zsss', '...', '...', '.', '...', 'NUMBER', 'NUMBER',
               'NUMBER', 'NUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_time_1(self):
        msg = 'is the time now 12:14pm? or is it like 2:42AM??'
        tks = ['is', 'the', 'time', 'now', 'TIME', '?', 'or', 'is', 'it',
               'like', 'TIME', '?', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_time_2(self):
        msg = 'new time is 2:42:09 PM!!'
        tks = ['new', 'time', 'is', 'TIME', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_phonenumber_1(self):
        msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
               ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
        tks = ['my', 'number', 'is', 'PHONENUMBER', 'and', 'PHONENUMBER',
               'also', 'PHONENUMBER', 'and', 'PHONENUMBER', 'or',
               'PHONENUMBER', 'is', 'PHONENUMBER', 'a', 'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_phonenumber_2(self):
        msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
        tks = ['numbers', 'with', 'extension', ':', 'PHONENUMBER', 'or',
               'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_1(self):
        self.tokenizer.ignorequotes = True
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_2(self):
        self.tokenizer.ignorequotes = False
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
               'said', 'something', 'funny', '"', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_3(self):
        self.tokenizer.ignorequotes = True
        msg = ('some stuff but he said “yea i know its crazy”other '
               'stuff...!!! ')
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_4(self):
        self.tokenizer.ignorequotes = True
        msg = ('some stuff but he said “yea i know its crazy”other '
               'stuff...!!! ')
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_quotes_5(self):
        self.tokenizer.ignorequotes = False
        msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
        tks = ['heyy', 'buddyyy', 'boy', "'", 'do', 'you', 'the', "lady's",
               'kitty', 'like', 'that', '?', '?', "'"]
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_hashtags_1(self):
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
               '#things', '#so', '#f*****g', '#much', '!', '!', '!',
               '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_hashtags_2(self):
        self.tokenizer.hashtags = 'HASHTAG'
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG',
               'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!',
               '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoticons_1(self):
        msg = 'heyyyyyy:):):(>.<<v.vwhats up man LOL T.T tomcat.tomcat :$;).!!!'
        tks = ['heyyy', ':)', ':)', ':(', '>.<', '<', 'v.v', 'whats', 'up',
               'man', 'LOL', 'T.T', 'tomcat', '.', 'tomcat', ':$', ';)', '.',
               '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_1(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '#90s', '#ilove90s', '#allthat', '#amandashow', 'URL',
               '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_2(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        self.tokenizer.hashtags = ""  # or hashtags
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', 'URL', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_removefeatures_3(self):
        self.tokenizer.usernames = False  # keep usernames
        self.tokenizer.urls = ""  # URLs should be removed
        self.tokenizer.hashtags = "$$$"  # hashtags should be $$$
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '@arnold', '@nickelodeon', '$$$', '$$$', '$$$', '$$$',
               '@rocko', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoji_1(self):
        msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙 '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...', 'USERNAME',
               '\U0001f1e8\U0001f1f3', 'ONE', '+', 'BRO', '#love',
               '\U0001f618', '\U0001f635', '\U0001f49a', '\U0001f49b',
               '\U0001f49c', '\U0001f499', '\U0001f48b', '\U0001f602',
               '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoji_2(self):
        msg = ('hey mate!:):3.....@andONE+🇬🇧 BRO#love😘😵💚💛💜💙 '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...', 'USERNAME', '+',
               '\U0001f1ec\U0001f1e7', 'BRO', '#love', '😘', '😵',
               '\U0001f49a', '\U0001f49b', '\U0001f49c', '\U0001f499', '💋',
               '\U0001f602', '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_emoji_3(self):
        msg = ('🚀=)</3O_O:$D:<:-@\xf0\x9f\x98\xb7🔥💩💅 outdated:💽 ancient:💾 '
               '#getwiththecloud:💻 and it looks like 💭')
        tks = ['\U0001f680', '=)', '</3', 'O_O', ':$', 'D:<', ':-@',
               '\U0001f637', '\U0001f525', '\U0001f4a9', '\U0001f485',
               'outdated', ':', '\U0001f4bd', 'ancient', ':', '\U0001f4be',
               '#getwiththecloud', ':', '\U0001f4bb', 'and', 'it', 'looks',
               'like', '\U0001f4ad']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_error_1(self):
        msg = []
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)

    def test_error_2(self):
        msg = lambda x: x
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)

    def test_actual_tweets_1(self):
        """Number as part of name"""
        msg = '@LoganTillman not 2pac and floyd mayweather'
        tks = ['USERNAME', 'not', '2pac', 'and', 'floyd', 'mayweather']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_actual_tweets_2(self):
        """Colon no space in hashtag"""
        msg = '#MentionSomeoneYoureGladYouMet: @LarryWorld_Wide of course.'
        tks = ['#MentionSomeoneYoureGladYouMet', ':', 'USERNAME', 'of',
               'course', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_stopwords_1(self):
        self.tokenizer.ignorestopwords = True
        msg = 'i like myself and my so not much and our something he:)'
        tks = ['like', 'much', 'something', ':)']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
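# Standard unittest entry point for running either test class above when the
# module is executed as a script (assumed; the original runner is not shown):
if __name__ == '__main__':
    unittest.main()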
def __init__(self):
    self.socling = SocioLinguistic()
    self.features_list = []
    self.features = {}
    self.gettokens = Tokenizer()
from collections import Counter, defaultdict
import csv
from ipdb import set_trace
import numpy as np
import os
import re
import sys

import twokenize
from tweetokenize import Tokenizer
from yandex_translate import YandexTranslate, YandexTranslateException

# emoticon regex taken from Christopher Potts' script at
# http://sentiment.christopherpotts.net/tokenizing.html
emoticon_regex = r"""(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"""

twk = Tokenizer(ignorequotes=False, usernames=False, urls=False)


def count_emoticon_polarity(message):
    """
    returns the number of positive, neutral and negative emoticons in message
    """
    emoticon_list = re.findall(emoticon_regex, message)
    polarity_list = []
    for emoticon in emoticon_list:
        if emoticon in ['8:', '::', 'p:']:
            continue  # these are false positives: '8:48', 'http:', etc.
        polarity = emoticon_polarity(emoticon)
        polarity_list.append(polarity)
    emoticons = Counter(polarity_list)
    pos = emoticons[1]
    neu = emoticons[0]
    # the excerpt was cut off here; completion assumed from the docstring
    neg = emoticons[-1]
    return pos, neu, neg
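# emoticon_polarity is not defined in this excerpt; below is a purely
# illustrative stand-in plus a sample call (the real polarity assignments
# may differ):
def emoticon_polarity(emoticon):
    if any(c in emoticon for c in ')]}DP'):   # smiling mouths
        return 1
    if any(c in emoticon for c in '([{\\/'):  # frowning mouths
        return -1
    return 0

print(count_emoticon_polarity('good day :) but then :( and also :|'))
# -> (1, 1, 1) with this stand-in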