Example #1
    def transform_one(self, d):
        transformed = []
        if isinstance(d, dict):
            text = d['content']
        else:
            text = d
            d = {}

        # toks = twokenize.tokenizeRawTweetText(text)
        gettokens = Tokenizer()
        toks = gettokens.tokenize(text)

        for tok in toks:
            if self.re_url.match(tok):
                transformed.append('_url_')
            elif tok.startswith('@'):
                transformed.append('@mention')
            else:
                transformed.append(tok)

        if not self.ignore_topics_:
            topic = d.get('topic')
            text = u' '.join(transformed)
            if topic:
                start = 0
                end = len(text)
                i = text.lower().find(topic.lower())
                if i > -1:
                    matches = [
                        m.end() for m in self.re_punctuation.finditer(text[:i])
                    ]
                    if matches: start = matches[-1]
                    m = self.re_punctuation.search(text[(i + len(topic)):])
                    if m: end = m.start() + i + len(topic)

                transformed = [u'topic=' + topic] + text[start:end].split()

        return transformed
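The topic-window logic above clips the tweet to the stretch of text between the punctuation marks surrounding the topic mention. A self-contained sketch of just that step (the punctuation pattern here is an assumption standing in for self.re_punctuation):

import re

re_punctuation = re.compile(r'[.!?]')  # assumed stand-in for self.re_punctuation

def topic_window(text, topic):
    # Clip text to the span between the punctuation marks that surround
    # the first (case-insensitive) occurrence of topic, as above.
    start, end = 0, len(text)
    i = text.lower().find(topic.lower())
    if i > -1:
        matches = [m.end() for m in re_punctuation.finditer(text[:i])]
        if matches:
            start = matches[-1]
        m = re_punctuation.search(text[i + len(topic):])
        if m:
            end = m.start() + i + len(topic)
    return [u'topic=' + topic] + text[start:end].split()

print(topic_window(u'great game. python is fun! see you', u'python'))
# -> [u'topic=python', u'python', u'is', u'fun']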
Example #2
def clean_tweet(tweet):
    """ Simple tweet preprocessing """
    gettokens = Tokenizer()
    tweet = " ".join(gettokens.tokenize(tweet))
    tweet = tweet.lower()
    # tweet = re.sub(tweet_reg, "", tweet)
    # tweet = re.sub("\d+", "", tweet)
    # tweet = tweet.lower().strip()
    tweet = [word for word in tweet.split() if word not in stopWords]
    tweet = [word for word in tweet if word not in string.punctuation]
    return tweet
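A self-contained illustration of the function above; the stopword set here is an assumption, since the real stopWords is loaded elsewhere:

import string
from tweetokenize import Tokenizer

stopWords = {'this', 'is', 'a', 'the'}  # assumed; the real list is loaded elsewhere
print(clean_tweet("This is a tweet http://t.co/abc #wow !"))
# -> something like ['tweet', 'url', '#wow'] ('!' is dropped as punctuation)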
Example #3
def Clean(text):
    gettokens = Tokenizer(usernames="", urls="", numbers="")
    t = gettokens.tokenize(text)
    tt = []

    for tok in t:
        # skip emoji tokens (the original blanked them in place, which left
        # empty strings in the token list); alternatively they could be
        # mapped to names via REV_UNICODE_EMOJI
        if tok in UNICODE_EMOJI:
            continue
        # keep only lowercased English tokens
        if isEnglish(tok):
            tt.append(tok.lower())

    return ' '.join(tt).encode('utf-8')
Example #4
def load_semeval_text_only(fname, delimiter=u'\t'):
    gettokens = Tokenizer()
    assert os.path.isfile(fname)
    rvdata = codecs.open(fname, encoding='utf-8').readlines()
    assert type(rvdata[0]) == unicode
    rvdata = [s.strip() for s in rvdata]
    assert len(rvdata) > 0
    numberofColumns = len(rvdata[0].split(delimiter))
    ids = [s.split(delimiter)[0] for s in rvdata]
    if numberofColumns == 3:
        y = [s.split(delimiter)[1] for s in rvdata]
        x = [s.split(delimiter)[2] for s in rvdata]

    elif numberofColumns == 4:
        y = [s.split(delimiter)[2] for s in rvdata]
        x = [s.split(delimiter)[3] for s in rvdata]
    else:
        xstartindex = [int(s.split(delimiter)[2]) for s in rvdata]
        xendindex = [int(s.split(delimiter)[3]) for s in rvdata]
        y = [s.split(delimiter)[4] for s in rvdata]
        x = [s.split(delimiter)[5] for s in rvdata]
        print rvdata[0]
        x = [
            u' '.join(s.split(u' ')[xs:xe + 1])
            for s, xs, xe in zip(x, xstartindex, xendindex)
        ]

    print 'total', len(rvdata), '@', fname
    print ids[0], y[0], x[0]
    return ids, y, x
Example #5
def tweet_tokens(tweet):
    '''
    Takes a tweet and replaces mentions, hashtags, urls, times, and numbers
    with a generic label
    INPUT: string
    OUTPUT: string
    '''

    gettokens = Tokenizer(usernames='USER',
                          urls='URL',
                          hashtags='HASHTAG',
                          times='TIME',
                          numbers='NUMBER',
                          allcapskeep=True,
                          lowercase=False)
    tokens = gettokens.tokenize(tweet)
    tweet = ' '.join(tokens)

    return tweet
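An illustrative call (the sample tweet is made up; Tokenizer is imported from tweetokenize as in the other examples):

print(tweet_tokens('@bob meet me at 5:30pm http://example.com #fun'))
# -> 'USER meet me at TIME URL HASHTAG'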
Example #6
class SocioLinguisticClassifier:
    def __init__(self):
        self.socling = SocioLinguistic()
        self.features_list = []
        self.features = {}
        self.gettokens = Tokenizer()

    def label_file_to_dict(self, filename):
        dict_x = {}
        with open(filename, "r") as f:
            for line in f:
                temp = line.split("||")
                name = temp[0].strip()
                label = temp[1].strip()
                dict_x[name] = label
        return dict_x

    def get_features(self, line, demographic):
        self.socling.sent = line
        self.socling.sent_list = self.gettokens.tokenize(line.upper())
        if demographic == "gend":
            self.socling.single_exclam()
            self.socling.pumping()
            self.socling.agreement()
            self.socling.affection()
        self.socling.emoticons()
        self.socling.excitement()
        self.socling.ellipses()
        self.socling.possessive_bigrams(self.features_list)
        self.socling.laugh()
        self.socling.shout()
        self.socling.exasperation()
        self.socling.honorifics()
        self.socling.slang()
        self.socling.pronouns()

    def initialize(self, demographic):
        self.features_list = set(
            self.socling.file_to_list("feature_files/feature_names_" + demographic))

    def reset_dictionary(self):
        self.features = {}
        for feature in self.features_list:
            self.features[feature] = 0

    def stacked_socling_init(self, demographic):
        self.features_list = set(
            self.socling.file_to_list("feature_files/feature_names_" + demographic))
        self.reset_dictionary()
        self.socling.features_dict = self.features
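A hypothetical driver for this class; SocioLinguistic, its feature files, and the behavior of features_dict are assumptions inferred from the calls above:

clf = SocioLinguisticClassifier()
clf.stacked_socling_init("gend")      # load feature names and zero the counts
clf.get_features("OMG I LOVE THIS SOOO MUCH!!!", "gend")
print(clf.socling.features_dict)      # assumed: feature name -> count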
Example #7
def load_semeval_text_only(fname, delimiter=u'\t'):
    '''
    :param fname: file name
    :param delimiter: field delimiter
    :return:
    ids: all ids
    y:   training labels
    x:   training text, lowercased and filtered
    '''
    gettokens = Tokenizer()
    assert os.path.isfile(fname)
    rvdata = codecs.open(fname, encoding='utf-8').readlines()
    assert type(rvdata[0]) == unicode
    rvdata = [s.strip() for s in rvdata]
    assert len(rvdata) > 0
    numberofColumns = len(rvdata[0].split(delimiter))
    ids = [s.split(delimiter)[0] for s in rvdata]
    if numberofColumns == 3:
        y = [s.split(delimiter)[1] for s in rvdata]
        x = [s.split(delimiter)[2] for s in rvdata]

    elif numberofColumns == 4:
        y = [s.split(delimiter)[2] for s in rvdata]
        x = [s.split(delimiter)[3] for s in rvdata]
    else:
        xstartindex = [int(s.split(delimiter)[2]) for s in rvdata]
        xendindex = [int(s.split(delimiter)[3]) for s in rvdata]
        y = [s.split(delimiter)[4] for s in rvdata]
        x = [s.split(delimiter)[5] for s in rvdata]
        print rvdata[0]
        x = [
            u' '.join(s.split(u' ')[xs:xe + 1])
            for s, xs, xe in zip(x, xstartindex, xendindex)
        ]

    print 'total', len(rvdata), '@', fname
    print ids[0], y[0], x[0]

    x = map(filterlineEmoji, x)
    #x=map(filterline,x)
    x = map(unicode.lower, x)
    return ids, y, x
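The helper filterlineEmoji is not shown in this snippet; a minimal sketch of what it might do, reusing the UNICODE_EMOJI mapping seen in Example #3 (exposed by older versions of the emoji package):

from emoji import UNICODE_EMOJI  # assumption: emoji < 2.0

def filterlineEmoji(line):
    # drop single-codepoint emoji characters from a unicode line
    return u''.join(ch for ch in line if ch not in UNICODE_EMOJI)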
Example #8

import json
from collections import defaultdict
from os import listdir
from os.path import isfile, join, basename
from time import time

import unidecode
from tweetokenize import Tokenizer

# Initialization (stopwords_file_path, politicians_info, tweet_stopwords,
# and the *_FILE_PATH constants are defined elsewhere in this script)
print("Initializing...")
t0 = time()
with open(stopwords_file_path) as f:
    stopwords_list = f.readlines()
with open(english_stopwords_file_path) as f:
    english_stopwords_list = f.readlines()
stopwords = [word.strip() for word in stopwords_list if word.strip()] + [
    word.strip() for word in english_stopwords_list if word.strip()
] + tweet_stopwords
politicians_sorted = sorted(list(politicians_info.keys()))
politician_tweets = defaultdict(list)
tweet_list = []
tokenizer = Tokenizer()
print("done in {:0.4f}s".format(time() - t0))

# Collect tweets from JSON
print("Collecting tweets...")
t0 = time()
tweets_so_far = 0
only_jsons = [
    f for f in listdir(TWEETS_DIRECTORY) if isfile(join(TWEETS_DIRECTORY, f))
    and f.endswith('.json') and not f == basename(POLITICIANS_INFO_FILE_PATH)
]
for tweet_file in only_jsons:
    with open(join(TWEETS_DIRECTORY, tweet_file)) as tf:
        tweets = json.load(tf)
        for tweet in tweets:
            tweet_list.append(unidecode.unidecode(tweet['text']))
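The snippet stops after collecting the raw tweets; a plausible next step, reusing the tokenizer and stopword list built above:

# Hypothetical continuation: tokenize the collected tweets and drop stopwords.
print("Tokenizing tweets...")
t0 = time()
tokenized_tweets = [
    [tok for tok in tokenizer.tokenize(tweet) if tok not in stopwords]
    for tweet in tweet_list
]
print("done in {:0.4f}s".format(time() - t0))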
Example #9
# function to replace broken/mis-encoded punctuation
def fix_other(text):
    # text is a list of strings; normalize em/en dashes and ellipsis chars
    for i in range(len(text)):
        if text[i] == u'\u2014' or text[i] == u'\u2013':
            text[i] = "-"
        # the original test used str.find(), whose -1 "not found" result is
        # truthy; substring membership is the correct check
        if u'\u2026' in text[i]:
            text[i] = text[i].replace(u'\u2026', "...")
    return text


#configure tweetokenize Tokenizer
tknzr = Tokenizer(lowercase = False,
                  allcapskeep = True,
                  normalize = False,
                  usernames = 'USERNAME',
                  urls = 'URL',
                  hashtags = 'HASHTAG',
                  ignorequotes = False,
                  ignorestopwords = False)
                  
tknzr.emoticons(filename="emoticons.txt")


#input and output filepaths
#pretoken_filepath = 'practice-data/tweet_tweet.csv'
pretoken_filepath = 'project-data/twitter_tweet.csv'
posttoken_filepath = 'preprocessed-data/tweet_tweet_pp.csv'
text_index = 3


#read from input, tokenize, write to output
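A hedged sketch of that final step, assuming a plain CSV layout where column text_index holds the tweet text (Python 2 csv handling, to match the rest of the script):

import csv

with open(pretoken_filepath, 'rb') as fin, open(posttoken_filepath, 'wb') as fout:
    reader = csv.reader(fin)
    writer = csv.writer(fout)
    for row in reader:
        text = fix_other([row[text_index].decode('utf-8')])[0]
        row[text_index] = ' '.join(tknzr.tokenize(text)).encode('utf-8')
        writer.writerow(row)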
Example #10
import json
import time

from tweetokenize import Tokenizer
from nltk.tokenize.moses import MosesDetokenizer

detokenizer = MosesDetokenizer()
expanded_words = {
    "tbh": "to be honest",
    "lgtm": "looks good to me",
    "r+": "Review",
    "wc": "Welcome",
    "btw": "by the way"
}
with open("/Users/hiteshsapkota/Desktop/ICSETrust/Data/shortcodeemoji.json"
          ) as outfile:
    shortcodeemoji = json.load(outfile)
gettokens = Tokenizer()


def expandwords(comment):
    # Replace known abbreviations with their expansions. Note that
    # `key in word.lower()` is a substring test, so a key embedded in a
    # longer word also triggers its expansion.
    words = []
    keys = list(expanded_words)
    for word in comment.split():
        present = False
        for key in keys:
            if key in word.lower():
                present = True
                words.append(expanded_words[key])
        if present is False:
            words.append(word)
    return ' '.join(words)
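A quick check of the substring-based expansion:

print(expandwords("lgtm btw"))
# -> 'looks good to me by the way'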
Example #11
    def setUp(self):
        self.tokenizer = Tokenizer(lowercase=True)
Example #12
class TokenizeTests(unittest.TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer(lowercase=True)
    
    def test_general_1(self):
        self.tokenizer.normalize = 2
        msg = ('omg wow < & > >.< >.< :):)'
               'i CANT believe thatttt haha lol!!1')
        tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)',
               'i', 'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_general_2(self):
        msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
        tks = ["i'm", 'wanting', 'to', 'jump', 'up', 'and', 'down',
               'but', "wouldn't", 'if', 'i', "couldn't", '...']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_1(self):
        msg = "hey bro chec'k out http://shitstorm.com its f*****g sick"
        tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'f*****g', 'sick']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_2(self):
        msg = 'also see this crazy stuff https://shitstorm.com'
        tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_3(self):
        msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
        tks = ['hiii', 'URL', 'and', 'other', 'URL', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_usernames_1(self):
        msg = '@justinbeiber yo man!! ! i love you in a totally straight way <3:p:D'
        tks = ['USERNAME', 'yo', 'man', '!', '!', '!',
               'i', 'love', 'you', 'in', 'a', 'totally', 'straight', 'way',
               '<3', ':p', ':D']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_usernames_2(self):
        msg = '@heyheymango: what did you SAYYY??? or did you just..  NotHING?'
        tks = ['USERNAME', ':', 'what', 'did', 'you', 'SAYYY', '?',
               '?', '?', 'or', 'did', 'you', 'just', '...', 'nothing', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_1(self):
        self.tokenizer.numbers = None
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', '-2.42', 'in',
               'my', 'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you',
               'have', 'mucho', '+88e44', 'and', 'its', 'about', '1000%',
               'more', 'than', '$400', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_2(self):
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
               'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', 'NUMBER', 'in',
               'my', 'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you',
               'have', 'mucho', 'NUMBER', 'and', 'its', 'about', 'NUMBER',
               'more', 'than', 'NUMBER', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_3(self):
        self.tokenizer.lowercase = False  # keep cases the same everywhere
        msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
               ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
               'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
        tks = ['I', 'JUST', 'want', 'To', 'Test', 'FRACTIONZZZ',
               'NUMBER', 'also', 'lowercase', 'etc', '.', 'etc', '.', 'etc',
               '.', 'hope', 'that', 'last', 'part', "doesn't", 'parse', 'as',
               'a', 'url', 'i', 'would', 'be', 'kinda', 'sad', 'PANda', '!',
               'zsss', '...', '...', '.', '...', 'NUMBER', 'NUMBER', 'NUMBER',
               'NUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_time_1(self):
        msg = 'is the time now 12:14pm? or is it like 2:42AM??'
        tks = ['is', 'the', 'time', 'now', 'TIME', '?', 'or', 'is',
               'it', 'like', 'TIME', '?', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_time_2(self):
        msg = 'new time is 2:42:09 PM!!'
        tks = ['new', 'time', 'is', 'TIME', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_phonenumber_1(self):
        msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
               ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
        tks = ['my', 'number', 'is', 'PHONENUMBER', 'and', 'PHONENUMBER',
               'also', 'PHONENUMBER', 'and', 'PHONENUMBER', 'or', 'PHONENUMBER',
               'is', 'PHONENUMBER', 'a', 'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_phonenumber_2(self):
        msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
        tks = ['numbers', 'with', 'extension', ':', 'PHONENUMBER', 'or', 'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_1(self):
        self.tokenizer.ignorequotes = True
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_2(self):
        self.tokenizer.ignorequotes = False
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
               'said', 'something', 'funny', '"', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_3(self):
        self.tokenizer.ignorequotes = True
        msg = 'some stuff but he said “yea i know its crazy”other stuff...!!! '
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_4(self):
        self.tokenizer.ignorequotes = True
        msg = 'some stuff but he said &ldquo;yea i know its crazy&rdquo;other stuff...!!! '
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff', '...', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_5(self):
        self.tokenizer.ignorequotes = False
        msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
        tks = ['heyy', 'buddyyy', 'boy', "'", 'do', 'you', 'the',
               "lady's", 'kitty', 'like', 'that', '?', '?', "'"]
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_hashtags_1(self):
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
               '#things', '#so', '#f*****g', '#much', '!', '!', '!', '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_hashtags_2(self):
        self.tokenizer.hashtags = 'HASHTAG'
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG',
               'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!', '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoticons_1(self):
        msg = 'heyyyyyy:):):(>.<<v.vwhats up man LOL T.T tomcat.tomcat :$;).!!!'
        tks = ['heyyy', ':)', ':)', ':(', '>.<', '<', 'v.v', 'whats',
               'up', 'man', 'LOL', 'T.T', 'tomcat', '.', 'tomcat', ':$',
               ';)', '.', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_1(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '#90s', '#ilove90s', '#allthat', '#amandashow',
               'URL', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_2(self):
        self.tokenizer.usernames = ""  # don't want any usernames to show
        self.tokenizer.hashtags = ""  # or hashtags
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', 'URL', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_3(self):
        self.tokenizer.usernames = False  # keep usernames
        self.tokenizer.urls = ""  # URLs should be removed
        self.tokenizer.hashtags = "$$$"  # hashtags should be $$$
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
               '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '@arnold', '@nickelodeon', '$$$', '$$$', '$$$',
               '$$$', '@rocko', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoji_1(self):
        msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙  '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...',
               'USERNAME', '\U0001f1e8\U0001f1f3', 'ONE', '+', 'BRO', '#love',
               '\U0001f618', '\U0001f635', '\U0001f49a', '\U0001f49b',
               '\U0001f49c', '\U0001f499', '\U0001f48b', '\U0001f602',
               '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoji_2(self):
        msg = ('hey mate!:):3.....@andONE+🇬🇧  BRO#love😘😵💚💛💜💙  '
               '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...',
               'USERNAME', '+', '\U0001f1ec\U0001f1e7', 'BRO', '#love', '😘',
               '😵', '\U0001f49a', '\U0001f49b', '\U0001f49c',
               '\U0001f499', '💋', '\U0001f602', '\U0001f602',
               'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def _test_emoji_3(self):
        msg = ('🚀=)</3O_O:$D:<:-@\xf0\x9f\x98\xb7🔥💩💅 outdated:💽 ancient:💾 '
               '#getwiththecloud:💻 and it looks like 💭')
        tks = ['\U0001f680', '=)', '</3', 'O_O', ':$', 'D:<', ':-@',
               '\U0001f637', '\U0001f525', '\U0001f4a9', '\U0001f485',
               'outdated', ':', '\U0001f4bd', 'ancient', ':',
               '\U0001f4be', '#getwiththecloud',
               ':', '\U0001f4bb', 'and', 'it', 'looks', 'like', '\U0001f4ad']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_accent_1(self):
        msg = 'hola! cómo estás?'
        tks = ['hola', '!', 'cómo', 'estás', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_accent_2(self):
        self.tokenizer.ignoreaccents = True
        msg = 'hola! cómo estás? ANDRÉS'
        tks = ['hola', '!', 'como', 'estas', '?', 'ANDRES']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def _test_email_1(self):
        self.tokenizer.mail = 'MAIL'
        msg = 'write me to [email protected]'
        tks = ['write', 'me', 'to', 'MAIL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_split_hashtag_1(self):
        self.tokenizer.splithashtag = True
        self.tokenizer.lowercase = False
        msg = 'hola! #EstoEsUnSaludo'
        tks = ['hola', '!', '#', 'EstoEsUnSaludo']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)

    def test_error_1(self):
        msg = []
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)
    
    def test_error_2(self):
        msg = lambda x: x
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)
    
    def test_actual_tweets_1(self):
        """Number as part of name"""
        msg = '@LoganTillman not 2pac and floyd mayweather'
        tks = ['USERNAME', 'not', '2pac', 'and', 'floyd', 'mayweather']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_actual_tweets_2(self):
        """Colon no space in hashtag"""
        msg = '#MentionSomeoneYoureGladYouMet: @LarryWorld_Wide of course.'
        tks = ['#MentionSomeoneYoureGladYouMet', ':', 'USERNAME', 'of', 'course', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_stopwords_1(self):
        self.tokenizer.ignorestopwords = True
        msg = 'i like myself and my so not much and our something he:)'
        tks = ['like', 'much', 'something', ':)']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
Example #14
class TokenizeTests(unittest.TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer(lowercase=True)
    
    def test_general_1(self):
        self.tokenizer.normalize = 2
        msg = ('omg wow &#x3c; &#x26; &#x3e; &#62;.&#60; &gt;.&lt; :):)'
        'i CANT believe thatttt haha lol!!1')
        tks = ['omg', 'wow', '<', '&', '>', '>.<', '>.<', ':)', ':)',
        'i', 'CANT', 'believe', 'thatt', 'haha', 'lol', '!', '!', '1']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_general_2(self):
        msg = "i'm wanting to jump up and down but wouldn't if i couldn't.."
        tks = ["i'm", 'wanting', 'to', 'jump', 'up', 'and', 'down',
        'but', "wouldn't", 'if', 'i', "couldn't", '...']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_1(self):
        msg = ("hey bro chec'k out http://shitstorm.com its f*****g sick")
        tks = ['hey', 'bro', "chec'k", 'out', 'URL', 'its', 'f*****g', 'sick']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_2(self):
        msg = ('also see this crazy stuff https://shitstorm.com')
        tks = ['also', 'see', 'this', 'crazy', 'stuff', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_urls_3(self):
        msg = 'hiiiii rayj.com/ihititfirst and other google.com http://hobo.net'
        tks = ['hiii', 'URL', 'and', 'other', 'URL', 'URL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_usernames_1(self):
        msg = ('@justinbeiber yo man!! ! i love you in a totally '
        'straight way <3:p:D')
        tks = ['USERNAME', 'yo', 'man', '!', '!', '!',
        'i', 'love', 'you', 'in', 'a', 'totally', 'straight', 'way',
        '<3', ':p', ':D']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_usernames_2(self):
        msg = '@heyheymango: what did you SAYYY??? or did you just..  NotHING?'
        tks = ['USERNAME', ':', 'what', 'did', 'you', 'SAYYY', '?',
        '?', '?', 'or', 'did', 'you', 'just', '...', 'nothing', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_1(self):
        self.tokenizer.numbers = None
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
        'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', '-2.42', 'in',
        'my', 'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you',
        'have', 'mucho', '+88e44', 'and', 'its', 'about', '1000%',
        'more', 'than', '$400', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_2(self):
        msg = ('i have this much money -2.42 in my bank acct.,friend! but you '
        'have mucho +88e44 and its about 1000% more than $400.')
        tks = ['i', 'have', 'this', 'much', 'money', 'NUMBER', 'in',
        'my', 'bank', 'acct', '.', ',', 'friend', '!', 'but', 'you',
        'have', 'mucho', 'NUMBER', 'and', 'its', 'about', 'NUMBER',
        'more', 'than', 'NUMBER', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_numbers_3(self):
        self.tokenizer.lowercase = False # keep cases the same everywhere
        msg = ('I JUST want To Test FRACTIONZZZ 22432.41414/ 55894385e-341 also'
        ' lowercase etc.etc.etc. hope that last part doesn\'t parse as a url '
        'i would be kinda sad PANda!zsss..... .. . .... 4/5 5.1/4.0e0 3.14 -2')
        tks = ['I', 'JUST', 'want', 'To', 'Test', 'FRACTIONZZZ',
        'NUMBER', 'also', 'lowercase', 'etc', '.', 'etc', '.', 'etc',
        '.', 'hope', 'that', 'last', 'part', "doesn't", 'parse', 'as',
        'a', 'url', 'i', 'would', 'be', 'kinda', 'sad', 'PANda', '!',
        'zsss', '...', '...', '.', '...', 'NUMBER', 'NUMBER', 'NUMBER',
        'NUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_time_1(self):
        msg = 'is the time now 12:14pm? or is it like 2:42AM??'
        tks = ['is', 'the', 'time', 'now', 'TIME', '?', 'or', 'is',
        'it', 'like', 'TIME', '?', '?']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_time_2(self):
        msg = 'new time is 2:42:09 PM!!'
        tks = ['new', 'time', 'is', 'TIME', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_phonenumber_1(self):
        msg = ('my number is 18002432242 and 241.413.5584 also 1-242-156-6724'
        ' and (958)555-4875 or (999) 415 5542 is 422-5555 a 131-121-1441')
        tks = ['my', 'number', 'is', 'PHONENUMBER', 'and', 'PHONENUMBER',
        'also', 'PHONENUMBER', 'and', 'PHONENUMBER', 'or', 'PHONENUMBER',
        'is', 'PHONENUMBER', 'a', 'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_phonenumber_2(self):
        msg = 'numbers with extension: (201)-340-4915 x112 or 1 800.341.1311x99'
        tks = ['numbers', 'with', 'extension', ':', 'PHONENUMBER', 'or',
        'PHONENUMBER']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_1(self):
        self.tokenizer.ignorequotes = True
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_2(self):
        self.tokenizer.ignorequotes = False
        msg = 'this is just a tweet with "someone said something funny" lol'
        tks = ['this', 'is', 'just', 'a', 'tweet', 'with', '"', 'someone',
        'said', 'something', 'funny', '"', 'lol']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_3(self):
        self.tokenizer.ignorequotes = True
        msg = ('some stuff but he said “yea i know its crazy”other '
        'stuff...!!! ')
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff',
        '...', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_4(self):
        self.tokenizer.ignorequotes = True
        msg = ('some stuff but he said &ldquo;yea i know its crazy&rdquo;other '
        'stuff...!!! ')
        tks = ['some', 'stuff', 'but', 'he', 'said', 'other', 'stuff',
        '...', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_quotes_5(self):
        self.tokenizer.ignorequotes = False
        msg = 'heyy buddyyyyy boy \'do you the lady\'s kitty like that??\''
        tks = ['heyy', 'buddyyy', 'boy', "'", 'do', 'you', 'the',
        "lady's", 'kitty', 'like', 'that', '?', '?', "'"]
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_hashtags_1(self):
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', '#dog', '#cat', '#food', '#other',
        '#things', '#so', '#f*****g', '#much', '!', '!', '!', '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_hashtags_2(self):
        self.tokenizer.hashtags = 'HASHTAG'
        msg = 'omg i love#dog#cat#food#other#things#so#f*****g#much!!!11LOLOLOL'
        tks = ['omg', 'i', 'love', 'HASHTAG', 'HASHTAG', 'HASHTAG',
        'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', 'HASHTAG', '!', '!', '!',
        '11LOLOLOL']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoticons_1(self):
        msg = 'heyyyyyy:):):(>.<<v.vwhats up man LOL T.T tomcat.tomcat :$;).!!!'
        tks = ['heyyy', ':)', ':)', ':(', '>.<', '<', 'v.v', 'whats',
        'up', 'man', 'LOL', 'T.T', 'tomcat', '.', 'tomcat', ':$',
        ';)', '.', '!', '!', '!']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_1(self):
        self.tokenizer.usernames = "" # don't want any usernames to show
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
        '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '#90s', '#ilove90s', '#allthat', '#amandashow',
        'URL', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_2(self):
        self.tokenizer.usernames = "" # don't want any usernames to show
        self.tokenizer.hashtags = ""  # or hashtags
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
        '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', 'URL', '^.^', '>', '>', '>', '<', '<', '<',
        '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_removefeatures_3(self):
        self.tokenizer.usernames = False # keep usernames
        self.tokenizer.urls = ""         # URLs should be removed
        self.tokenizer.hashtags = "$$$"  # hashtags should be $$$
        msg = ('hey @arnold @nickelodeon #90s#ilove90s#allthat#amandashow'
        '@rocko http://en.wikipedia.org/wiki/The_Angry_Beavers ^.^>>><<<^.^')
        tks = ['hey', '@arnold', '@nickelodeon', '$$$', '$$$', '$$$',
        '$$$', '@rocko', '^.^', '>', '>', '>', '<', '<', '<', '^.^']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoji_1(self):
        msg = ('hey mate!:):3.....@and🇨🇳ONE+ BRO#love😘😵💚💛💜💙  '
        '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...',
        'USERNAME', '\U0001f1e8\U0001f1f3', 'ONE', '+', 'BRO', '#love',
        '\U0001f618', '\U0001f635', '\U0001f49a', '\U0001f49b',
        '\U0001f49c', '\U0001f499', '\U0001f48b', '\U0001f602',
        '\U0001f602', 'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoji_2(self):
        msg = ('hey mate!:):3.....@andONE+🇬🇧  BRO#love😘😵💚💛💜💙  '
        '💋😂😂LOLLLL.')
        tks = ['hey', 'mate', '!', ':)', ':3', '...',
        'USERNAME', '+', '\U0001f1ec\U0001f1e7', 'BRO', '#love', '😘',
        '😵', '\U0001f49a', '\U0001f49b', '\U0001f49c',
        '\U0001f499', '💋', '\U0001f602', '\U0001f602',
        'LOLLL', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_emoji_3(self):
        msg = ('🚀=)</3O_O:$D:<:-@\xf0\x9f\x98\xb7🔥💩💅 outdated:💽 ancient:💾 '
        '#getwiththecloud:💻 and it looks like 💭')
        tks = ['\U0001f680', '=)', '</3', 'O_O', ':$', 'D:<', ':-@',
        '\U0001f637', '\U0001f525', '\U0001f4a9', '\U0001f485',
        'outdated', ':', '\U0001f4bd', 'ancient', ':',
        '\U0001f4be', '#getwiththecloud',
        ':', '\U0001f4bb', 'and', 'it', 'looks', 'like', '\U0001f4ad']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_error_1(self):
        msg = []
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)
    
    def test_error_2(self):
        msg = lambda x: x
        with self.assertRaises(TypeError):
            self.tokenizer.tokenize(msg)
    
    def test_actual_tweets_1(self):
        "Number as part of name"
        msg = '@LoganTillman not 2pac and floyd mayweather'
        tks = ['USERNAME', 'not', '2pac', 'and', 'floyd', 'mayweather']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_actual_tweets_2(self):
        "Colon no space in hashtag"
        msg = '#MentionSomeoneYoureGladYouMet: @LarryWorld_Wide of course.'
        tks = ['#MentionSomeoneYoureGladYouMet', ':', 'USERNAME', 'of',
        'course', '.']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
    
    def test_stopwords_1(self):
        self.tokenizer.ignorestopwords = True
        msg = 'i like myself and my so not much and our something he:)'
        tks = ['like', 'much', 'something', ':)']
        self.assertEqual(self.tokenizer.tokenize(msg), tks)
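Either test class can be run with the standard library runner:

if __name__ == '__main__':
    unittest.main()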
Example #15
    def __init__(self):
        self.socling = SocioLinguistic()
        self.features_list = []
        self.features = {}
        self.gettokens = Tokenizer()
Example #16
from collections import Counter, defaultdict
import csv
from ipdb import set_trace
import numpy as np
import os
import re
import sys
import twokenize
from tweetokenize import Tokenizer
from yandex_translate import YandexTranslate, YandexTranslateException

# emoticon regex taken from Christopher Potts' script at http://sentiment.christopherpotts.net/tokenizing.html
emoticon_regex = r"""(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"""
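A quick illustration of what this pattern captures (re is already imported above):

print(re.findall(emoticon_regex, 'nice :) but also :-( and 8D'))
# [':)', ':-(', '8D']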

twk = Tokenizer(ignorequotes=False, usernames=False, urls=False)


def count_emoticon_polarity(message):
    """
        returns the number of positive, neutral and negative emoticons in message
    """
    emoticon_list = re.findall(emoticon_regex, message)
    polarity_list = []
    for emoticon in emoticon_list:
        if emoticon in ['8:', '::', 'p:']:
            continue  # these are false positives: '8:48', 'http:', etc
        polarity = emoticon_polarity(emoticon)
        polarity_list.append(polarity)
    emoticons = Counter(polarity_list)
    pos = emoticons[1]
    neu = emoticons[0]