Example #1
 def test_case_lowering(self):
     T = tokenizer.TweetTokenizer(preserve_case=False)
     text = 'This is a tweet with Upper Cases and :-D emoticon'  # make sure it doesn't lowercase the emoticon
     actual = T.tokenize(text)
     expected = [
         'this', 'is', 'a', 'tweet', 'with', 'upper', 'cases', 'and', ':-D',
         'emoticon'
     ]
     self.assertEqual(actual, expected)
Example #2
import tokenizer
import re

T = tokenizer.TweetTokenizer()

# Read a file of tweets and split it into tokens.
with open('nyusha_nyusha.txt', 'r', encoding='utf-8') as f:
    tweet = f.read()
tokens = T.tokenize(tweet)

# Classify each token with re.Scanner: patterns are tried in order and the
# first match wins.  Only non-capturing groups are used, because re.Scanner's
# action lookup misbehaves when lexicon patterns contain capturing groups.
scanner = re.Scanner([
    (r"(?:(?:https?://|www)|\w+\.\w{2,3})(?:[\w!#$&-;=?\-\[\]~]|%[0-9a-fA-F]{2})+", lambda scanner, token: ("URL", token)),
    (r"^\+?\d[-(]?\d{3}[)-]?\d{3}-?\d{2}-?\d{2}$", lambda scanner, token: ("PHONE", token)),
    (r"(?:@\w+)", lambda scanner, token: ("USER", token)),
    (r"(?:#\w+)", lambda scanner, token: ("HASHTAG", token)),
    (r"[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]", lambda scanner, token: ("EMAILS", token)),
    (r"<[^>\s]+>", lambda scanner, token: ("HTML-TAGS", token)),
    (r"-+>|<-+", lambda scanner, token: ("ASCII_ARROWS", token)),
    (r"#(?=\w+)", lambda scanner, token: ("HASH", token)),
    (r"^RT", lambda scanner, token: ("RETWIT", token)),
    (r"[😎😘☺😍❤☝🏻🙈🙌✨😋😳😌🐾✈💋😽☀💪👉🎬😂😇✊☕😜😬🔥😀]", lambda scanner, token: ("EMOJI", token)),
    (r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP*/\:\}\{@\|\\]|[\)\]\(\[dDpPсСрР3/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)", lambda scanner, token: ("SMILE", token)),
    (r'[!$%&",.;:\-?*«»()]+', lambda scanner, token: ("PUNCT", token)),
    (r"(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)|(?:\.(?:\s*\.){1,})|(?:\S)", lambda scanner, token: ("WORD", token)),
], re.UNICODE)

# Write one line per classified token: token index, tag, token text.
with open('output.txt', 'w', encoding='utf-8') as f2:
    for i, tok in enumerate(tokens):
        results, remainder = scanner.scan(tok)
        for tag, value in results:
            f2.write(str(i) + '\t' + tag + '\t' + value + '\r\n')
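
For reference, re.Scanner is an undocumented helper class in Python's re module: scan() tries the lexicon patterns in order at each position and returns a (results, remainder) pair, where remainder is the unmatched tail of the input. A minimal sketch with a reduced pattern set and a made-up input string:

import re

demo = re.Scanner([
    (r"@\w+", lambda scanner, token: ("USER", token)),
    (r"#\w+", lambda scanner, token: ("HASHTAG", token)),
    (r"\s+", None),  # a None action silently drops the match
    (r"\S+", lambda scanner, token: ("WORD", token)),
], re.UNICODE)

results, remainder = demo.scan("@nyusha check out #music")
print(results)    # [('USER', '@nyusha'), ('WORD', 'check'), ('WORD', 'out'), ('HASHTAG', '#music')]
print(remainder)  # '' -> the whole string was consumed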
Example #3

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pandas as pd
import tokenizer
# TextPreProcessor and SocialTokenizer are assumed to come from the ekphrasis package.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

# Keep negation words in the vocabulary: they carry sentiment.
notstopwords = set(('not', 'no', 'mustn', "mustn't"))
stopwords = set(stopwords.words('english')) - notstopwords
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user'],
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize)
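
# Usage sketch, assuming TextPreProcessor/SocialTokenizer are ekphrasis classes:
# the configured processor is applied per document and returns a list of
# normalized tokens, e.g.
#   text_processor.pre_process_doc("@user I looove it! http://t.co/x")
# yields roughly ['<user>', 'i', 'love', 'it', '!', '<url>'].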

lemmatizer = WordNetLemmatizer()
T = tokenizer.TweetTokenizer(preserve_handles=False,
                             preserve_hashes=False,
                             preserve_case=False,
                             preserve_url=False,
                             regularize=True)


def data_preprocessing(path_tweets):
    tweets = pd.read_csv(path_tweets, encoding='utf-8', sep=',')
    # standardization() is defined elsewhere in the original module.
    tweets['text'] = tweets['text'].apply(standardization)
    # Map airline_sentiment to integer labels: negative=0, neutral=1, positive=2.
    tweets['sentiment'] = tweets['airline_sentiment'].apply(
        lambda x: 0 if x == 'negative' else (1 if x == 'neutral' else 2))
    return tweets['text'], tweets['sentiment']


def data_preprocessing(path_tweets, corpora):
    data = pd.read_csv(path_tweets,
                       encoding='utf-8',
Example #4
 def test_regularization(self):
     T = tokenizer.TweetTokenizer(regularize=True)
     text = "I'd've had to figure this out"
     actual = T.tokenize(text)
     expected = ['I', 'would', 'have', 'had', 'to', 'figure', 'this', 'out']
     self.assertEqual(actual, expected)
Example #5
 def test_shortening(self):
     T = tokenizer.TweetTokenizer(preserve_len=False)
     text = 'This is a loooooong tweettttt'
     actual = T.tokenize(text)
     expected = ['This', 'is', 'a', 'looong', 'tweettt']
     self.assertEqual(actual, expected)
Example #6
 def test_url_removal(self):
     T = tokenizer.TweetTokenizer(preserve_url=False)
     text = 'this is a url https://t.co/1234_MOD tweet'
     actual = T.tokenize(text)
     expected = ['this', 'is', 'a', 'url', 'tweet']
     self.assertEqual(actual, expected)
Example #7
 def test_handle_removal(self):
     T = tokenizer.TweetTokenizer(preserve_handles=False)
     text = '@somehandle a tweet at that person'
     actual = T.tokenize(text)
     expected = ['a', 'tweet', 'at', 'that', 'person']
     self.assertEqual(actual, expected)
Example #8
 def test_hash_removal(self):
     T = tokenizer.TweetTokenizer(preserve_hashes=False)
     text = 'this has #hash_tag and # separately'
     actual = T.tokenize(text)
     expected = ['this', 'has', 'hash_tag', 'and', '#', 'separately']
     self.assertEqual(actual, expected)
Example #9
 def setUp(self):
     self.T = tokenizer.TweetTokenizer()
Example #10
 def test_emoji(self):
     T = tokenizer.TweetTokenizer(preserve_emoji=False)
     text = "This is a tweet with😊"  #no space between text and emoji
     actual = T.tokenize(text)
     expected = ['This', 'is', 'a', 'tweet', 'with']
     self.assertEqual(actual, expected)
Example #11
 def setUp(self):
     self.T = tokenizer.TweetTokenizer()
     self.redditT = tokenizer.RedditTokenizer()
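
The options exercised by these tests can be combined on a single TweetTokenizer; a minimal sketch (the sample tweet is made up and the expected result is only inferred from the individual tests above):

import tokenizer

T = tokenizer.TweetTokenizer(preserve_case=False,
                             preserve_handles=False,
                             preserve_url=False,
                             regularize=True)
# Inferred from the tests above: handle and URL dropped, text lowercased,
# "I'd've" expanded, the emoticon left intact.
print(T.tokenize("@someone I'd've read it by now https://t.co/1234_MOD :-D"))
# -> something like ['i', 'would', 'have', 'read', 'it', 'by', 'now', ':-D']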