def tokenize_data(df_comments, ignore_stopwords=True, keepcaps=False,
                  decontract=True, remove_punct=True):
    if ignore_stopwords:
        ignore_stopwords = 'english'
    tokenizer = CrazyTokenizer(ignore_stopwords=ignore_stopwords,
                               keepcaps=keepcaps,
                               subreddits='',
                               reddit_usernames='',
                               emails='',
                               urls='',
                               decontract=decontract,
                               remove_punct=remove_punct)
    tokens = []
    for i in tqdm(range(df_comments.shape[0])):
        current_tokens = tokenizer.tokenize(
            df_comments.iloc[i, df_comments.columns.get_loc('body')])
        tokens.append(current_tokens)
    df_comments['tokens'] = tokens
    return df_comments
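A minimal usage sketch for tokenize_data; the toy DataFrame and its contents below are hypothetical, and it assumes pandas, tqdm, and CrazyTokenizer are already imported as in the snippet above:

# Hypothetical example data; any DataFrame with a 'body' column works.
df = pd.DataFrame({'body': ["I can't wait to post this on r/python!",
                            "Check https://example.com, /u/someone"]})
df = tokenize_data(df, ignore_stopwords=True, decontract=True)
print(df['tokens'].tolist())  # one token list per comment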
def test_url_tokenizing():
    tokenizer = CrazyTokenizer(urls='domain')
    tokens = tokenizer.tokenize(url_text)
    assert tokens == [
        'i', 'always', 'go', 'to', 'rt', 'to', 'chat', 'about', 'politics',
        'cnn', 'sucks', 'man'
    ]
def tokenize(self):
    word_tokenize = CrazyTokenizer(twitter_handles='split',
                                   hashtags='split',
                                   decontract=True)
    return self.__dataframe['preprocessed_text'] \
        .apply(lambda text: word_tokenize.tokenize(text))
def test_url_fast_unwrapping():
    tokenizer = CrazyTokenizer(urls='domain_unwrap_fast')
    tokens = tokenizer.tokenize(short_url_text)
    assert tokens == [
        'jobs', 'jobs', 'jobs', 'unemployment', 'claims', 'have', 'fallen',
        'to', 'a', '45-year', 'low', 'bloomberg'
    ]
def test_splithashtags():
    tokenizer = CrazyTokenizer(splithashtags=True, hashtags=False)
    tokens = tokenizer.tokenize(hashtag_text)
    assert tokens == [
        'make', 'america', 'great', 'again', 'make', 'russia', 'drunk',
        'again', 'maga'
    ]
def test_tokenizing():
    tokenizer = CrazyTokenizer(
        lowercase=True, keepcaps=True, normalize=3, ignorequotes=True,
        ignorestopwords=['is', 'are', 'am', 'not', 'a', 'the'], stem=False,
        removepunct=True, removebreaks=True, remove_nonunicode=False,
        decontract=False, splithashtags=True,
        twitter_handles='TOKENTWITTERHANDLE', urls='', hashtags=False,
        numbers=False, subreddits='TOKENSUBREDDIT',
        reddit_usernames='TOKENREDDITOR', emails='TOKENEMAIL',
        extra_patterns=None, pos_emojis=True, neg_emojis=None,
        neutral_emojis=None)
    tokens = tokenizer.tokenize(story_of_my_life)
    correct_answer = [
        'hi', 'my', 'name', 'TOKENTWITTERHANDLE', 'I', 'looove', 'beer',
        'plato', 'once', 'said', 'bad', 'way', 'to', 'phrase', 'it',
        'another', 'pint', 'please', 'by', 'way', 'do', "n't", 'forget',
        'to', 'visit', 'I', "'m", 'also', 'on', 'reddit', 'as',
        'TOKENREDDITOR', 'I', 'especially', 'love', 'TOKENSUBREDDIT',
        'sending', 'my', 'love', 'to', 'you', 'as', 'they', 'say',
        'POS_EMOJI', '24'
    ]
    assert tokens == correct_answer
def test_decontract():
    tokenizer = CrazyTokenizer(decontract=True)
    tokens = tokenizer.tokenize(decontract_text)
    assert tokens == [
        'i', 'have', 'been', 'waiting', 'to', 'drink', 'this', 'beer', 'i',
        'will', 'not', 'give', 'it', 'to', 'you'
    ]
def process_data(tweet):
    # Remove digits, drop named entities (and any '@'/'#' stuck to them),
    # then tokenize what remains.
    num_tweet = re.sub(r"\d+", "", tweet)
    name_tweet = ''
    for sent in nltk.sent_tokenize(num_tweet):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON', 'ORGANIZATION'):
                    if name_tweet[-1:] in ('#', '@'):
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else:
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if ((name_tweet[-1:] in punctuation
                     and name_tweet[-1:] not in ('!', '?', '.', '&'))
                        or (str(chunk[0]) in punctuation
                            and str(chunk[0]) != '&')):
                    name_tweet = name_tweet + str(chunk[0])
                else:
                    name_tweet = name_tweet + ' ' + str(chunk[0])
    stopWords = stopwords.words('english')
    stopWords.extend(['$', 'trump', 'warren', 'sen.', 'senator', 'mayor',
                      'president', 'kamala', 'harris', 'silent', 'deleted',
                      'sanders', 'berniesanders', 'ami', 'klobuchar', 'pete',
                      'beto', "o'rourke"])
    tokenizer = CrazyTokenizer(normalize=2, hashtags=False, remove_punct=True,
                               decontract=True, latin_chars_fix=True,
                               ignore_stopwords=stopWords, ignore_quotes=True,
                               remove_nonunicode=True,
                               twitter_handles='ANOTHER_TWITTER_USER',
                               urls='URL', pos_emojis=True, neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(name_tweet)
    clean_tweet = [word.strip() for word in token_tweet if len(word) > 1]
    return clean_tweet
def test_removepunct():
    tokenizer = CrazyTokenizer(remove_punct=True)
    tokens = tokenizer.tokenize(punct_text)
    print(tokens)
    assert tokens == ['this', 'is', 'the', 'text', 'which', 'contains', 'a',
                      'lot', 'of', 'punctuation', 'amazing', 'is', "n't",
                      'it', 'who', 'knows']
def test_keep_untokenized():
    tokenizer = CrazyTokenizer(
        keep_untokenized=['New York City', 'Los Angeles'])
    tokens = tokenizer.tokenize(untokenized_text)
    assert tokens == [
        'rats', 'are', 'actually', 'more', 'polite', 'in', 'new_york_city',
        'than', 'in', 'los_angeles'
    ]
def test_extra_patterns():
    tokenizer = CrazyTokenizer(extra_patterns=[(
        'zagovor',
        re.compile('([S,s]partak|[S,s]paratka|[S,s]partalke)'),
        'GAZPROM')])
    tokens = tokenizer.tokenize(spartak_text)
    assert tokens == [
        'GAZPROM', 'is', 'a', 'champion', 'GAZPROM', 'is', 'the', 'best'
    ]
def test_replacement():
    tokenizer = CrazyTokenizer(twitter_handles='handle', urls='url',
                               hashtags='hashtag', numbers='number',
                               subreddits='subreddit',
                               reddit_usernames='redditor', emails='email')
    tokens = tokenizer.tokenize(replacement_text)
    assert tokens == ['url', 'is', 'number', 'number', 'site', 'according',
                      'to', 'handle', 'url']
    tokens = tokenizer.tokenize(replacement_text2)
    assert tokens == ['email', 'was', 'hacked', 'by', 'redditor', 'from',
                      'subreddit', 'hashtag']
def process_data(tweet):
    letterText = re.sub(r"\d+", "", tweet)
    stopWords = list(STOPWORDS)
    stopWords.extend([
        'demdebate', 're', 'campaign', 'senator', 'sen', 'mayor', 'president',
        'trump', 'RT', 'bernie', 'warren', 'kamala', 'buttigieg', 'castro',
        'beto', 'klobuchar', 'joe', 'rogan', 'elizabeth', 'sander', 'sanders',
        'candidate', 'utm', 'source', 'harris', 'biden', 'debate', 'people'
    ])
    name_tweet = ""
    for sent in nltk.sent_tokenize(letterText):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON', 'ORGANIZATION'):
                    if (name_tweet[-1:] in ('@', '#')
                            or str(chunk[0]) == 'GOPDebates'):
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else:
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if ((name_tweet[-1:] in punctuation
                     and name_tweet[-1:] not in ('!', '?', '.', '&'))
                        or (str(chunk[0]) in punctuation
                            and str(chunk[0]) not in ('&', '#', '@'))):
                    name_tweet = name_tweet + str(chunk[0])
                else:
                    name_tweet = name_tweet + ' ' + str(chunk[0])
    url_tweet = replaceURL(name_tweet)
    per_str = re.sub(r"[^a-zA-Z0-9 @]", ' ', url_tweet)
    tokenizer = CrazyTokenizer(normalize=2, hashtags='', remove_punct=True,
                               decontract=True, latin_chars_fix=True,
                               ignore_stopwords=stopWords, ignore_quotes=True,
                               remove_nonunicode=True, twitter_handles='',
                               urls='URL', pos_emojis=True, neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(per_str)
    clean_tweet = [word.strip() for word in token_tweet if len(word) > 1]
    return " ".join(clean_tweet)
def process_data(tweet):
    letterText = re.sub(r"\d+", "", tweet)
    stopWords = list(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    stopWords.extend(
        ['senator', 'sen.', 'mayor', 'president', 'trump', 'RT', 'bernie'])
    name_tweet = ""
    for sent in nltk.sent_tokenize(letterText):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON', 'ORGANIZATION'):
                    if (name_tweet[-1:] in ('@', '#')
                            or str(chunk[0]) == 'GOPDebates'):
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else:
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if ((name_tweet[-1:] in punctuation
                     and name_tweet[-1:] not in ('!', '?', '.', '&'))
                        or (str(chunk[0]) in punctuation
                            and str(chunk[0]) not in ('&', '#', '@'))):
                    name_tweet = name_tweet + str(chunk[0])
                else:
                    name_tweet = name_tweet + ' ' + str(chunk[0])
    tokenizer = CrazyTokenizer(normalize=2, hashtags='', remove_punct=True,
                               decontract=True, latin_chars_fix=True,
                               ignore_stopwords=stopWords, ignore_quotes=True,
                               remove_nonunicode=True, twitter_handles='',
                               urls='URL', pos_emojis=True, neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(name_tweet)
    clean_tweet = [
        stemmer.stem(word.strip()) for word in token_tweet if len(word) > 1
    ]
    return " ".join(clean_tweet)
def tokenize(partition):
    partition_name = "{}-{}-{}".format(partition["tw_year"].iloc[0],
                                       partition["tw_month"].iloc[0],
                                       partition["tw_day"].iloc[0])
    start = timer()
    print("Beginning Tokenization: {}".format(partition_name))
    tokenizer = CrazyTokenizer(extra_patterns=PATTERNS, lowercase=True,
                               normalize=3, ignore_quotes=False,
                               ignore_stopwords=True, stem="lemm",
                               remove_punct=True, remove_numbers=True,
                               remove_breaks=True, decontract=True,
                               hashtags="split", twitter_handles='',
                               urls=False)
    partition["tokens"] = partition["full_text"].apply(tokenizer.tokenize)
    table = pa.Table.from_pandas(partition)
    pq.write_to_dataset(table, root_path=OUTPUT_DIR,
                        partition_cols=['tw_year', 'tw_month', 'tw_day'])
    end = timer()
    print("Tokenization finished for {}. Took {} seconds.".format(
        partition_name, end - start))
def test_lowercase_keepcaps():
    tokenizer = CrazyTokenizer(lowercase=True, keepcaps=True)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['vladimir', 'putin', 'is', 'the', 'BEST', 'AND',
                      'AMAZING']
    tokenizer = CrazyTokenizer(lowercase=True, keepcaps=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['vladimir', 'putin', 'is', 'the', 'best', 'and',
                      'amazing']
    tokenizer = CrazyTokenizer(lowercase=False, keepcaps=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['Vladimir', 'Putin', 'is', 'the', 'BEST', 'AND',
                      'AMAZING']
def tokenize_reddit(comments_directory, output_directory, subsample=100000,
                    val_size=0.1, test_size=0.1, random_state=24):
    csv_files = glob.glob(osp.join(comments_directory, '*.csv'))
    df_comments = pd.concat(
        (pd.read_csv(csv_file, lineterminator='\n',
                     usecols=['id', 'body', 'subreddit', 'created_utc'])
         for csv_file in csv_files))
    df_comments.drop_duplicates('id', inplace=True)
    df_comments['created_utc'] = pd.to_datetime(
        df_comments['created_utc'], unit='s')
    df_comments = df_comments.sample(frac=1.0, random_state=random_state)
    df_comments = df_comments.groupby('subreddit').head(subsample)
    tokenizer = CrazyTokenizer(
        keepcaps=False, decontract=True, ignore_stopwords='english',
        subreddits='', reddit_usernames='', numbers='', emails='', urls='')
    tokens = []
    for i in tqdm(range(df_comments.shape[0])):
        current_tokens = tokenizer.tokenize(
            df_comments.iloc[i, df_comments.columns.get_loc('body')])
        tokens.append(current_tokens)
    df_comments['tokens'] = tokens
    del tokens
    df_train_val, df_test = train_test_split(
        df_comments, test_size=test_size, random_state=random_state,
        shuffle=True)
    df_train, df_val = train_test_split(
        df_train_val, test_size=val_size, random_state=random_state,
        shuffle=True)
    df_train = df_train.loc[df_train.tokens.str.len() > 0]
    df_val = df_val.loc[df_val.tokens.str.len() > 0]
    df_test = df_test.loc[df_test.tokens.str.len() > 0]
    df_train.to_csv(osp.join(output_directory, 'reddit_train.csv'),
                    index=False)
    df_val.to_csv(osp.join(output_directory, 'reddit_val.csv'), index=False)
    df_test.to_csv(osp.join(output_directory, 'reddit_test.csv'), index=False)
def tokenize_image_titles(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) for f in listdir(data_path)
        if isdir(join(data_path, f))
    ]
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    tokenizer = CrazyTokenizer(hashtags='split')
    mapper = str.maketrans({x: '' for x in string.punctuation})
    regex = re.compile(r'(\d+)')
    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info:
            print(i, path)
        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'parsed_title' in meta and not invalidate_cache:
                continue
            filename = os.path.splitext(meta['title'])[0]
            sentence = filename.translate(mapper)
            sentence = regex.sub(r' \g<1> ', sentence)
            tokens = []
            for word in sentence.split():
                tokens += (
                    tokenizer.tokenize("#" + word)
                    if not word.isdigit() else [word]
                )
            meta['parsed_title'] = " ".join(tokens)
        _dump(meta_path, {"img_meta": meta_arr})
def test_normalize():
    tokenizer = CrazyTokenizer(normalize=3)
    tokens = tokenizer.tokenize(norm_text)
    assert tokens == ['eeeboy', 'this', 'shiiit', 'is', 'good']
    tokenizer = CrazyTokenizer(normalize=2)
    tokens = tokenizer.tokenize(norm_text)
    assert tokens == ['eeboy', 'this', 'shiit', 'is', 'good']
def test_stop():
    tokenizer = CrazyTokenizer(ignore_stopwords=['vladimir', 'putin', 'and'],
                               lowercase=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['is', 'the', 'BEST', 'AMAZING']
    tokenizer = CrazyTokenizer(ignore_stopwords=True)
    tokens = tokenizer.tokenize(english_stop)
    assert tokens == []
def tokenize_tweets(tweets_file, output_file):
    tweets = pd.read_csv(tweets_file, parse_dates=['created_at'],
                         lineterminator='\n')
    tweets['id'] = pd.to_numeric(tweets['id'])
    tweets.drop_duplicates('id', inplace=True)
    tokenizer = CrazyTokenizer(
        keepcaps=False, decontract=True, ignore_stopwords='english',
        twitter_handles='realname', hashtags='split', numbers='', emails='',
        urls='')
    tokens = []
    for i in tqdm(range(tweets.shape[0])):
        current_tokens = tokenizer.tokenize(
            tweets.iloc[i, tweets.columns.get_loc('text')])
        tokens.append(current_tokens)
    tweets['tokens'] = tokens
    tweets.to_csv(output_file, index=False)
def test_annoying_case():
    tokenizer = CrazyTokenizer()
    tokens = tokenizer.tokenize(annoying_case)
    assert tokens == ['b', '@realdonaldtrump', '@crazyfrogspb',
                      '*****@*****.**', '#maga', '#russiago',
                      'http://fscorelab.ru/overview#scoring']
    tokenizer = CrazyTokenizer(emails='EMAIL', twitter_handles='HANDLE',
                               urls='domain', hashtags='split')
    tokens = tokenizer.tokenize(annoying_case)
    assert tokens == ['b', 'HANDLE', 'HANDLE', 'EMAIL', 'maga', 'russia',
                      'go', 'fscorelab']
def test_ngrams():
    tokenizer = CrazyTokenizer(ngrams=2)
    tokens = tokenizer.tokenize(ngrams_text)
    assert tokens == ['we', 'need', 'more', 'tokens', 'we_need', 'need_more',
                      'more_tokens']
def splitHashtags(sentence):
    tokenizer = CrazyTokenizer(hashtags='split')
    sentence = tokenizer.tokenize(sentence)
    return ' '.join(sentence)
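A hedged usage sketch for splitHashtags; the input string is made up, and the expected result is inferred from the hashtag-splitting tests above rather than verified here:

# Hypothetical call; based on test_splithashtags and test_annoying_case,
# camel-cased hashtags should be split into lowercased component words.
splitHashtags("#MakeAmericaGreatAgain is trending")
# expected to return something like: 'make america great again is trending'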
def test_ignorequotes():
    tokenizer = CrazyTokenizer(ignore_quotes=True, remove_punct=True)
    tokens = tokenizer.tokenize(quotes_text)
    assert tokens == ['said', 'no', 'one', 'ever']
def test_repeated():
    tokenizer = CrazyTokenizer(
        pos_emojis=True, neg_emojis=True, neutral_emojis=True)
    for i in range(100):
        tokenizer.tokenize(trump_rant)
def test_emoji():
    tokenizer = CrazyTokenizer(
        pos_emojis=True, neg_emojis=True, neutral_emojis=True)
    tokens = tokenizer.tokenize(doc_emoji)
    assert tokens == ['POS_EMOJI', 'NEG_EMOJI', 'NEG_EMOJI']
def test_handles_split():
    tokenizer = CrazyTokenizer(twitter_handles='split')
    tokens = tokenizer.tokenize(splithandle_text)
    assert tokens == ['real', 'donald', 'trump', 'loves', 'breitbart', 'news']
def test_realname():
    tokenizer = CrazyTokenizer(hashtags='split', twitter_handles='realname')
    tokens = tokenizer.tokenize(realname_text)
    assert tokens == ['donald', 'j.', 'trump', 'please', 'make', 'america',
                      'great', 'again']
def test_hex():
    tokenizer = CrazyTokenizer(latin_chars_fix=True)
    tokens = tokenizer.tokenize(hex_text)
    assert tokens == ['i', 'm', 'so', 'annoyed', 'by', 'these', 'characters',
                      '😢']