def norm_words(df):
    '''
    Tokenizes and normalizes words. Differs from norm_text() because it does
    not lemmatize the words. Used in word embedding models.
    Returns df with new columns "tokenized_words" and "normalized_words".
    '''
    df['tokenized_words'] = df['text'].apply(
        lambda x: lucem_illud_2020.word_tokenize(x))
    df['normalized_words'] = df['tokenized_words'].apply(
        lambda x: lucem_illud_2020.normalizeTokens(x, lemma=False))
    return df
def norm_text(df):
    '''
    Tokenizes and normalizes text data.
    Returns df with new columns "tokenized_text" and "normalized_text".
    '''
    # Tokenized and normalized texts
    df['tokenized_text'] = df['text'].apply(
        lambda x: lucem_illud_2020.word_tokenize(x))
    df['normalized_text'] = df['tokenized_text'].apply(
        lambda x: lucem_illud_2020.normalizeTokens(x))
    return df
def norm_sent(df):
    '''
    Tokenizes and normalizes sentences.
    Returns df with new columns "tokenized_sents" and "normalized_sents".
    '''
    # Tokenized and normalized sents
    df['tokenized_sents'] = df['text'].apply(lambda x: [
        lucem_illud_2020.word_tokenize(s)
        for s in lucem_illud_2020.sent_tokenize(x)
    ])
    df['normalized_sents'] = df['tokenized_sents'].apply(
        lambda x: [lucem_illud_2020.normalizeTokens(s, lemma=False) for s in x])
    return df
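#%% usage sketch for the three helpers above (not part of the original
# pipeline): each one expects a pandas DataFrame with a 'text' column and
# returns the same frame with extra token columns. The toy tweets below are
# hypothetical, and this assumes pandas and the course helper package
# lucem_illud_2020 are installed and importable.
import pandas as pd
import lucem_illud_2020

_demo = pd.DataFrame({'text': [
    'The quick brown fox jumps over the lazy dog. It was not amused.',
    'Tweets are short documents, so sentence splitting rarely matters.',
]})
_demo = norm_text(_demo)   # adds 'tokenized_text' and 'normalized_text' (lemmatized)
_demo = norm_words(_demo)  # adds 'tokenized_words' and 'normalized_words' (no lemmas)
_demo = norm_sent(_demo)   # adds 'tokenized_sents' and 'normalized_sents' (per sentence)
print(_demo.columns.tolist())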
# load tweets for all days
days = pd.read_csv(r'.\Classified\all.csv')
days = days.reset_index().drop(['index', 'Unnamed: 0'], axis=1)

#%% subset variables
tweets = days[[
    'screen_name', 'status_id', 'created_at', 'text', 'cap', 'is_retweet',
    'favorite_count', 'retweet_count', 'followers_count', 'friends_count'
]]
# keep only the original text of each tweet (drop duplicates); .copy() avoids
# pandas SettingWithCopyWarning when new columns are added below
tweets = tweets.drop_duplicates(subset='text').copy()

#%% tokenize and normalize the text of each tweet
tweets['tokenized_text'] = tweets['text'].apply(
    lambda x: lucem.word_tokenize(x))
tweets['normalized_tokens'] = tweets['tokenized_text'].apply(
    lambda x: lucem.normalizeTokens(x, extra_stop=['amp']))

#%% build a tf-idf vocabulary of at most 1500 terms that appear in at least...
#...100 tweets and in no more than half of all tweets
# initialize the vectorizer
twTFVectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    max_df=0.5, max_features=1500, min_df=100, stop_words='english', norm='l2')
# fit the vectorizer and transform the tweets into l2-normalized tf-idf vectors
twTFVects = twTFVectorizer.fit_transform(tweets['text'])
print(twTFVects.shape)

#%% create dropMissing function to apply the tf-idf filter
def dropMissing(wordLst, vocab):