# In[15]:

pos_tweet = [x.split() for x in data[data['new_sentiment'] == 'positive']['text']]
neu_tweet = [x.split() for x in data[data['new_sentiment'] == 'neutral']['text']]
neg_tweet = [x.split() for x in data[data['new_sentiment'] == 'negative']['text']]
postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop30tfidf)
print('top 30 positive review tfidf', postop30tfidf)


# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[17]:

DataExploration.generate_word_cloud(pos_tweet)


# In[18]:

DataExploration.generate_word_cloud(neg_tweet)


# We did not remove stop words, so LDA does not work well here: topic modelling
# requires stop-word removal, whereas for sentiment analysis it is better to keep
# all words. However, even for sentiment analysis we need a minimum word length,
# which we set to 2.
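# In[ ]:

# A minimal sketch (not in the original notebook) of re-running topic modelling
# with stop words removed, reusing the textClean and lda helpers with the same
# signatures they have in the other cells of this repo:
pos_lda_tokens = textClean.pipeline(data[data['new_sentiment'] == 'positive']['text'].to_list(),
                                    multi_gram=[1], lower_case=True, deacc=False,
                                    encoding='utf8', errors='strict', stem_lemma='lemma',
                                    tag_drop=[], nltk_stop=True, stop_word_list=[],
                                    check_numbers=True, word_length=2,
                                    remove_consecutives=True)
lda_pos, bow_corpus, dictionary = lda.fit_lda(pos_lda_tokens, num_topics=10)
lda.lda_topics(lda_pos)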
    print('vaccine ', len(df_filt))
    return df_filt


side_effect = find_side_effect(data).reset_index(drop=True)
side_effect = side_effect[side_effect.sentiment == 'negative']
side_effect['explore_text'] = textClean.pipeline(side_effect['text'].to_list(),
                                                 multi_gram=[1], lower_case=True, deacc=False,
                                                 encoding='utf8', errors='strict',
                                                 stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                                 stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                                 check_numbers=False, word_length=2,
                                                 remove_consecutives=True)


# In[172]:

print(DataExploration.get_topn_freq_bow(list(side_effect['explore_text']), topn=30))


# In[173]:

print(tfidf.get_top_n_tfidf_bow(list(side_effect['explore_text']), top_n_tokens=30))


# In[174]:

DataExploration.generate_word_cloud(list(side_effect['explore_text']))


# In[162]:
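# In[ ]:

# For reference, a plain-Python sketch of the token-frequency count that
# get_topn_freq_bow is assumed to perform (a hypothetical equivalent, not the
# repo's actual implementation):
from collections import Counter

def topn_freq(token_docs, topn=30):
    # Flatten the list of token lists and count every token.
    counts = Counter(tok for doc in token_docs for tok in doc)
    return counts.most_common(topn)

print(topn_freq(list(side_effect['explore_text']), topn=30))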
                               encoding='utf8', errors='strict', stem_lemma='lemma',
                               tag_drop=[], nltk_stop=True, stop_word_list=[],
                               remove_pattern=[], check_numbers=True, word_length=2,
                               remove_consecutives=True)


# In[43]:

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[44]:

DataExploration.generate_word_cloud(selected_tokens)


# ## Fit model based on title
data = raw_data.copy()


# In[15]:

data['token'] = textClean.pipeline(raw_data['text'].to_list(), multi_gram=[1],
                                   lower_case=True, deacc=False, encoding='utf8',
                                   errors='strict', stem_lemma='lemma', tag_drop=[],
                                   nltk_stop=True, stop_word_list=[], remove_pattern=[],
                                   check_numbers=True, word_length=2,
                                   remove_consecutives=True)


# In[16]:

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn=10)]
print(top_10_freq_words)


# In[17]:

top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[18]:

DataExploration.generate_word_cloud(data['token'].to_list())
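# In[ ]:

# A hypothetical scikit-learn equivalent of what get_top_n_tfidf_bow is assumed
# to compute: fit TF-IDF over the token lists and rank tokens by their summed
# score across the corpus (a sketch, not the repo's actual code):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def top_n_tfidf(token_docs, top_n=30):
    # analyzer=lambda doc: doc makes the vectorizer accept pre-tokenized input.
    vec = TfidfVectorizer(analyzer=lambda doc: doc)
    X = vec.fit_transform(token_docs)
    scores = np.asarray(X.sum(axis=0)).ravel()
    ranked = sorted(zip(vec.get_feature_names_out(), scores), key=lambda p: -p[1])
    return [term for term, _ in ranked[:top_n]]

print('top 30 tfidf (sketch)', top_n_tfidf(data['token'].to_list()))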
pos_tokens[0]


# In[41]:

postop30tfidf = tfidf.get_top_n_tfidf_bow(pos_tokens, top_n_tokens=30)
negtop30tfidf = tfidf.get_top_n_tfidf_bow(neg_tokens, top_n_tokens=30)
print('top 30 negative review tfidf', negtop30tfidf)
print('top 30 positive review tfidf', postop30tfidf)


# In[36]:

top10_freq_list = DataExploration.get_topn_freq_bow(preprocessed_tokens, topn=10)
top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tokens, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tokens, topn=10)
print(top10_freq_list)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[37]:

DataExploration.generate_word_cloud(pos_tokens)


# In[38]:
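# In[ ]:

# A small follow-up sketch (not in the original notebook), assuming
# get_top_n_tfidf_bow returns a list of tokens: the tokens unique to one
# sentiment's top-30 list often show the class contrast more clearly than
# the raw lists do.
pos_only = [t for t in postop30tfidf if t not in negtop30tfidf]
neg_only = [t for t in negtop30tfidf if t not in postop30tfidf]
print('distinctively positive', pos_only)
print('distinctively negative', neg_only)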
processed_letter_df.to_csv(os.path.join(data_path, 'processed_letter.csv'))


# ## Data Exploration

# In[4]:

processed_letter_df = pd.read_csv(os.path.join(data_path, 'processed_letter.csv'))
processed_letter_df['tokens'] = processed_letter_df.clean_letter.apply(lambda x: x.split(' '))


# In[26]:

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn=10)]
print(top_10_freq_words)


# In[27]:

top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[28]:

DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())
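# In[ ]:

# A hypothetical equivalent of generate_word_cloud built on the wordcloud
# package (a sketch of the assumed behaviour, not the repo's implementation):
import matplotlib.pyplot as plt
from wordcloud import WordCloud

flat_text = ' '.join(tok for doc in processed_letter_df['tokens'] for tok in doc)
wc = WordCloud(width=800, height=400, background_color='white').generate(flat_text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()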
                             deacc=False, encoding='utf8', errors='strict',
                             stem_lemma='lemma', tag_drop=['V'], nltk_stop=True,
                             stop_word_list=[], remove_pattern=['http:', '#', '@'],
                             check_numbers=True, word_length=2, remove_consecutives=True)


# In[23]:

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[20]:

DataExploration.generate_word_cloud(djia_tokens)


# In[27]:
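# In[ ]:

# tag_drop=['V'] above presumably filters out verbs by POS tag; a hypothetical
# sketch of that step with plain NLTK (an assumption about the pipeline's
# behaviour; the tagger resource name may vary by NLTK version):
import nltk
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger', quiet=True)

tokens = ['stocks', 'fell', 'sharply', 'today']
kept = [w for w, tag in pos_tag(tokens) if not tag.startswith('V')]
print(kept)  # verbs such as 'fell' are dropped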