test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram=[1], lower_case=True,
                                          deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                          tag_drop=[], nltk_stop=True,
                                          stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                          check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens=30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the plots above we can see that people in India and Canada tweeted heavily in March and April, and the spikes in the overall volume trend were driven by India, so we explore Canada and India further. For India, most tweets appear to be neutral.
#
# In India, on 2021-03-01 most tweets were about vaccination; on 2021-04-21 most were about vaccination, infection, and medical services; and around 2021-06-01 India started using Sputnik V.

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(
    **{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment',
              title='Timeline showing emotion of tweets in Canada about COVID-19 vaccines')
selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[44]:


DataExploration.generate_word_cloud(selected_tokens)


# ## Fit model based on title

# In[58]:


train_index, test_index = train_test_split(train.index, test_size=0.33, random_state=42)
X_train = train[train.index.isin(train_index)][['TITLE']]
X_test = train[train.index.isin(test_index)][['TITLE']]
y_train = pd.get_dummies(train[train.index.isin(train_index)]['LABEL']).values
y_test = pd.get_dummies(train[train.index.isin(test_index)]['LABEL']).values
X_train = [i for i in X_train.TITLE]
X_test = [i for i in X_test.TITLE]
neg_tweet = [x for x in data[data['new_sentiment'] == 'negative']['text']]
postop10tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)


# In[16]:


top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[17]:


DataExploration.generate_word_cloud(pos_tweet)


# In[18]:


DataExploration.generate_word_cloud(neg_tweet)


# We didn't remove stop words, so the LDA does not work well; for topic modelling we need to remove stop words. For sentiment analysis, though, it is better to keep all the words. Even for sentiment analysis, however, we need a minimum word length, which we set to 2 (see the sketch after the next cell).

# In[20]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet, num_topics=no_topics)
lda.lda_topics(lda_allbow)
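# In[ ]:


# A minimal sketch of the stop-word-removing re-run suggested above: it reuses the
# textClean.pipeline call pattern from this notebook with nltk_stop=True and
# word_length=2. The keyword values simply mirror the other pipeline calls here and
# are assumptions, not tuned settings.
topic_tokens = textClean.pipeline(data['text'].to_list(), multi_gram=[1], lower_case=True,
                                  deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                  tag_drop=[], nltk_stop=True,   # drop NLTK stop words for LDA
                                  stop_word_list=[],             # extend with corpus-specific stop words if needed
                                  check_numbers=False, word_length=2, remove_consecutives=True)
lda_topicbow, topic_corpus, topic_dictionary = lda.fit_lda(topic_tokens, num_topics=no_topics)
lda.lda_topics(lda_topicbow)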
# In[36]:


top10_freq_list = DataExploration.get_topn_freq_bow(preprocessed_tokens, topn=10)
top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tokens, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tokens, topn=10)
print(top10_freq_list)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[37]:


DataExploration.generate_word_cloud(pos_tokens)


# In[38]:


DataExploration.generate_word_cloud(neg_tokens)


# ## Model Development

# ### 1. Split Dataset

# In[3]:
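# A minimal sketch of the dataset split, following the train_test_split pattern used
# elsewhere in these notebooks. The frame name `data` and the 'text'/'sentiment'
# column names are placeholders, assumed for illustration.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['sentiment'], test_size=0.33, random_state=42)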
top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn=10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[33]:


tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(), doc_index=list(data.doc_id))
top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn=10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens=30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# ### Words Frequency

# In[8]:
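# A minimal standard-library sketch of a plain word-frequency count over the same
# token lists; it assumes each row of processed_letter_df['tokens'] is a list of
# tokens, as the .to_list() calls above suggest.
from collections import Counter

word_counts = Counter(tok for doc in processed_letter_df['tokens'] for tok in doc)
print(word_counts.most_common(10))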
negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
neutop10tfidf = tfidf.get_top_n_tfidf_bow(neu_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)
print('top 30 neutral review tfidf', neutop10tfidf)


# In[40]:


top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[38]:


DataExploration.generate_word_cloud(pos_tweet)


# In[41]:


DataExploration.generate_word_cloud(neg_tweet)


# In[42]:


DataExploration.generate_word_cloud(neu_tweet)


# ## LDA

# In[43]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet, num_topics=no_topics)
print(printdata.company_profile.item())
print('-------------------- Job Description --------------------')
print(printdata.description.item())
print('-------------------- Requirements --------------------')
print(printdata.requirements.item())
print('-------------------- Benefits --------------------')
print(printdata.benefits.item())

print_job(raw_data, 50)


# In[5]:


profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[6]:


description_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(description_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(description_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(description_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[7]:
                                     lower_case=True, deacc=False, encoding='utf8', errors='strict',
                                     stem_lemma='lemma', tag_drop=['J'], nltk_stop=True,
                                     stop_word_list=['course', 'courses'],
                                     check_numbers=False, word_length=0, remove_consecutives=True)


# In[163]:


DataExploration.generate_word_cloud(processed_doc)


# In[11]:


tfidf_value_data = tfidf.get_tfidf_dataframe(processed_doc, no_below=2, no_above=1)
tfidf_value_data.head(10)


# In[12]:


base_book = 'To Kill a Mockingbird'
base_book_detail = content_data[content_data.original_title == base_book]
bookid = base_book_detail['id'].values
print('top 30 positive review tfidf', postop10tfidf)


# In[14]:


top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[15]:


DataExploration.generate_word_cloud(pos_tweet)


# In[16]:


DataExploration.generate_word_cloud(neg_tweet)


# In[18]:


hashtag_list = list(sample_data.hashtag)
DataExploration.generate_word_cloud(hashtag_list)
# In[23]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[20]:


DataExploration.generate_word_cloud(djia_tokens)


# In[27]:


top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(nasdaq_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(nasdaq_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[28]:
print('-------------------- Benefits --------------------')
print(printdata.benefits.item())

print_job(raw_data, 50)


# In[145]:


# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram=[1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
#                                            tag_drop=[], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)


# In[146]:


# Rebuild the tokens as bigrams (multi_gram=[2]) for a second exploration pass; the
# remaining keyword values mirror the unigram call commented out above.
raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram=[2], lower_case=True,
                                            deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                            tag_drop=[], nltk_stop=True,
                                            stop_word_list=[],
                                            check_numbers=False, word_length=2, remove_consecutives=True)