test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram = [1], lower_case=True, 
                                                 deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma', 
                                                 tag_drop = [], nltk_stop=True, 
                                                 stop_word_list=['effect','vaccine','side','covid'], 
                                                 check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens = 30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the plots above we can see that people in India and Canada tweeted heavily in March and April, and that the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, it seems most tweets are neutral.
# 
# In India, on 2021-03-01 most tweets were about vaccines; on 2021-04-21 most were about vaccines, infection, and medical services; and on 2021-06-01 India started using Sputnik, etc.
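#
# To compare these dates side by side, the same exploration calls can be looped over the dates mentioned above (a sketch reusing the pipeline settings from the cell above; this loop is not code from the original notebook):

for day in ['2021-03-01', '2021-04-21', '2021-06-01']:
    subset = data[(data.date.astype(str) == day) & (data.country == 'India')].reset_index(drop=True)
    tokens = textClean.pipeline(subset['text'].to_list(), multi_gram=[1], lower_case=True,
                                deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                tag_drop=[], nltk_stop=True,
                                stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                check_numbers=False, word_length=2, remove_consecutives=True)
    print(day, tfidf.get_top_n_tfidf_bow(tokens, top_n_tokens=30))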

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(**{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment',
              title='Timeline showing emotion of tweets in Canada about COVID-19 vaccines')
fig.show()

# ## Example 2

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
                                           test_size=0.33,
                                           random_state=42)
X_train = train[train.index.isin(train_index)][['TITLE']]
X_test = train[train.index.isin(test_index)][['TITLE']]
y_train = pd.get_dummies(train[train.index.isin(train_index)]['LABEL']).values
y_test = pd.get_dummies(train[train.index.isin(test_index)]['LABEL']).values
X_train = [i for i in X_train.TITLE]
X_test = [i for i in X_test.TITLE]
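
# The model-fitting code itself does not appear in this excerpt. As a minimal
# sketch only (assumed stand-ins, not the original model), a TF-IDF plus
# logistic-regression baseline from scikit-learn could be fit on these splits:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=20000)  # assumption: cap the vocabulary size
X_train_vec = vectorizer.fit_transform(X_train)   # X_train is the list of title strings
X_test_vec = vectorizer.transform(X_test)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train.argmax(axis=1))      # convert one-hot labels back to class indices
print('test accuracy:', clf.score(X_test_vec, y_test.argmax(axis=1)))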
# The lines defining pos_tweet and neg_tweet are truncated in the source;
# reconstructed here from the surviving generator fragment.
pos_tweet = list(x for x in data[data['new_sentiment'] == 'positive']['text'])
neg_tweet = list(x for x in data[data['new_sentiment'] == 'negative']['text'])
postop10tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)

# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[17]:

DataExploration.generate_word_cloud(pos_tweet)

# In[18]:

DataExploration.generate_word_cloud(neg_tweet)

# We didn't remove stop words, so the LDA does not work well here; for topic modelling we need to remove stop words, but for sentiment analysis it is better to keep all words.
# However, even for sentiment analysis we need to set a minimum word length, which we set to 2 here.
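#
# As a sketch of the two configurations (assuming the same textClean.pipeline
# API used elsewhere in this document; `tweets` is a placeholder for the raw
# text list):

# Topic-modelling preprocessing: drop NLTK stop words.
lda_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                deacc=False, encoding='utf8', errors='strict',
                                stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                stop_word_list=[], check_numbers=False,
                                word_length=2, remove_consecutives=True)
# Sentiment preprocessing: keep stop words but still enforce word_length=2.
sentiment_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                      deacc=False, encoding='utf8', errors='strict',
                                      stem_lemma='lemma', tag_drop=[], nltk_stop=False,
                                      stop_word_list=[], check_numbers=False,
                                      word_length=2, remove_consecutives=True)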

# In[20]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Example 4

# In[36]:


top10_freq_list = DataExploration.get_topn_freq_bow(preprocessed_tokens, topn = 10)
top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tokens, topn = 10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tokens, topn = 10)
print(top10_freq_list)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[37]:


DataExploration.generate_word_cloud(pos_tokens)


# In[38]:


DataExploration.generate_word_cloud(neg_tokens)


# ## Model Development

# ### 1. Split Dataset

# In[3]:
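
# The body of this cell is truncated in the source. A typical split, in the
# same style as the title-model example earlier, would be (a sketch; `df` is a
# placeholder for this example's dataframe, and sklearn's train_test_split is
# assumed to be imported):

train_index, test_index = train_test_split(df.index,
                                           test_size=0.33,
                                           random_state=42)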

# ## Example 5

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# In[33]:


# The argument list of this call is truncated in the source; it is closed here
# after the last surviving argument.
tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(),
                                       doc_index=list(data.doc_id))

# ## Example 6

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# ### Words Frequency

# In[8]:
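
# The body of this cell is truncated in the source; a word-frequency check in
# the style used throughout this document would be (a sketch; `tokens` is a
# placeholder for the token list):

print(DataExploration.get_topn_freq_bow(tokens, topn=10))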

# ## Example 7

negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
neutop10tfidf = tfidf.get_top_n_tfidf_bow(neu_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)
print('top 30 neutral review tfidf', neutop10tfidf)

# In[40]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[38]:

DataExploration.generate_word_cloud(pos_tweet)

# In[41]:

DataExploration.generate_word_cloud(neg_tweet)

# In[42]:

DataExploration.generate_word_cloud(neu_tweet)

# ## LDA

# In[43]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Example 8

# The function header is truncated in the source; reconstructed here so the
# call below runs. The signature and the company-profile header line are assumptions.
def print_job(df, n):
    printdata = df[df.index == n]
    print('-------------------- Company Profile --------------------')
    print(printdata.company_profile.item())
    print('-------------------- Job Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

description_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(description_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(description_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(description_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:
# ## Example 9

# The head of this textClean.pipeline call is truncated in the source; it is
# reconstructed here by analogy with the other pipeline calls in this
# document. `course_docs` and `multi_gram=[1]` are assumptions.
processed_doc = textClean.pipeline(course_docs,
                                   multi_gram=[1],
                                   lower_case=True,
                                   deacc=False, encoding='utf8',
                                   errors='strict',
                                   stem_lemma='lemma',
                                   tag_drop=['J'],
                                   nltk_stop=True,
                                   stop_word_list=['course', 'courses'],
                                   check_numbers=False,
                                   word_length=0,
                                   remove_consecutives=True)


# In[163]:


DataExploration.generate_word_cloud(processed_doc)


# In[11]:


tfidf_value_data = tfidf.get_tfidf_dataframe(processed_doc, no_below=2, no_above=1)
tfidf_value_data.head(10)


# In[12]:


base_book = 'To Kill a Mockingbird'
base_book_detail = content_data[content_data.original_title == base_book]
bookid = base_book_detail['id'].values

# ## Example 10

print('top 30 positive review tfidf', postop10tfidf)


# In[14]:


top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn = 10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn = 10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[15]:


DataExploration.generate_word_cloud(pos_tweet)


# In[16]:


DataExploration.generate_word_cloud(neg_tweet)


# In[18]:


hashtag_list = list(sample_data.hashtag)
DataExploration.generate_word_cloud(hashtag_list)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(nasdaq_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(nasdaq_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[28]:

# ## Example 12

# Only the tail of the print_job helper survives in this snippet; the header
# is reconstructed as in Example 8.
def print_job(df, n):
    printdata = df[df.index == n]
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],
                                            nltk_stop=True,