test = data[(data.date.astype(str) == '2021-06-01') & (data.country == 'India')].reset_index(drop=True)
test.orig_text[5]


# In[49]:


test['explore_text'] = textClean.pipeline(test['text'].to_list(), multi_gram = [1], lower_case=True, 
                                                 deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma', 
                                                 tag_drop = [], nltk_stop=True, 
                                                 stop_word_list=['effect','vaccine','side','covid'], 
                                                 check_numbers=False, word_length=2, remove_consecutives=True)
print(tfidf.get_top_n_tfidf_bow(list(test['explore_text']), top_n_tokens = 30))
DataExploration.generate_word_cloud(list(test['explore_text']))
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(list(test['explore_text']), num_topics=no_topics)
lda.lda_topics(lda_allbow)


# From the plots above we can see that people in India and Canada tweeted heavily in March and April, and that the spikes in the overall volume trend were driven by India, so we will explore Canada and India further. For India, it seems most tweets are neutral.
# 
# In India, on 2021-03-01 most tweets were about vaccines; on 2021-04-21 most were about vaccines, infection, and medical services; and on 2021-06-01 India started using Sputnik, etc.
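#
# To compare these dates side by side, the same exploration calls can be looped over the dates mentioned above (a sketch reusing the pipeline settings from the cell above; this loop is not code from the original notebook):

for day in ['2021-03-01', '2021-04-21', '2021-06-01']:
    subset = data[(data.date.astype(str) == day) & (data.country == 'India')].reset_index(drop=True)
    tokens = textClean.pipeline(subset['text'].to_list(), multi_gram=[1], lower_case=True,
                                deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                tag_drop=[], nltk_stop=True,
                                stop_word_list=['effect', 'vaccine', 'side', 'covid'],
                                check_numbers=False, word_length=2, remove_consecutives=True)
    print(day, tfidf.get_top_n_tfidf_bow(tokens, top_n_tokens=30))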

# In[76]:


canada_overtime = data[data.country.isin(['Canada'])].groupby(['date', 'sentiment']).agg(**{'tweets': ('id', 'count')}).reset_index().dropna()
fig = px.line(canada_overtime, x='date', y='tweets', color='sentiment',
              title='Timeline showing emotion of tweets in Canada about COVID-19 vaccines')
fig.show()

# ## Example 2

selected_tokens = train[train.LABEL == 'Mathematics']['title_tokens'].to_list()
top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(selected_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(selected_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(selected_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[44]:

DataExploration.generate_word_cloud(selected_tokens)

# ## Fit model based on title

# In[58]:

train_index, test_index = train_test_split(train.index,
                                           test_size=0.33,
                                           random_state=42)
X_train = train[train.index.isin(train_index)][['TITLE']]
X_test = train[train.index.isin(test_index)][['TITLE']]
y_train = pd.get_dummies(train[train.index.isin(train_index)]['LABEL']).values
y_test = pd.get_dummies(train[train.index.isin(test_index)]['LABEL']).values
X_train = [i for i in X_train.TITLE]
X_test = [i for i in X_test.TITLE]
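
# The model-fitting code itself does not appear in this excerpt. As a minimal
# sketch only (assumed stand-ins, not the original model), a TF-IDF plus
# logistic-regression baseline from scikit-learn could be fit on these splits:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=20000)  # assumption: cap the vocabulary size
X_train_vec = vectorizer.fit_transform(X_train)   # X_train is the list of title strings
X_test_vec = vectorizer.transform(X_test)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train.argmax(axis=1))      # convert one-hot labels back to class indices
print('test accuracy:', clf.score(X_test_vec, y_test.argmax(axis=1)))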
# The lines defining pos_tweet and neg_tweet are truncated in the source;
# reconstructed here from the surviving generator fragment.
pos_tweet = list(x for x in data[data['new_sentiment'] == 'positive']['text'])
neg_tweet = list(x for x in data[data['new_sentiment'] == 'negative']['text'])
postop10tfidf = tfidf.get_top_n_tfidf_bow(pos_tweet, top_n_tokens=30)
negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)

# In[16]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[17]:

DataExploration.generate_word_cloud(pos_tweet)

# In[18]:

DataExploration.generate_word_cloud(neg_tweet)

# We didn't remove stop words, so the LDA does not work well here; for topic modelling we need to remove stop words, but for sentiment analysis it is better to keep all words.
# However, even for sentiment analysis we need to set a minimum word length, which we set to 2 here.
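#
# As a sketch of the two configurations (assuming the same textClean.pipeline
# API used elsewhere in this document; `tweets` is a placeholder for the raw
# text list):

# Topic-modelling preprocessing: drop NLTK stop words.
lda_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                deacc=False, encoding='utf8', errors='strict',
                                stem_lemma='lemma', tag_drop=[], nltk_stop=True,
                                stop_word_list=[], check_numbers=False,
                                word_length=2, remove_consecutives=True)
# Sentiment preprocessing: keep stop words but still enforce word_length=2.
sentiment_tokens = textClean.pipeline(tweets, multi_gram=[1], lower_case=True,
                                      deacc=False, encoding='utf8', errors='strict',
                                      stem_lemma='lemma', tag_drop=[], nltk_stop=False,
                                      stop_word_list=[], check_numbers=False,
                                      word_length=2, remove_consecutives=True)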

# In[20]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Example 4

# In[36]:


top10_freq_list = DataExploration.get_topn_freq_bow(preprocessed_tokens, topn = 10)
top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tokens, topn = 10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tokens, topn = 10)
print(top10_freq_list)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[37]:


DataExploration.generate_word_cloud(pos_tokens)


# In[38]:


DataExploration.generate_word_cloud(neg_tokens)


# ## Model Development

# ### 1. Split Dataset

# In[3]:
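
# The body of this cell is truncated in the source. A typical split, in the
# same style as the title-model example earlier, would be (a sketch; `df` is a
# placeholder for this example's dataframe, and sklearn's train_test_split is
# assumed to be imported):

train_index, test_index = train_test_split(df.index,
                                           test_size=0.33,
                                           random_state=42)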

# ## Example 5

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(data['token'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[17]:


top30tfidf = tfidf.get_top_n_tfidf_bow(data['token'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[18]:


DataExploration.generate_word_cloud(data['token'].to_list())


# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# In[33]:


# The argument list of this call is truncated in the source; it is closed here
# after the last surviving argument.
tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(),
                                       doc_index=list(data.doc_id))

# ## Example 6

top_10_freq_words = [i[0] for i in DataExploration.get_topn_freq_bow(processed_letter_df['tokens'].to_list(), topn = 10)]
print(top_10_freq_words)


# In[27]:


top30tfidf = tfidf.get_top_n_tfidf_bow(processed_letter_df['tokens'].to_list(), top_n_tokens = 30)
print('top 30 tfidf', top30tfidf)


# In[28]:


DataExploration.generate_word_cloud(processed_letter_df['tokens'].to_list())


# In[29]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics = no_topics)
lda.lda_topics(lda_allbow)


# ### Words Frequency

# In[8]:
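
# The body of this cell is truncated in the source; a word-frequency check in
# the style used throughout this document would be (a sketch; `tokens` is a
# placeholder for the token list):

print(DataExploration.get_topn_freq_bow(tokens, topn=10))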

# ## Example 7

negtop10tfidf = tfidf.get_top_n_tfidf_bow(neg_tweet, top_n_tokens=30)
neutop10tfidf = tfidf.get_top_n_tfidf_bow(neu_tweet, top_n_tokens=30)
print('top 30 negative review tfidf', negtop10tfidf)
print('top 30 positive review tfidf', postop10tfidf)
print('top 30 neutral review tfidf', neutop10tfidf)

# In[40]:

top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn=10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn=10)
print(top10_posfreq_list)
print(top10_negfreq_list)

# In[38]:

DataExploration.generate_word_cloud(pos_tweet)

# In[41]:

DataExploration.generate_word_cloud(neg_tweet)

# In[42]:

DataExploration.generate_word_cloud(neu_tweet)

# ## LDA

# In[43]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(pos_tweet,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ## Example 8

# The function header is truncated in the source; reconstructed here so the
# call below runs. The signature and the company-profile header line are assumptions.
def print_job(df, n):
    printdata = df[df.index == n]
    print('-------------------- Company Profile --------------------')
    print(printdata.company_profile.item())
    print('-------------------- Job Description --------------------')
    print(printdata.description.item())
    print('-------------------- Requirements --------------------')
    print(printdata.requirements.item())
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[5]:

profile_tokens = list(raw_data['profile_tokens'])
print(tfidf.get_top_n_tfidf_bow(profile_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(profile_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(profile_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[6]:

description_tokens = list(raw_data['description_tokens'])
print(tfidf.get_top_n_tfidf_bow(description_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(description_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(description_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[7]:
# ## Example 9

# The head of this textClean.pipeline call is truncated in the source; it is
# reconstructed here by analogy with the other pipeline calls in this
# document. `course_docs` and `multi_gram=[1]` are assumptions.
processed_doc = textClean.pipeline(course_docs,
                                   multi_gram=[1],
                                   lower_case=True,
                                   deacc=False, encoding='utf8',
                                   errors='strict',
                                   stem_lemma='lemma',
                                   tag_drop=['J'],
                                   nltk_stop=True,
                                   stop_word_list=['course', 'courses'],
                                   check_numbers=False,
                                   word_length=0,
                                   remove_consecutives=True)


# In[163]:


DataExploration.generate_word_cloud(processed_doc)


# In[11]:


tfidf_value_data = tfidf.get_tfidf_dataframe(processed_doc, no_below=2, no_above=1)
tfidf_value_data.head(10)


# In[12]:


base_book = 'To Kill a Mockingbird'
base_book_detail = content_data[content_data.original_title == base_book]
bookid = base_book_detail['id'].values

# ## Example 10

print('top 30 positive review tfidf', postop10tfidf)


# In[14]:


top10_posfreq_list = DataExploration.get_topn_freq_bow(pos_tweet, topn = 10)
top10_negfreq_list = DataExploration.get_topn_freq_bow(neg_tweet, topn = 10)
print(top10_posfreq_list)
print(top10_negfreq_list)


# In[15]:


DataExploration.generate_word_cloud(pos_tweet)


# In[16]:


DataExploration.generate_word_cloud(neg_tweet)


# In[18]:


hashtag_list = list(sample_data.hashtag)
DataExploration.generate_word_cloud(hashtag_list)

# In[23]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(djia_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(djia_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(djia_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[20]:

DataExploration.generate_word_cloud(djia_tokens)

# In[27]:

top_10_freq_words = [
    i[0] for i in DataExploration.get_topn_freq_bow(nasdaq_tokens, topn=10)
]
print('top 10 frequent words', top_10_freq_words)
top30tfidf = tfidf.get_top_n_tfidf_bow(nasdaq_tokens, top_n_tokens=30)
print('top 30 tfidf', top30tfidf)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(nasdaq_tokens,
                                                 num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[28]:

# ## Example 12

# Only the tail of the print_job helper survives in this snippet; the header
# is reconstructed as in Example 8.
def print_job(df, n):
    printdata = df[df.index == n]
    print('-------------------- Benefits --------------------')
    print(printdata.benefits.item())


print_job(raw_data, 50)

# In[145]:

# raw_data['jd_tokens'] = textClean.pipeline(raw_data['jd'].to_list(), multi_gram = [1], lower_case=True,
#                                            deacc=False, encoding='utf8', errors='strict', stem_lemma = 'lemma',
#                                            tag_drop = [], nltk_stop=True,
#                                            stop_word_list=[],
#                                            check_numbers=False, word_length=2, remove_consecutives=True)
fraud_tokens = list(raw_data[raw_data.fraudulent == 1]['jd_tokens'])
print(tfidf.get_top_n_tfidf_bow(fraud_tokens, top_n_tokens=30))
DataExploration.generate_word_cloud(fraud_tokens)
no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(fraud_tokens, num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[146]:

raw_data['jd_tokens2'] = textClean.pipeline(raw_data['jd'].to_list(),
                                            multi_gram=[2],
                                            lower_case=True,
                                            deacc=False,
                                            encoding='utf8',
                                            errors='strict',
                                            stem_lemma='lemma',
                                            tag_drop=[],
                                            nltk_stop=True,