Example #1

# In[19]:


no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics=no_topics)
lda.lda_topics(lda_allbow)
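
# `lda.fit_lda` and `lda.lda_topics` are project-specific wrappers whose source
# is not shown here. As a rough sketch of the same flow with plain gensim (an
# assumption about what the wrappers do, not the author's implementation):

from gensim import corpora, models

tokens = data['token'].to_list()
dictionary_sketch = corpora.Dictionary(tokens)                    # token <-> id map
bow_sketch = [dictionary_sketch.doc2bow(doc) for doc in tokens]   # bag-of-words per doc
lda_sketch = models.LdaModel(bow_sketch, num_topics=no_topics,
                             id2word=dictionary_sketch, passes=5)
lda_sketch.print_topics(num_topics=no_topics, num_words=10)       # top words per topic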


# In[33]:


tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(),
                                       doc_index=list(data.doc_id),
                                       no_below=5,
                                       no_above=0.5,
                                       keep_n=100000)


# In[28]:


tfidf_data.columns = ['doc_id'] + [i[1] for i in tfidf_data.columns][1:]
tfidf_data.head(3)
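
# `get_tfidf_dataframe` is a project wrapper; judging from the columns used
# elsewhere in this file ('doc_id', 'bow', 'tfidf_value'), it returns one row
# per (document, token) pair. A rough gensim-based sketch of that behavior
# (an assumption, not the library's actual code):

import pandas as pd
from gensim import corpora, models

docs = data['token'].to_list()
d = corpora.Dictionary(docs)
d.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)  # prune rare/common terms
corpus = [d.doc2bow(doc) for doc in docs]
tfidf_model = models.TfidfModel(corpus)
rows = [(doc_id, d[token_id], weight)
        for doc_id, doc in zip(data.doc_id, tfidf_model[corpus])
        for token_id, weight in doc]
pd.DataFrame(rows, columns=['doc_id', 'bow', 'tfidf_value']).head(3)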


# In[34]:


tfidf_data = tfidf_data.pivot(index=['doc_id'], columns='bow').fillna(0).reset_index()
weibo_data.date1.sample(10)


# ## TFIDF

# In[103]:


weibo_data.sample(10)


# In[101]:


corseid = list(weibo_data.index)
tfidf_data = tfidf.get_tfidf_dataframe(sentences,
                                       doc_index=corseid,
                                       no_below=1,
                                       no_above=0.5,
                                       keep_n=100000)


# In[106]:


test = dataExploration.get_similarity_cosin(tfidf_data[tfidf_data.doc_id == 7413],
                                            tfidf_data,
                                            'bow',
                                            'doc_id',
                                            index_is_int=False,
                                            topn=50)
# Attach a readable title for both sides of each comparison pair; the merged
# column is 'title', so that is the column to rename.
test = (test.merge(weibo_data[['index', 'title']], how='left',
                   left_on=['compareindex'], right_on=['index'])
            .drop(columns=['index'])
            .rename(columns={'title': 'compare_title'}))
test = (test.merge(weibo_data[['index', 'title']], how='left',
                   left_on=['baseindex'], right_on=['index'])
            .drop(columns=['index'])
            .rename(columns={'title': 'base_title'}))
test.head(3)
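
# `get_similarity_cosin` is not shown here. A minimal sketch of the underlying
# computation with scikit-learn, assuming the long-format table is first
# pivoted into a document x token matrix (names below are illustrative):

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

wide = tfidf_data.pivot_table(index='doc_id', columns='bow',
                              values='tfidf_value', fill_value=0)
base_vec = wide.loc[[7413]]                    # 1 x vocab vector for the base doc
scores = cosine_similarity(base_vec, wide)[0]  # cosine against every document
top50 = (pd.Series(scores, index=wide.index)
           .sort_values(ascending=False)
           .head(50))                          # most similar doc_ids first
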
Example #3
                                        stop_word_list=['course','courses'], 
                                        check_numbers=False, 
                                        word_length=0, 
                                        remove_consecutives=True)


# In[163]:


dataExploration.generate_word_cloud(processed_doc)


# In[11]:


tfidf_value_data = tfidf.get_tfidf_dataframe(processed_doc, no_below=2, no_above=1)
tfidf_value_data.head(10)


# In[12]:


base_book = 'To Kill a Mockingbird'
base_book_detail = content_data[content_data.original_title == base_book]
bookid = base_book_detail['id'].values
filter_data = tfidf_value_data[tfidf_value_data.doc_id.isin(bookid)]

test = dataExploration.get_similarity_cosin(tfidf_value_data,
                                            filter_data,
                                            'bow',
                                            doc_key='doc_id',
                                            filterbase='base')  # optional: comp_col='tfidf_value', topn_output=10
recommendation = content_data[content_data.index.isin(test.baseindex.to_list())]
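
# `test` is used above only as a row filter, so `recommendation` comes back in
# index order. To rank by similarity instead, the score column of `test` can be
# joined on (the column name 'cos_similarity' is an assumption here):

ranked = (recommendation
          .join(test.set_index('baseindex')['cos_similarity'],  # assumed score column
                how='left')
          .sort_values('cos_similarity', ascending=False))
ranked[['original_title']].head(10)
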
Example #4
test = test.T


# In[32]:


snsplt.plot_heatmap(test, x='year', y='count', title='Top 10 words heatmap')
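
# `snsplt.plot_heatmap` wraps the plotting; a minimal equivalent with plain
# seaborn, assuming `test` is now a year x word matrix after the transpose:

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(test, cmap='Blues', ax=ax)  # rows: years, columns: top words
ax.set_title('Top 10 words heatmap')
plt.show()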


# ## Doc Similarity
# ### tfidf

# In[5]:


tfidf_data = tfidf.get_tfidf_dataframe(processed_letter_df['tokens'].to_list(),
                                       doc_index=yearid,
                                       no_below=5,
                                       no_above=0.5,
                                       keep_n=100000)
tfidf_data.head(3)


# In[6]:


test = DataExploration.get_similarity_cosin(tfidf_data, tfidf_data[tfidf_data.doc_id == 2008], 'bow', 'doc_id')
test.head(15)
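
# The 2008 letter is compared against every year including itself, so expect a
# self-match with cosine similarity 1.0 at the top unless the helper drops it.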


# ### word2vec

# In[9]:

Example #5
    errors='strict',
    stem_lemma='lemma',
    tag_drop=['V'],
    nltk_stop=True,
    stop_word_list=['movie', 'film', 'movies', 'films'],
    check_numbers=True,
    word_length=3,
    remove_consecutives=True)
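
# The call above is truncated and the preprocessing wrapper is not shown. A
# rough NLTK-based sketch of what those arguments suggest (my reading of the
# parameter names, not the library's actual code):

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Requires the nltk 'punkt', 'averaged_perceptron_tagger', 'stopwords' and
# 'wordnet' data packages.
def clean_tokens(text,
                 tag_drop=('V',),
                 stop_word_list=('movie', 'film', 'movies', 'films'),
                 word_length=3):
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english')) | set(stop_word_list)  # nltk_stop=True
    out = []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(text.lower())):
        if tag.startswith(tuple(tag_drop)):   # tag_drop=['V']: drop verbs
            continue
        if word in stop:
            continue
        if word.isnumeric():                  # check_numbers=True: drop numbers
            continue
        if len(word) <= word_length:          # word_length=3 (assumed: min length)
            continue
        word = lemmatizer.lemmatize(word)     # stem_lemma='lemma'
        if out and out[-1] == word:           # remove_consecutives=True
            continue
        out.append(word)
    return out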

# ## Task 2: Create a Term Document Matrix using TF-IDF

# During the Day 2 lab, we created a term-document matrix by simply counting the occurrence of words in each document. Let's try using TF-IDF to turn our documents into vectors here.
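
# Concretely, for a term t in document d, tf-idf multiplies the term's count in
# d by a penalty for how widespread the term is: tfidf(t, d) = tf(t, d) * log(N / df(t)),
# where N is the number of documents and df(t) is the number of documents
# containing t. Words that occur in nearly every document are pushed towards
# zero, while distinctive words keep a high weight.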

# In[5]:

tfidf_value_data = tfidf.get_tfidf_dataframe(preprocessed_tokens)
top10_tfidf_bow = tfidf.get_top_n_tfidf_bow(preprocessed_tokens,
                                            top_n_tokens=10)
top10_tfidf_bow
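
# `get_top_n_tfidf_bow` comes from the same project; a sketch of the likely
# idea (an assumption): keep the n tokens with the highest tf-idf weight found
# anywhere in the corpus, using the long-format table computed above.

top10_sketch = (tfidf_value_data
                .sort_values('tfidf_value', ascending=False)  # column names as used earlier in this file
                .drop_duplicates('bow')                       # best score per token
                .head(10)['bow']
                .to_list())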

# In[6]:

from gensim import models  # TfidfModel lives in gensim.models

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens,
                                            no_below=1,
                                            no_above=0.5,
                                            keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
tfidf_trans = models.TfidfModel(bow_corpus)
my_df = DocVector.get_vocab_matrix(tfidf_trans[bow_corpus], dictionary)
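
# `get_vocab_matrix` turns the weighted corpus into a document x vocabulary
# table. A minimal equivalent using gensim's own utilities (a sketch, not the
# DocVector implementation):

import pandas as pd
from gensim.matutils import corpus2dense

dense = corpus2dense(tfidf_trans[bow_corpus], num_terms=len(dictionary)).T
vocab_df = pd.DataFrame(dense,
                        columns=[dictionary[i] for i in range(len(dictionary))])
vocab_df.head(3)  # one row per document, one column per token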

# In[7]: