# In[19]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(data['token'].to_list(), num_topics=no_topics)
lda.lda_topics(lda_allbow)

# In[33]:

tfidf_data = tfidf.get_tfidf_dataframe(data['token'].to_list(), doc_index=list(data.doc_id),
                                       no_below=5, no_above=0.5, keep_n=100000)

# In[34]:

# Pivot the long (doc_id, bow, value) table into one wide row per document.
tfidf_data = tfidf_data.pivot(index=['doc_id'], columns='bow').fillna(0).reset_index()

# In[28]:

# Flatten the MultiIndex columns left by the pivot, keeping only the token names.
tfidf_data.columns = ['doc_id'] + [i[1] for i in tfidf_data.columns][1:]
tfidf_data.head(3)
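# A minimal sketch of what a fit_lda-style helper might wrap, assuming gensim
# is available; the function name and return values mirror the call above but
# are illustrative, not the project's actual implementation.

# In[ ]:

from gensim import corpora, models

def fit_lda_sketch(token_lists, num_topics=10):
    # Build the id->token mapping and bag-of-words corpus.
    dictionary = corpora.Dictionary(token_lists)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    # Train the LDA model on the bag-of-words corpus.
    lda_model = models.LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary)
    return lda_model, bow_corpus, dictionary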
weibo_data.date1.sample(10)

# ## TFIDF

# In[103]:

weibo_data.sample(10)

# In[101]:

doc_ids = list(weibo_data.index)
tfidf_data = tfidf.get_tfidf_dataframe(sentences, doc_index=doc_ids,
                                       no_below=1, no_above=0.5, keep_n=100000)

# In[106]:

# Rank the 50 documents most similar to doc 7413, then attach post titles
# for both sides of each pair (the merged-in column is 'title').
test = dataExploration.get_similarity_cosin(tfidf_data[tfidf_data.doc_id == 7413], tfidf_data,
                                            'bow', 'doc_id', index_is_int=False, topn=50)
test = test.merge(weibo_data[['index', 'title']], how='left', left_on=['compareindex'],
                  right_on=['index']).drop(columns=['index']).rename(columns={'title': 'compare_name'})
test = test.merge(weibo_data[['index', 'title']], how='left', left_on=['baseindex'],
                  right_on=['index']).drop(columns=['index']).rename(columns={'title': 'base_name'})
test.head(3)
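# A minimal sketch of the underlying cosine computation, assuming scikit-learn
# is available; it compares one document's TF-IDF row against all others, which
# is the job a helper like get_similarity_cosin does above.

# In[ ]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def top_n_similar(doc_term_matrix, base_row, topn=50):
    # doc_term_matrix: 2-D array of TF-IDF weights, one row per document.
    sims = cosine_similarity(doc_term_matrix[base_row:base_row + 1], doc_term_matrix)[0]
    # Sort descending and drop the document itself (similarity 1.0 at base_row).
    order = np.argsort(-sims)
    return [(i, sims[i]) for i in order if i != base_row][:topn]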
                                    stop_word_list=['course', 'courses'], check_numbers=False,
                                    word_length=0, remove_consecutives=True)

# In[163]:

dataExploration.generate_word_cloud(processed_doc)

# In[11]:

tfidf_value_data = tfidf.get_tfidf_dataframe(processed_doc, no_below=2, no_above=1)
tfidf_value_data.head(10)

# In[12]:

# Use the TF-IDF vector of one base book to find the most similar books.
base_book = 'To Kill a Mockingbird'
base_book_detail = content_data[content_data.original_title == base_book]
bookid = base_book_detail['id'].values
filter_data = tfidf_value_data[tfidf_value_data.doc_id.isin(bookid)]
test = dataExploration.get_similarity_cosin(tfidf_value_data, filter_data, 'bow', doc_key='doc_id',
                                            filterbase='base')  # , comp_col='tfidf_value', topn_output=10
recommendation = content_data[content_data.index.isin(test.baseindex.to_list())]
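# A minimal sketch of the word-cloud step, assuming the wordcloud and
# matplotlib packages are installed; generate_word_cloud above is the project
# helper, and this only illustrates one way such a helper can be built.

# In[ ]:

import matplotlib.pyplot as plt
from wordcloud import WordCloud

def show_word_cloud(token_lists):
    # Join all token lists into one string, which WordCloud tokenizes itself.
    text = ' '.join(' '.join(tokens) for tokens in token_lists)
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()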
test = test.T

# In[32]:

snsplt.plot_heatmap(test, x='year', y='count', title='Top 10 words heatmap')

# ## Doc Similarity

# ### tfidf

# In[5]:

tfidf_data = tfidf.get_tfidf_dataframe(processed_letter_df['tokens'].to_list(), doc_index=yearid,
                                       no_below=5, no_above=0.5, keep_n=100000)
tfidf_data.head(3)

# In[6]:

test = DataExploration.get_similarity_cosin(tfidf_data, tfidf_data[tfidf_data.doc_id == 2008],
                                            'bow', 'doc_id')
test.head(15)

# ### word2vec
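# A minimal sketch of word2vec-based document vectors, assuming gensim 4.x is
# available; the helper below is illustrative and simply averages each
# document's word vectors into a single vector, which can then be compared
# with the same cosine-similarity step as the TF-IDF vectors above.

# In[ ]:

import numpy as np
from gensim.models import Word2Vec

def doc_vectors(token_lists, vector_size=100):
    # Train a small Word2Vec model on the tokenized documents.
    w2v = Word2Vec(sentences=token_lists, vector_size=vector_size, min_count=2, epochs=10)
    # Represent each document as the mean of its in-vocabulary word vectors.
    vecs = []
    for tokens in token_lists:
        words = [t for t in tokens if t in w2v.wv]
        vecs.append(np.mean(w2v.wv[words], axis=0) if words else np.zeros(vector_size))
    return np.vstack(vecs)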
# In[9]:

                                    errors='strict', stem_lemma='lemma', tag_drop=['V'], nltk_stop=True,
                                    stop_word_list=['movie', 'film', 'movies', 'films'],
                                    check_numbers=True, word_length=3, remove_consecutives=True)

# ## Task 2: Create a Term Document Matrix using TF-IDF

# During the Day 2 lab, we created a term-document matrix by simply counting the
# occurrence of words in each document. Let's try using TF-IDF to turn our
# documents into vectors here.

# In[5]:

tfidf_value_data = tfidf.get_tfidf_dataframe(preprocessed_tokens)
top10_tfidf_bow = tfidf.get_top_n_tfidf_bow(preprocessed_tokens, top_n_tokens=10)
top10_tfidf_bow

# In[6]:

dictionary = DocVector.generate_corpus_dict(preprocessed_tokens, no_below=1, no_above=0.5, keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
tfidf_trans = models.TfidfModel(bow_corpus)
my_df = DocVector.get_vocab_matrix(tfidf_trans[bow_corpus], dictionary)

# In[7]:
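# A tiny worked example of the count-vs-TF-IDF difference described above,
# using the same gensim stack; the corpus and values are illustrative only.

# In[ ]:

from gensim import corpora, models

toy_docs = [['cat', 'sat', 'mat'], ['cat', 'cat', 'dog'], ['dog', 'ran']]
toy_dict = corpora.Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(doc) for doc in toy_docs]
toy_tfidf = models.TfidfModel(toy_bow)
# Raw counts weight frequent-but-common words highly; TF-IDF down-weights
# tokens that appear in many documents (e.g. 'cat' and 'dog' here).
for raw, weighted in zip(toy_bow, toy_tfidf[toy_bow]):
    print([(toy_dict[i], c) for i, c in raw], '->',
          [(toy_dict[i], round(w, 2)) for i, w in weighted])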