# NOTE(review): this chunk was flattened onto a single physical line in the
# paste; reformatted for readability — code tokens are unchanged.

# Tail of a text-cleaning helper (its `def` line lies outside this chunk):
# lemmatize every word of the punctuation-free string and rejoin with spaces.
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()

# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]

# Code starts here
# Creating the dictionary from our cleaned word list doc_clean
dictionary = corpora.Dictionary(doc_clean)

# Creating the corpus
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the LSi model
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
# NOTE(review): the signature below is truncated at the end of this chunk;
# the remainder of the definition lies outside this view.
def compute_coherence_values(dictionary, corpus, texts, limit, start=2,
# NOTE(review): notebook cell flattened onto one line in the paste;
# reformatted for readability — code tokens are unchanged.
# `!pip install` is IPython shell magic and is valid only inside a notebook.
!pip install pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
# Interactive LDA topic visualisation. `lda`, `journals_corpus` and
# `journals_dictionary` are defined in earlier cells — not visible here.
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel

# c_v coherence of the LDA model, scored against the journal titles.
lda_cm=CoherenceModel(model=lda,corpus=journals_corpus,dictionary=journals_dictionary,texts= journals['Full title'],coherence='c_v')
LDA_cm=lda_cm.get_coherence()
# Bare expression: displays the coherence value as the notebook cell output.
LDA_cm

from gensim.models.lsimodel import LsiModel

# Fit a 20-topic LSI model over the same corpus and list its topics.
lsi = LsiModel(corpus=journals_corpus,id2word=journals_dictionary,num_topics=20)
lsi_topics = lsi.print_topics()
for topic in lsi_topics:
    print(topic)

# Project two unseen documents into the LSI topic space.
# NOTE(review): `custom_preprocess` comes from an earlier cell — presumably it
# tokenises/normalises the raw string into a word list; confirm before reuse.
test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)
print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)
list_of_docs = df["message"].tolist() # Implementing the function for all the complaints of list_of_docs doc_clean = [clean(doc).split() for doc in list_of_docs] # Code starts here #Creating the dictionary id2word from our cleaned word list doc_clean dictionary = corpora.Dictionary(doc_clean) # Creating the corpus doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean] # Creating the LSi model lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary) lsimodel.print_topics() #Topic modeling using LDA -------------- from gensim.models import LdaModel from gensim.models import CoherenceModel # doc_term_matrix - Word matrix created in the last task # dictionary - Dictionary created in the last task # Function to calculate coherence values def compute_coherence_values(dictionary, corpus, texts, limit, start=2,
# NOTE(review): chunk was flattened onto one physical line in the paste;
# reformatted here. Ported the Python 2 `print` statements to Python 3
# `print()` calls for consistency with the rest of this file (every other
# chunk already uses Python 3 print functions).
print(corpus)

# tfidf = models.TfidfModel(corpus)
# vec = [(0, 1), (4, 1)]
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))

# Round-trip the corpus through Matrix Market format on disk.
corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print(mmCorpus)

# Fit a 10-topic LSI model and inspect its topics.
lsi = LsiModel(mmCorpus, id2word=dictionary, num_topics=10)
print("lsi:")
#print(lsi[new_vec])
lsi.print_debug(4, 4)
lsi.print_topics(4, 2)
# BUG FIX: topic ids are 0-based, so a 10-topic model has topics 0..9 —
# show_topic(10, ...) raises IndexError. Show the last topic instead.
lsi.show_topic(9, 10)

# Fit a 10-topic LDA model over the same corpus.
lda = LdaModel(mmCorpus, id2word=dictionary, num_topics=10)
lda.print_topics(4, 4)
# NOTE(review): `new_vec` is not defined in this chunk — presumably a BOW
# vector built earlier in the file; confirm it is in scope before running.
doc_lda = lda[new_vec]
print("lda:")
#print(doc_lda)

# NOTE(review): the example corpus below is commented-out data that is
# truncated at the end of this chunk.
# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
#           [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
#           [(0, 1.0), (4, 2.0), (7, 1.0)],
#           [(3, 1.0), (5, 1.0), (6, 1.0)],
#           [(9, 1.0)],
def fit_model(corpus, id2word, num_topics):
    """Train an LSI topic model on *corpus* and return it.

    Prints the model's topics as a side effect.

    Args:
        corpus: bag-of-words corpus to fit on.
        id2word: gensim dictionary mapping token ids to words.
        num_topics: number of latent topics to extract.

    Returns:
        The fitted LsiModel instance.
    """
    model = LsiModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    model.print_topics(num_topics)
    return model
# NOTE(review): chunk flattened onto one physical line in the paste;
# reformatted for readability — code tokens are unchanged.

# Tail of a text-cleaning helper (its `def` line lies outside this chunk):
# drop stopwords, strip punctuation characters, then lemmatize each word.
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()

# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]

# Code starts here
# Build the token-id dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the LSi model
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
lsimodel.print_topics(num_topics=5, num_words=3)

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
# NOTE(review): the signature below is truncated at the end of this chunk;
# the remainder of the definition lies outside this view.
def compute_coherence_values(dictionary, corpus, texts, limit, start=2,