# --- LSI: project a second sample query and score the model's coherence ---
test_doc2 = custom_preprocess('Material and physics')
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)
print(lsi[test_doc_bow2])

# NOTE(review): gensim's 'c_v' coherence is computed from tokenized texts;
# confirm that journals['Full title'] holds token lists rather than raw
# title strings at this point.
lsi_cm = CoherenceModel(
    model=lsi,
    corpus=journals_corpus,
    dictionary=journals_dictionary,
    texts=journals['Full title'],
    coherence='c_v',
)
LSI_cm = lsi_cm.get_coherence()
LSI_cm  # bare expression; presumably left over from a notebook cell

# --- HDP: train on the same corpus and inspect its topics ---
from gensim.models.hdpmodel import HdpModel

hdp = HdpModel(corpus=journals_corpus, id2word=journals_dictionary)
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

# Run the first sample query through the HDP model.
test_doc = custom_preprocess('Journal of medicines and herbs')
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)
print(hdp[test_doc_bow])

# Rebuild the bag-of-words vector for the second sample query.
test_doc2 = custom_preprocess('Material and physics')
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)
# Configure a 10-fold splitter over `count1` samples.
# NOTE(review): `cross_validation.KFold(..., n_folds=...)` looks like the
# legacy scikit-learn API; confirm the installed version still provides it.
kfolds = 10
kf = cross_validation.KFold(count1, n_folds=kfolds)

# Split every line of the two inputs on whitespace and collect the tokens.
# `f` and `f2` are presumably open text files (documents and their labels).
for li in f:
    li = li.split()
    corpora_documents.append(li)

for la in f2:
    la = la.split()
    label_level.append(la)

corpora_documents = array(corpora_documents)
label_level = array(label_level)

# Build the dictionary and the vectorized (bag-of-words) corpus.
dictionary = corpora.Dictionary(corpora_documents)
# dictionary.save('dictionary.dict')
corpus = [dictionary.doc2bow(text) for text in corpora_documents]

# Re-weight the corpus with TF-IDF, train HDP on the weighted corpus, and
# build a similarity index over the HDP-transformed documents.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
hdp = HdpModel(corpus_tfidf, id2word=dictionary)
corpus_hdp = hdp[corpus_tfidf]
index = similarities.MatrixSimilarity(corpus_hdp)

print(hdp.print_topics(num_topics=20, num_words=10))
def fit_model(corpus, id2word, num_topics=20):
    """Train an ``HdpModel`` on *corpus* and return it.

    Args:
        corpus: Corpus passed to ``HdpModel`` as ``corpus``.
        id2word: Mapping passed to ``HdpModel`` as ``id2word``.
        num_topics: Forwarded to ``print_topics`` only; it is not passed
            to the ``HdpModel`` constructor.

    Returns:
        The trained ``HdpModel`` instance.
    """
    # Train the model.
    model = HdpModel(corpus=corpus, id2word=id2word)
    # The return value is discarded; the call is presumably kept for the
    # topic listing that gensim emits through logging.
    model.print_topics(num_topics)
    return model