import os
import logging

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaMulticore

logger = logging.getLogger(__name__)

# `wiki`, `outp`, `program`, `ntopics` and `model_name` are assumed to be set
# earlier in the script (the WikiCorpus object, output path prefix, script name,
# number of topics and the path where the trained model is saved).
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
wiki.save(outp + '_corpus.pkl.bz2')

# Load the id->word mapping back directly from file.
# This seems to save more memory than keeping the wiki.dictionary object from above.
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Build the TF-IDF corpus, reusing it from disk if it already exists.
if os.path.exists(outp + '_tfidf.mm'):
    mm = MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    # tfidf.save(outp + '.tfidf_model')
    # Save the TF-IDF vectors in Matrix Market format.
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)

logger.info("finished pre-processing, starting LDA %s", program)

lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)

topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

toptopics = lda.top_topics(corpus=wiki, dictionary=lda.id2word, coherence='u_mass')
logger.info("top topics: %s", 'u_mass')
print(toptopics)
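# Once the model and dictionary are on disk, they can be reloaded in a separate
# process and queried on unseen text. A minimal sketch, assuming the same `outp`
# prefix and `model_name` path used above; the example document is made up.
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
lda = LdaMulticore.load(model_name)

# Turn an unseen document into a bag-of-words and query its topic mixture.
tokens = "machine learning with probabilistic topic models".lower().split()
bow = dictionary.doc2bow(tokens)
print(lda.get_document_topics(bow, minimum_probability=0.05))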
from pprint import pprint

from gensim.models import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim  # in newer pyLDAvis releases this module is pyLDAvis.gensim_models

# Set training parameters.
num_topics = 100
chunksize = 2000
eval_every = None  # Don't evaluate model perplexity; it takes too much time.

# Make an index-to-word dictionary.
# `dictionary` and `corpus` are assumed to come from the preprocessing step above.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    chunksize=chunksize,
    eval_every=eval_every,
)

top_topics = model.top_topics(corpus)  # , num_words=20)

# Average topic coherence is the sum of topic coherences of all topics,
# divided by the number of topics.
avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(top_topics)

# pyLDAvis.enable_notebook()  # uncomment when running inside a Jupyter notebook
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary=dictionary)
with open("lda_vis.html", "w") as fout:
    pyLDAvis.save_html(vis, fout)
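# As a cross-check on the per-topic u_mass scores returned by top_topics(), gensim's
# CoherenceModel can compute a single overall coherence value for the trained model
# directly from the bag-of-words corpus. A sketch, reusing the `model`, `corpus` and
# `dictionary` objects defined above.
from gensim.models import CoherenceModel

cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print('u_mass coherence: %.4f' % cm.get_coherence())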