from time import time

import pandas as pd
from gensim.models import LdaMulticore

# Train one model per (num_topics, passes) setting, score it on held-out
# perplexity and intrinsic coherence, and persist it.
if not model_path.exists():
    model_path.mkdir(exist_ok=True, parents=True)

print((num_topics, passes), end=' ', flush=True)

lda = LdaMulticore(corpus=train_corpus,
                   num_topics=num_topics,
                   id2word=id2word,
                   passes=passes,
                   eval_every=None,
                   workers=72,
                   random_state=42)

# log_perplexity returns the per-word likelihood bound as a log2 value,
# so 2 ** (-bound) recovers the perplexity on the held-out corpus.
test_perplexity = 2 ** (-lda.log_perplexity(test_corpus))

# Fold the test documents into the model before saving the final version.
lda.update(corpus=test_corpus)
lda.save((model_path / 'lda').resolve().as_posix())

topic_coherence = lda.top_topics(corpus=corpus, coherence='u_mass', topn=20)
coherence.append([c[1] for c in topic_coherence])
perplexity.append([
    vocab_size, test_vocab, min_df, max_df, binary, num_topics, passes,
    test_perplexity
])

elapsed = time() - start
print(
    f'\nDone: {i / n:.2%} | Duration: {format_time(elapsed)} | '
    f'To Go: {format_time(elapsed / i * (n - i))}\n'
)

# After the loop: collect the results and rank the models by held-out perplexity.
perplexity = pd.DataFrame(perplexity, columns=cols).sort_values('perplexity')
print(perplexity)
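# The cell above is the body of a grid-search loop whose scaffolding is not
# shown. The sketch below reconstructs one plausible setup; `param_grid`, the
# candidate values, and the model-path layout are assumptions inferred from
# the variables the loop body references, not the author's actual code.

from itertools import product
from pathlib import Path
from time import time

# Hypothetical grid over the two hyperparameters the loop body varies.
param_grid = list(product([5, 10, 20, 50],   # num_topics candidates (assumed)
                          [1, 5, 10]))       # passes candidates (assumed)
n = len(param_grid)

# Column order must match the row appended to `perplexity` in the loop body;
# vocab_size, test_vocab, min_df, max_df and binary are fixed upstream by the
# vectorization step.
cols = ['vocab_size', 'test_vocab', 'min_df', 'max_df', 'binary',
        'num_topics', 'passes', 'perplexity']

perplexity, coherence = [], []
start = time()
for i, (num_topics, passes) in enumerate(param_grid, 1):
    model_path = Path('models') / f'{num_topics}_{passes}'  # hypothetical layout
    ...  # loop body as shown above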
import os
from pprint import pprint

import numpy

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunksize,
                     alpha=0.05,
                     eta=0.01,
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every,
                     workers=4)

top_topics = model.top_topics(corpus)  # num_words=20 by default

# Average topic coherence is the sum of topic coherences of all topics,
# divided by the number of topics.
avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(top_topics)

# dtype=object because top_topics is a ragged, nested structure.
numpy.save(os.path.join(out_path, "topics.npy"),
           numpy.array(top_topics, dtype=object))
model.save(os.path.join(out_path, "lda_model"))

# Predict a topic for a document.
important_words = docs[2]
print(important_words)
print(len(important_words))
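# The "predict a topic for a document" step is left unfinished above. A minimal
# sketch of that step, assuming `important_words` is a tokenized document and
# using gensim's standard `doc2bow` / `get_document_topics` API:

# Convert the token list into the bag-of-words format the model expects.
bow = dictionary.doc2bow(important_words)

# get_document_topics returns (topic_id, probability) pairs for the document.
doc_topics = model.get_document_topics(bow, minimum_probability=0.0)

# Report the single most likely topic along with its top words.
best_topic, best_prob = max(doc_topics, key=lambda tp: tp[1])
print(f'Most likely topic: {best_topic} (p={best_prob:.3f})')
pprint(model.show_topic(best_topic, topn=10))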
id2word = dictionary.id2token

lda_model = LdaMulticore(
    corpus=tfidf_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    workers=4  # Use all four cores
)

top_topics = lda_model.top_topics(tfidf_corpus)
pprint(top_topics)
# -

# Gensim calculates the [intrinsic coherence score](http://qpleple.com/topic-coherence-to-evaluate-topic-models/)
# for each topic. Averaging across all of the topics in the model yields an average coherence score. Coherence
# measures the strength of the association between the words in a topic cluster and is intended as an objective
# way to evaluate the quality of the topic clusters. Higher scores are better.

# +
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
# -

# References: