# imports assumed by this snippet
import pickle
from pprint import pprint

import numpy as np
import pandas as pd
import gensim
from gensim.models import CoherenceModel

with open('ldamodels.pickle', 'rb') as f:
    # chose the model with 20 topics; kept 15 of the 20 topics
    lda, temp, x1, x2, DTM, dictionary = pickle.load(f)

# quick look at topic keywords
nt = lda.num_topics
for t in range(nt):
    print(t, lda.show_topic(t))

# save topics for inspection
topic_df = pd.DataFrame(lda.show_topics(num_topics=nt),
                        columns=['topic_num', 'keywords'])
topic_df.to_csv("initial_topics.csv", index=False)
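
# out_topics_docs, check_topic_doc_prob, and topn_docs_by_topic are used below
# but not defined in this snippet; a minimal sketch consistent with how they
# are called (an assumed reconstruction, not the original helpers):
from collections import defaultdict

def out_topics_docs(model, corpus):
    """Group documents under their highest-probability topic."""
    topics_docs = defaultdict(list)
    for doc_idx, doc_topics in enumerate(model[corpus]):
        # doc_topics is a list of (topic_id, probability) pairs
        top_topic, top_prob = max(doc_topics, key=lambda tp: tp[1])
        topics_docs[top_topic].append((doc_idx, top_prob))
    return topics_docs

def check_topic_doc_prob(topics_docs_dict, topic_num):
    """Probabilities of the documents assigned to one topic, as a Series."""
    return pd.Series([prob for _, prob in topics_docs_dict[topic_num]])

def topn_docs_by_topic(topics_docs_dict, topic_num, n):
    """The n (doc_index, probability) pairs with the highest probability."""
    return sorted(topics_docs_dict[topic_num],
                  key=lambda dp: dp[1], reverse=True)[:n]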

# check doc_topic probability distribution
len(lda[DTM])  # mallet produces a dense doc-topic probability matrix
topics_docs_dict = out_topics_docs(lda, DTM)

doc_topic_probs = {}
for t in sorted(topics_docs_dict.keys()):
    dt_prob = check_topic_doc_prob(topics_docs_dict, t)
    doc_topic_probs[t] = dt_prob  # keep each distribution for later inspection
    print(dt_prob.describe(), "\n")

# examine each topic: its keywords, the number of documents assigned to it,
# the document probabilities, and the docs with the highest probabilities
topic_num = 15

#print("topic", topic_num, "has", len(topics_docs_dict[topic_num]),"documents")
print(
    "Distribution of probabilities of documents being generated from this topic:"
)
doc_prob = check_topic_doc_prob(topics_docs_dict, topic_num)
print(doc_prob.describe(), "\n")
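
# the most representative documents for this topic, via the same helper the
# later examples use:
top_docprobs = topn_docs_by_topic(topics_docs_dict, topic_num, 10)
print(top_docprobs)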

# Example 2: select a model from a coherence sweep
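
# model_list, x, and coherence_values come from a coherence sweep that is not
# shown here; a plausible reconstruction (assumed, not the original code;
# `reviews` holds the tokenized documents):
def compute_coherence_values(dictionary, corpus, texts, start, limit, step=1):
    """Train one LDA model per topic count and score each with c_v coherence."""
    models, scores = [], []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                       num_topics=num_topics, random_state=42)
        cm = CoherenceModel(model=model, texts=texts,
                            dictionary=dictionary, coherence='c_v')
        models.append(model)
        scores.append(cm.get_coherence())
    return models, scores

x = range(3, 41)  # candidate topic counts (assumed grid; x[9] == 12)
model_list, coherence_values = compute_coherence_values(
    dictionary, DTM, reviews, start=3, limit=41)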
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

lda_model = model_list[9]  # the model with 12 topics

# inspect the learned priors: alpha (document-topic) and eta (topic-word)
print(lda_model.alpha)
print(max(lda_model.eta))
print(min(lda_model.eta))
print(np.mean(lda_model.eta))

pprint(lda_model.print_topics())

## A closer look at the document-topic distribution
len(lda_model[DTM])  # one topic distribution per document
topics_docs_dict = out_topics_docs(lda_model, DTM)

# check doc_topic probability distribution
for t in sorted(topics_docs_dict.keys()):
    test_prob = check_topic_doc_prob(topics_docs_dict, t)
    print(test_prob.describe(), "\n")

# examine each topic: its keywords, the number of documents assigned to it,
# the document probabilities, and the docs with the highest probabilities
topic_num = 2
print(lda_model.show_topic(topicid=topic_num))
print("topic", topic_num, "has", len(topics_docs_dict[topic_num]), "documents")
print("Distribution of probabilities of documents being generated from this topic:")
doc_prob = check_topic_doc_prob(topics_docs_dict, topic_num)
print(doc_prob.describe(), "\n")

top_docprobs = topn_docs_by_topic(topics_docs_dict, topic_num, 10)
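
# A new run from raw tokenized text follows. `reviews` (a list of token
# lists), `dictionary`, and `mallet_path` are assumed from earlier
# preprocessing; a minimal sketch of those assumptions (values illustrative,
# not from the original):
# reviews = [["great", "battery", "life"], ["screen", "cracked"], ...]
# dictionary = gensim.corpora.Dictionary(reviews)
# dictionary.filter_extremes(no_below=5, no_above=0.5)  # optional pruning
# mallet_path = "/path/to/mallet-2.0.8/bin/mallet"  # local Mallet binary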
# create the document-term matrix (the gensim corpus): one entry per document,
# where each entry is a list of (word_index, frequency) tuples
DTM = [dictionary.doc2bow(doc) for doc in reviews]

%time ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=DTM, num_topics=10, id2word=dictionary)

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=reviews, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

len(ldamallet[DTM])  # mallet produces a dense doc-topic probability matrix, which can be memory-hungry for large corpora
topics_docs_dict = out_topics_docs(ldamallet, DTM)

# check doc_topic probability distribution
for t in sorted(topics_docs_dict.keys()):
    test_prob = check_topic_doc_prob(topics_docs_dict, t)
    print(test_prob.describe(), "\n")

# examine each topic: its keywords, the number of documents assigned to it,
# the document probabilities, and the docs with the highest probabilities
topic_num = 0
print(ldamallet.show_topic(topicid=topic_num))
print("topic", topic_num, "has", len(topics_docs_dict[topic_num]), "documents")
print("Distribution of probabilities of documents being generated from this topic:")
doc_prob = check_topic_doc_prob(topics_docs_dict, topic_num)
print(doc_prob.describe(), "\n")
top_docprobs = topn_docs_by_topic(topics_docs_dict, topic_num, 10)
idxs = pd.Series([doc_idx for doc_idx, _ in top_docprobs])
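
# inspect the text of the most representative documents for this topic
# (assumes `reviews` still holds the tokenized documents)
for doc_idx, prob in top_docprobs:
    print(round(prob, 3), " ".join(reviews[doc_idx])[:100])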