# Write this topic's words and weights, one per line
output.write('\n'.join('{}:{}'.format(w, f) for f, w in words))
output.write("\n\n\n")

# We first identify the most discussed topic, i.e., the one with the
# highest total weight
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)
max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# This function will actually check for the presence of pytagcloud
# and is otherwise a no-op
create_cloud('cloud_blei_lda.png', words)

num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('Figure_04_01.png')

# Now, repeat the same exercise using alpha=1.0
# You can edit the constant below to play around with this parameter
ALPHA = 1.0
model1 = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                  id2word=corpus.id2word, alpha=ALPHA)
# topics has shape (100, 2246): 100 topics by 2246 documents
weight = topics.sum(1)
weight.shape  # (100,)
max_topic = weight.argmax()  # 30

# 960 of the 2246 documents have a weight of 0.1 or higher for topic 30
np.sum(topics[30] > 0.1)  # 960

# Get the top 64 words for this topic
words = model.show_topic(max_topic, 64)

### Create a word cloud using pytagcloud
create_cloud('lda_gensim_tagcloud.png', words)

### Plot number of topics vs. number of posts
num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('topics_vs_docs1.png')

# Change alpha and plot again.
# A bigger alpha means more topics per document.
# The gensim default is ALPHA = 1 / len(corpus).
ALPHA = 1.0
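# A hedged sketch of the re-run the comments above announce; it is not part
# of the original listing. The id2word argument is an assumption: it must be
# whatever dictionary was used to build the corpus.
model1 = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS,
                                  id2word=corpus.id2word, alpha=ALPHA)
num_topics_used1 = [len(model1[doc]) for doc in corpus]

fig, ax = plt.subplots()
ax.hist(num_topics_used1, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('topics_vs_docs2.png')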
# Load the corpus
corpus = corpora.MmCorpus('../data/aaj.mm')

# Build the topic model
model = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha=None)

# We first identify the most discussed topic, i.e., the one with the
# highest total weight
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)
max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# Generate the word cloud
create_cloud('../data/cloud_lda.png', words)

# Plot the distribution of the number of topics per document
num_topics_used = [len(model[doc]) for doc in corpus]
fig, ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
fig.savefig('../data/Figure_04_01.png')
topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(axis=1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(
    np.mean(lens <= 10)))

# weights is the total weight of each topic
weights = topics.sum(0)

# Find the most heavily used topic and draw it as a word cloud
words = model.show_topic(weights.argmax(), 64)

# The ``maxsize`` parameter makes the cloud look nicer
create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo')

fraction_mention = np.mean(topics[:, weights.argmax()] > 0)
print("The most mentioned topic is mentioned in {:.1%} of documents.".format(
    fraction_mention))
total_weight = np.mean(topics[:, weights.argmax()])
print(
    "It represents {:.1%} of the total number of words.".format(total_weight))
print()
print()
print()

# Find the least used topic and draw it as a word cloud
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo')
fraction_mention = np.mean(topics[:, weights.argmin()] > 0)
print("The least mentioned topic is mentioned in {:.1%} of documents.".format(
    fraction_mention))
# Load the precomputed model
model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')
topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(
    np.mean(lens <= 10)))

# weights will be the total weight of each topic
weights = topics.sum(0)

# Retrieve the most heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmax(), 64)

# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha')
print(words)
print()
print()
print()

# Retrieve the **least** heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha')
print(words)
print()
print()
print()
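# A hedged sketch (an assumption, not part of the original listing) of how
# 'topics.npy' could have been precomputed. ``corpus`` is assumed to be the
# bag-of-words corpus the model was trained on; we densify the per-document
# topic distributions once and save them, so later runs can mmap the array
# instead of rerunning inference.
from gensim import matutils
import numpy as np

dense = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
# Transpose so the saved array has one row per document, one column per topic,
# matching the (topics > 0).sum(axis=1) usage above.
np.save('topics.npy', dense.T)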
topics = np.load('topics.npy', mmap_mode='r')

# Compute the number of topics mentioned in each document
lens = (topics > 0).sum(axis=1)
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(
    np.mean(lens <= 10)))

# weights will be the total weight of each topic
weights = topics.sum(0)

# Retrieve the most heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmax(), 64)

# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
create_cloud('Wikipedia_most.png', words, maxsize=250, fontname='Cardo')

fraction_mention = np.mean(topics[:, weights.argmax()] > 0)
print("The most mentioned topic is mentioned in {:.1%} of documents.".format(
    fraction_mention))
total_weight = np.mean(topics[:, weights.argmax()])
print("It represents {:.1%} of the total number of words.".format(total_weight))
print()
print()
print()

# Retrieve the **least** heavily used topic and plot it as a word cloud:
words = model.show_topic(weights.argmin(), 64)
create_cloud('Wikipedia_least.png', words, maxsize=150, fontname='Cardo')
fraction_mention = np.mean(topics[:, weights.argmin()] > 0)
print("The least mentioned topic is mentioned in {:.1%} of documents.".format(
    fraction_mention))
total_weight = np.mean(topics[:, weights.argmin()])
print("It represents {:.1%} of the total number of words.".format(total_weight))
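# A hedged sketch, not part of the original listing: since ``topics`` has one
# row per document, the same array also tells us which single document leans
# most heavily on the top topic.
most_doc = topics[:, weights.argmax()].argmax()
print('Document {} gives the highest weight to the top topic.'.format(most_doc))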
# First, we need to sum up the weights across all the documents
weight = np.zeros(model.num_topics)
for doc in corpus:
    for col, val in model[doc]:
        weight[col] += val
# As a reasonable alternative, we could have used the log of val:
#   weight[col] += np.log(val)

max_topic = weight.argmax()

# Get the top 64 words for this topic
# Without the argument, show_topic would return only 10 words
words = model.show_topic(max_topic, 64)

# This function will actually check for the presence of pytagcloud
# and is otherwise a no-op
create_cloud('cloud_blei_lda.png', words)

num_topics_used = [len(model[doc]) for doc in corpus]
plt.hist(num_topics_used, np.arange(42))
plt.ylabel('Nr of documents')
plt.xlabel('Nr of topics')
plt.savefig('../1400OS_04_01+.png')
plt.clf()

# Now, repeat the same exercise using alpha=1:
model1 = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=1.)
num_topics_used1 = [len(model1[doc]) for doc in corpus]
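# A hedged sketch, not part of the original listing: plotting both histograms
# in one figure makes the effect of alpha directly visible; with alpha=1.0,
# documents tend to touch noticeably more topics than with the default prior.
# The output filename below is an assumption, chosen to match the pattern above.
plt.hist([num_topics_used, num_topics_used1], np.arange(42))
plt.ylabel('Nr of documents')
plt.xlabel('Nr of topics')
plt.legend(['default alpha', 'alpha=1.0'])
plt.savefig('../1400OS_04_02+.png')
plt.clf()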