# Estimate the optimal number of topics viz = Visualization(topic_model) viz.plot_greene_metric(min_num_topics=10, max_num_topics=30, tao=10, step=1, top_n_words=10) viz.plot_arun_metric(min_num_topics=5, max_num_topics=30, iterations=10) viz.plot_consensus_metric(min_num_topics=5, max_num_topics=30, iterations=10) # Infer topics print 'Inferring topics...' topic_model.infer_topics(num_topics=15) # Save model on disk utils.save_topic_model(topic_model, 'output/NMF_15topics.pickle') # Print results print '\nTopics:' topic_model.print_topics(num_words=10) print '\nTopic distribution for document 0:', \ topic_model.topic_distribution_for_document(0) print '\nMost likely topic for document 0:', \ topic_model.most_likely_topic_for_document(0) print '\nFrequency of topics:', \ topic_model.topics_frequency() print '\nTop 10 most relevant words for topic 2:', \ topic_model.top_words(2, 10)
# Build an NMF topic model over the EGC corpus and export the artifacts
# (topic cloud, per-topic word distributions, affiliations) that the
# topic-browser web app reads from static/data/.
# NOTE(review): `max_tf` and `min_tf` are not defined in this chunk --
# presumably set earlier in the original file; confirm.
num_topics = 20
vectorization = 'tfidf'
# Load corpus
corpus = Corpus(source_file_path='../input/egc.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
print 'corpus size:', corpus.size
print 'vocabulary size:', len(corpus.vocabulary)
# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)
# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')
# Export topic cloud
utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json')
# Export details about topics
for topic_id in range(topic_model.nb_topics):
    utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                 'static/data/word_distribution'+str(topic_id)+'.tsv')
    # NOTE(review): the call below is truncated in the source -- its second
    # argument (the output path) and closing paren are missing; confirm
    # against the original file before running.
    utils.save_affiliation_repartition(topic_model.affiliation_repartition(topic_id),
num_topics = 20 vectorization = 'tfidf' # Load corpus corpus = Corpus(source_file_path='../input/elysee.csv', language='french', vectorization=vectorization, max_relative_frequency=max_tf, min_absolute_frequency=min_tf, preprocessor=None) print 'corpus size:', corpus.size print 'vocabulary size:', len(corpus.vocabulary) # Infer topics topic_model = NonNegativeMatrixFactorization(corpus=corpus) topic_model.infer_topics(num_topics=num_topics) topic_model.print_topics(num_words=10) # Clean the data directory if os.path.exists('static/data'): shutil.rmtree('static/data') os.makedirs('static/data') # Export topic cloud utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json') # Export details about topics for topic_id in range(topic_model.nb_topics): utils.save_word_distribution( topic_model.top_words(topic_id, 20), 'static/data/word_distribution' + str(topic_id) + '.tsv')