Example #1
# Imports follow the TOM library's (tom_lib) package layout
from tom_lib.structure.corpus import Corpus
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
from tom_lib.visualization.visualization import Visualization
import tom_lib.utils as utils

# topic_model is assumed to be an already-built model, e.g.:
#   corpus = Corpus(source_file_path='input/corpus.csv')
#   topic_model = NonNegativeMatrixFactorization(corpus=corpus)

# Estimate the optimal number of topics
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=30,
                       tao=10, step=1,
                       top_n_words=10)
viz.plot_arun_metric(min_num_topics=5,
                     max_num_topics=30,
                     iterations=10)
viz.plot_consensus_metric(min_num_topics=5,
                          max_num_topics=30,
                          iterations=10)
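# Note (added for context): each plot reports a score as a function of the
# number of topics; choose the value where the curves stabilize, since that
# is how these estimation plots are normally read.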

# Infer topics
print('Inferring topics...')
topic_model.infer_topics(num_topics=15)
# Save model on disk
utils.save_topic_model(topic_model, 'output/NMF_15topics.pickle')

# Print results
print('\nTopics:')
topic_model.print_topics(num_words=10)
print('\nTopic distribution for document 0:',
      topic_model.topic_distribution_for_document(0))
print('\nMost likely topic for document 0:',
      topic_model.most_likely_topic_for_document(0))
print('\nFrequency of topics:',
      topic_model.topics_frequency())
print('\nTop 10 most relevant words for topic 2:',
      topic_model.top_words(2, 10))
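Once pickled, the model can be restored later without re-fitting. A minimal sketch, assuming tom_lib.utils provides load_topic_model as the counterpart of the save_topic_model call used above:

import tom_lib.utils as utils

# Reload the pickled model and reuse it directly
topic_model = utils.load_topic_model('output/NMF_15topics.pickle')
topic_model.print_topics(num_words=10)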
Example #2
import os
import shutil

from tom_lib.structure.corpus import Corpus
from tom_lib.nlp.topic_model import NonNegativeMatrixFactorization
import tom_lib.utils as utils

num_topics = 20
vectorization = 'tfidf'
max_tf = 0.8  # assumed value: ignore words present in more than 80% of documents
min_tf = 4    # assumed value: ignore words occurring fewer than 4 times

# Load corpus
corpus = Corpus(source_file_path='../input/egc.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')

# Export topic cloud
utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json')

# Export details about topics
for topic_id in range(topic_model.nb_topics):
    utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                 'static/data/word_distribution' + str(topic_id) + '.tsv')
    # The original snippet was truncated here; the output path below is
    # assumed to mirror the word-distribution naming pattern
    utils.save_affiliation_repartition(topic_model.affiliation_repartition(topic_id),
                                       'static/data/affiliation_repartition' + str(topic_id) + '.tsv')
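The exports are plain files, so they can be sanity-checked without the browser front end. A minimal sketch using only the standard library, assuming the exporters write JSON and tab-separated values to the paths used above:

import csv
import json

# Inspect the exported topic cloud
with open('static/data/topic_cloud.json') as f:
    print(json.load(f))

# Inspect the word distribution exported for topic 0
with open('static/data/word_distribution0.tsv') as f:
    for row in csv.reader(f, delimiter='\t'):
        print(row)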
Example #3
# Imports are the same as in Example #2
num_topics = 20
vectorization = 'tfidf'
max_tf = 0.8  # assumed value, as in Example #2
min_tf = 4    # assumed value, as in Example #2

# Load corpus
corpus = Corpus(source_file_path='../input/elysee.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')

# Export topic cloud
utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json')

# Export details about topics
for topic_id in range(topic_model.nb_topics):
    utils.save_word_distribution(
        topic_model.top_words(topic_id, 20),
        'static/data/word_distribution' + str(topic_id) + '.tsv')
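The same pipeline also works with the library's LDA implementation. A minimal sketch, assuming the LatentDirichletAllocation class is exposed alongside NonNegativeMatrixFactorization in tom_lib.nlp.topic_model:

from tom_lib.nlp.topic_model import LatentDirichletAllocation

# Swap the factorization method; the rest of the pipeline is unchanged
topic_model = LatentDirichletAllocation(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)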