Exemplo n.º 1
0
def buildBrowser(className, startTime, endTime):
    """Rebuild the topic-browser data files for one class.

    Loads the corpus for *className* between *startTime* and *endTime*,
    fits an NMF topic model, saves the model, and regenerates the static
    JSON/TSV files consumed by the browser front end.

    The parameters are forwarded unchanged to ``getCorpus()`` /
    ``getTopicModelPath()`` and are otherwise opaque here.
    """
    # Fixed number of NMF topics used for the browser view.
    num_topics = 7

    # All output paths are resolved relative to this module's directory.
    # (Was computed twice in the original; once is enough.)
    MYDIR = os.path.dirname(__file__)
    data_dir = os.path.join(MYDIR, 'browser/static/data')

    # Load corpus
    corpus = getCorpus(className, startTime, endTime)
    print('corpus size:', corpus.size)
    print('vocabulary size:', len(corpus.vocabulary))

    # Infer topics
    topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    topic_model.infer_topics(num_topics=num_topics)
    topic_model.print_topics(num_words=10)

    # Save the topic model for reference
    utils.save_topic_model(topic_model, os.path.join(MYDIR, getTopicModelPath(className)))

    # Clean the data directory so stale exports never survive a rebuild.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    # Export topic cloud
    utils.save_topic_cloud(topic_model, os.path.join(data_dir, 'topic_cloud.json'))

    # Export details about topics: top words and affiliations per topic.
    for topic_id in range(topic_model.nb_topics):
        utils.save_word_distribution(
            topic_model.top_words(topic_id, 20),
            os.path.join(data_dir, 'word_distribution') + str(topic_id) + '.tsv')
        utils.save_affiliation_repartition(
            topic_model.affiliation_repartition(topic_id),
            os.path.join(data_dir, 'affiliation_repartition') + str(topic_id) + '.tsv')

    # Export details about questions (per-document topic mixtures).
    for doc_id in range(topic_model.corpus.size):
        utils.save_topic_distribution(
            topic_model.topic_distribution_for_document(doc_id),
            os.path.join(data_dir, 'topic_distribution_d') + str(doc_id) + '.tsv')

    # Export details about words (per-word topic mixtures).
    for word_id in range(len(topic_model.corpus.vocabulary)):
        utils.save_topic_distribution(
            topic_model.topic_distribution_for_word(word_id),
            os.path.join(data_dir, 'topic_distribution_w') + str(word_id) + '.tsv')

    # Associate documents with topics.
    # NOTE(review): result is unused in the visible code — presumably
    # consumed by logic below this chunk; confirm before removing.
    topic_associations = topic_model.documents_per_topic()
Exemplo n.º 2
0
# Fit the NMF topic model on the prepared corpus and show its topics.
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Recreate the browser's data directory from scratch so no stale
# exports survive a rebuild.
if os.path.exists('browser/static/data'):
    shutil.rmtree('browser/static/data')
os.makedirs('browser/static/data')

# Topic-cloud summary consumed by the browser landing page.
utils.save_topic_cloud(topic_model, 'browser/static/data/topic_cloud.json')

# Per-topic exports: top words, affiliation repartition, and the
# yearly frequency of the topic between 2012 and 2015.
for topic_id in range(topic_model.nb_topics):
    top_words = topic_model.top_words(topic_id, 20)
    utils.save_word_distribution(top_words,
                                 'browser/static/data/word_distribution' + str(topic_id) + '.tsv')
    repartition = topic_model.affiliation_repartition(topic_id)
    utils.save_affiliation_repartition(repartition,
                                       'browser/static/data/affiliation_repartition' + str(topic_id) + '.tsv')
    evolution = [(year, topic_model.topic_frequency(topic_id, date=year))
                 for year in range(2012, 2016)]
    utils.save_topic_evolution(evolution, 'browser/static/data/frequency' + str(topic_id) + '.tsv')

# Per-document topic mixtures.
for doc_id in range(topic_model.corpus.size):
    distribution = topic_model.topic_distribution_for_document(doc_id)
    utils.save_topic_distribution(distribution,
                                  'browser/static/data/topic_distribution_d' + str(doc_id) + '.tsv')

# Export details about words
for word_id in range(len(topic_model.corpus.vocabulary)):
Exemplo n.º 3
0
# Optional model-selection step, kept commented out for reference:
# the Greene / Arun / Brunet stability metrics plotted here help pick
# a suitable number of topics before committing to one.
# print('Estimating the number of topics...')
# viz = Visualization(topic_model)
# viz.plot_greene_metric(min_num_topics=10,
#                        max_num_topics=11,
#                        tao=10, step=1,
#                        top_n_words=10)
# viz.plot_arun_metric(min_num_topics=5,
#                      max_num_topics=30,
#                      iterations=10)
# viz.plot_brunet_metric(min_num_topics=5,
#                        max_num_topics=30,
#                        iterations=10)

# Infer topics: fit the model (created earlier, outside this snippet)
# with a fixed count of 15 topics.
print('Inferring topics...')
topic_model.infer_topics(num_topics=15)
# Save model on disk so later runs can skip re-fitting.
ut.save_topic_model(topic_model, 'NMF_EGC_15topics.pickle')
# Load model from disk: topic_model = ut.load_topic_model('NMF_EGC_15topics.pickle')

# Print results: a few diagnostics about the fitted model —
# the topics themselves, document 0's topic mixture and most likely
# topic, overall topic frequencies, and topic 2's top words.
print('\nTopics:')
topic_model.print_topics(num_words=10)
print('\nTopic distribution for document 0:',
      topic_model.topic_distribution_for_document(0))
print('\nMost likely topic for document 0:',
      topic_model.most_likely_topic_for_document(0))
print('\nFrequency of topics:', topic_model.topics_frequency())
print('\nTop 10 most relevant words for topic 2:',
      topic_model.top_words(2, 10))
Exemplo n.º 4
0
# Fit the NMF model on the corpus and display the resulting topics.
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Wipe and recreate the browser's data directory before exporting.
if os.path.exists('browser/static/data'):
    shutil.rmtree('browser/static/data')
os.makedirs('browser/static/data')

# Topic-cloud summary for the browser front end.
utils.save_topic_cloud(topic_model, 'browser/static/data/topic_cloud.json')

# For every topic, export its top words, its affiliation repartition,
# and its yearly frequency over 2012-2015.
for topic_id in range(topic_model.nb_topics):
    top_words = topic_model.top_words(topic_id, 20)
    utils.save_word_distribution(
        top_words,
        'browser/static/data/word_distribution' + str(topic_id) + '.tsv')
    repartition = topic_model.affiliation_repartition(topic_id)
    utils.save_affiliation_repartition(
        repartition,
        'browser/static/data/affiliation_repartition' + str(topic_id) + '.tsv')
    evolution = [(year, topic_model.topic_frequency(topic_id, date=year))
                 for year in range(2012, 2016)]
    utils.save_topic_evolution(
        evolution, 'browser/static/data/frequency' + str(topic_id) + '.tsv')

# For every document, export its topic mixture.
for doc_id in range(topic_model.corpus.size):
    distribution = topic_model.topic_distribution_for_document(doc_id)
    utils.save_topic_distribution(
        distribution,
        'browser/static/data/topic_distribution_d' + str(doc_id) + '.tsv')
Exemplo n.º 5
0
# Estimate a reasonable topic count with three stability metrics
# (Greene, Arun, Brunet) before fitting the final model.
print('Estimating the number of topics...')
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10, max_num_topics=11, tao=10, step=1, top_n_words=10)
viz.plot_arun_metric(min_num_topics=5, max_num_topics=30, iterations=10)
viz.plot_brunet_metric(min_num_topics=5, max_num_topics=30, iterations=10)

# Fit the final model with a fixed count of 15 topics and persist it.
print('Inferring topics...')
topic_model.infer_topics(num_topics=15)
ut.save_topic_model(topic_model, 'output/NMF_15topics.pickle')

# Report diagnostics: the topics, document 0's mixture and most
# likely topic, overall topic frequencies, and topic 2's top words.
print('\nTopics:')
topic_model.print_topics(num_words=10)
doc0_distribution = topic_model.topic_distribution_for_document(0)
print('\nTopic distribution for document 0:', doc0_distribution)
print('\nMost likely topic for document 0:', topic_model.most_likely_topic_for_document(0))
print('\nFrequency of topics:', topic_model.topics_frequency())
print('\nTop 10 most relevant words for topic 2:', topic_model.top_words(2, 10))