def buildBrowser(className, startTime, endTime):
    """Infer a topic model for a corpus and export the browser's data files.

    Steps, in order:

    1. Load the corpus with ``getCorpus(className, startTime, endTime)`` and
       print its size and vocabulary size.
    2. Infer ``num_topics`` topics with ``NonNegativeMatrixFactorization`` and
       print the top words of each topic.
    3. Save the model at ``getTopicModelPath(className)`` (resolved relative
       to this module's directory).
    4. Delete and recreate ``browser/static/data`` (also relative to this
       module's directory), then write into it the topic cloud and the
       per-topic, per-document and per-word ``.tsv`` files.

    Args:
        className: Forwarded to ``getCorpus`` and ``getTopicModelPath``.
        startTime: Forwarded unchanged to ``getCorpus``.
        endTime: Forwarded unchanged to ``getCorpus``.
    """
    num_topics = 7

    # Every output path is anchored at the directory containing this module,
    # so the export does not depend on the current working directory.
    base_dir = os.path.dirname(__file__)
    data_dir = os.path.join(base_dir, 'browser/static/data')

    def indexed_tsv(stem, index):
        # Build "<data_dir>/<stem><index>.tsv" exactly as the inline
        # expressions did before (single join, then string concatenation).
        return (os.path.join(base_dir, 'browser/static/data/' + stem)
                + str(index) + '.tsv')

    # Load corpus
    corpus = getCorpus(className, startTime, endTime)
    print('corpus size:', corpus.size)
    print('vocabulary size:', len(corpus.vocabulary))

    # Infer topics
    topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    topic_model.infer_topics(num_topics=num_topics)
    topic_model.print_topics(num_words=10)

    # Save the topic model for reference (placeholder path for now)
    utils.save_topic_model(
        topic_model, os.path.join(base_dir, getTopicModelPath(className)))

    # Clean the data directory: start from an empty folder so files from a
    # previous run (possibly with more topics/documents) do not linger.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    # Export topic cloud
    utils.save_topic_cloud(
        topic_model,
        os.path.join(base_dir, 'browser/static/data/topic_cloud.json'))

    # Export details about topics
    for topic_id in range(topic_model.nb_topics):
        utils.save_word_distribution(
            topic_model.top_words(topic_id, 20),
            indexed_tsv('word_distribution', topic_id))
        utils.save_affiliation_repartition(
            topic_model.affiliation_repartition(topic_id),
            indexed_tsv('affiliation_repartition', topic_id))

    # Export details about questions (one file per document in the corpus)
    for doc_id in range(topic_model.corpus.size):
        utils.save_topic_distribution(
            topic_model.topic_distribution_for_document(doc_id),
            indexed_tsv('topic_distribution_d', doc_id))

    # Export details about words
    for word_id in range(len(topic_model.corpus.vocabulary)):
        utils.save_topic_distribution(
            topic_model.topic_distribution_for_word(word_id),
            indexed_tsv('topic_distribution_w', word_id))

    # Associate documents with topics
    # NOTE(review): the result is not consumed in the part of this function
    # that is visible here; kept as-is in case later code relies on it.
    topic_associations = topic_model.documents_per_topic()
for i in range(2012, 2016): evolution.append((i, topic_model.topic_frequency(topic_id, date=i))) utils.save_topic_evolution(evolution, 'browser/static/data/frequency' + str(topic_id) + '.tsv') # Export details about documents for doc_id in range(topic_model.corpus.size): utils.save_topic_distribution(topic_model.topic_distribution_for_document(doc_id), 'browser/static/data/topic_distribution_d' + str(doc_id) + '.tsv') # Export details about words for word_id in range(len(topic_model.corpus.vocabulary)): utils.save_topic_distribution(topic_model.topic_distribution_for_word(word_id), 'browser/static/data/topic_distribution_w' + str(word_id) + '.tsv') # Associate documents with topics topic_associations = topic_model.documents_per_topic() # Export per-topic author network for topic_id in range(topic_model.nb_topics): utils.save_json_object(corpus.collaboration_network(topic_associations[topic_id]), 'browser/static/data/author_network' + str(topic_id) + '.json') @app.route('/') def index(): return render_template('index.html', topic_ids=range(topic_model.nb_topics), doc_ids=range(corpus.size), method=type(topic_model).__name__, corpus_size=corpus.size, vocabulary_size=len(corpus.vocabulary),