__email__ = "*****@*****.**"

# Example script: load a corpus, build an NMF topic model, and plot the
# metrics used to estimate an appropriate number of topics.
# NOTE(review): the original paste had lost all line breaks; structure is
# reconstructed from the inline comments and statement boundaries.

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/egc.csv',
                language='french',  # language for stop words
                vectorization='tfidf',  # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
                max_relative_frequency=0.8,  # ignore words which relative frequency is > than max_relative_frequency
                min_absolute_frequency=4,  # ignore words which absolute frequency is < than min_absolute_frequency
                preprocessor=FrenchLemmatizer())  # pre-process documents
# %s-formatted single-string prints emit identical output on Python 2 and 3
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))
print('Vector representation of document 0:\n%s' % corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics using three stability/quality metrics
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10, max_num_topics=30, tao=10, step=1, top_n_words=10)
viz.plot_arun_metric(min_num_topics=5, max_num_topics=30, iterations=10)
viz.plot_consensus_metric(min_num_topics=5, max_num_topics=30, iterations=10)

# Infer topics
# Example script: infer topics on the EGC corpus with NMF and export the
# browser-visualization data files (topic cloud + per-topic word distributions).
# NOTE(review): the original paste had lost all line breaks; structure is
# reconstructed from the inline comments and statement boundaries.

# Parameters.  max_tf/min_tf were referenced but never defined in the pasted
# snippet; the values below mirror the companion corpus-loading example
# (0.8 relative / 4 absolute) -- TODO confirm intended values.
max_tf = 0.8   # ignore words whose relative frequency is > max_tf
min_tf = 4     # ignore words whose absolute frequency is < min_tf
lemmatizer = None
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/egc.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
# %s-formatted single-string prints emit identical output on Python 2 and 3
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory so stale exports from a previous run are removed
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')

# Export topic cloud
utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json')

# Export details about topics: top-20 word distribution per topic
for topic_id in range(topic_model.nb_topics):
    utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                 'static/data/word_distribution' + str(topic_id) + '.tsv')
# Example script: infer topics on the Elysee corpus with NMF and export the
# browser-visualization data files (topic cloud + per-topic word distributions).
# NOTE(review): the original paste had lost all line breaks and was truncated
# mid-statement at the end; structure is reconstructed from the inline
# comments, and the final call is completed to match the parallel EGC example.

# Parameters.  max_tf/min_tf were referenced but never defined in the pasted
# snippet; the values below mirror the companion corpus-loading example
# (0.8 relative / 4 absolute) -- TODO confirm intended values.
max_tf = 0.8   # ignore words whose relative frequency is > max_tf
min_tf = 4     # ignore words whose absolute frequency is < min_tf
lemmatizer = None
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/elysee.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
# %s-formatted single-string prints emit identical output on Python 2 and 3
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory so stale exports from a previous run are removed
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')

# Export topic cloud
utils.save_topic_cloud(topic_model, 'static/data/topic_cloud.json')

# Export details about topics: top-20 word distribution per topic
for topic_id in range(topic_model.nb_topics):
    # NOTE(review): second argument reconstructed from the EGC example; the
    # original snippet was cut off after top_words(...) -- confirm the path.
    utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                 'static/data/word_distribution' + str(topic_id) + '.tsv')