def topic_analysis(self, n_topics=10, model_type='lda', n_terms=50, n_highlighted_topics=5, plot=False, save=False, kwargs=None):
    '''
    DESC: Latent topic modeling of the tf/tfidf/binary matrix held in
          self.tfidf. Fits the model, stores the results on the instance and
          draws a termite plot of the latent topics.
    --Input--
        n_topics: (int) number of latent topics
        model_type: (str) 'nmf','lsa','lda' or sklearn.decomposition.<model>
        n_terms: (int) number of key terms plotted in the termite plot (y-axis)
        n_highlighted_topics: (int) number of topics to highlight, taken in
                              order of decreasing topic weight; at most 6
        plot: (bool) if True the figure is displayed with plt.show()
        save: (str) filename to save the plot to; a falsy value skips saving
        kwargs: (dict) hyperparameters --> see sklearn.decomposition.<model>
    ----------------------------------
    --Output--
        Returns None.
        Sets self.model and self.topic_matrix (num_docs X n_topics), fills
        self.latent_topics_top_terms and self.topic_w_weights (both keyed by
        topic index) and draws the termite plot of key terms to latent topics.
        If n_highlighted_topics > 6 a message is printed and nothing is done.
    '''
    if n_highlighted_topics > 6:
        # The message previously said "=< 5" although the limit enforced
        # here (and documented above) is 6.
        print('Value Error: n_highlighted_topics must be <= 6')
        return

    # NOTE(review): the hyperparameters are forwarded as a single keyword
    # named `kwargs`, not unpacked -- confirm this matches the TopicModel
    # signature used by this project.
    self.model = textacy.TopicModel(model_type, n_topics=n_topics, kwargs=kwargs)
    self.model.fit(self.tfidf)
    self.topic_matrix = self.model.transform(self.tfidf)

    for topic_idx, top_terms in self.model.top_topic_terms(
            self.vectorizer.feature_names,
            topics=range(n_topics),
            weights=False):
        self.latent_topics_top_terms[topic_idx] = top_terms

    weights = list(self.model.topic_weights(self.topic_matrix))
    for topic, weight in enumerate(weights):
        self.topic_w_weights[topic] = weight

    # Rank topic indices by weight, heaviest first. Sorting the indices
    # (instead of building a {weight: topic} dict) keeps topics that happen
    # to share the same weight from overwriting one another.
    ranked_topics = sorted(
        range(len(weights)), key=lambda topic: weights[topic], reverse=True)
    highlight = ranked_topics[:n_highlighted_topics]

    # One column label per topic; the label list used to be fixed at nine
    # entries no matter how many topics were requested.
    col_labels = ['Topic {}'.format(number) for number in range(1, n_topics + 1)]

    self.model.termite_plot(
        self.tfidf,
        self.vectorizer.feature_names,
        topics=-1,
        n_terms=n_terms,
        highlight_topics=highlight,
        col_labels=col_labels)
    plt.tight_layout()
    print('plotting...')
    if save:
        plt.savefig(save)
    if plot:
        plt.show()
    return
def test_vectorization_and_topic_modeling_functionality(self):
    """Vectorize self.corpus, fit an NMF topic model and check result types and sizes."""
    expected_topic_count = 10
    terms_per_topic = 10

    tfidf_vectorizer = textacy.Vectorizer(
        weighting='tfidf',
        normalize=True,
        smooth_idf=True,
        min_df=2,
        max_df=0.95,
    )
    # Lazily yield, per document, its unigram and named-entity terms as strings.
    term_lists = (
        doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in self.corpus
    )
    term_matrix = tfidf_vectorizer.fit_transform(term_lists)

    nmf_model = textacy.TopicModel('nmf', n_topics=expected_topic_count)
    nmf_model.fit(term_matrix)
    topic_matrix = nmf_model.transform(term_matrix)

    # The vectorizer must hand back a sparse CSR matrix, the model a dense
    # array with one column per requested topic.
    self.assertIsInstance(term_matrix, sp.csr_matrix)
    self.assertIsInstance(topic_matrix, np.ndarray)
    self.assertEqual(topic_matrix.shape[1], expected_topic_count)

    ranked_terms = nmf_model.top_topic_terms(
        tfidf_vectorizer.id_to_term, top_n=terms_per_topic)
    for index, terms in ranked_terms:
        self.assertIsInstance(index, int)
        self.assertEqual(len(terms), terms_per_topic)
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):
    """Train a topic model on ``corpus`` and bundle it with its derived data.

    Args:
        corpus: Corpus handed to ``textacy_utility.extract_corpus_terms`` and
            ``textacy_utility.get_corpus_documents``.
        tick: Progress callback; called without arguments after each major
            step and with ``0`` once everything is done.
        method: ``'sklearn_<model>'`` trains a ``textacy.TopicModel`` of type
            ``<model>``; ``'gensim_<algorithm>'`` trains the engine that
            ``setup_gensim_algorithms`` registers under the upper-cased
            algorithm name.
        vec_args: Overrides merged on top of ``DEFAULT_VECTORIZE_PARAMS`` for
            ``textacy.Vectorizer`` (sklearn methods only).
        term_args: Options forwarded to
            ``textacy_utility.extract_corpus_terms``.
        tm_args: Topic-model options. Unpacked into ``textacy.TopicModel``
            for sklearn methods (``None`` is treated as no options) and
            passed as-is to ``setup_gensim_algorithms`` for gensim methods.
        **args: Extra options. The gensim branch reads the flag
            ``tfidf_weiging``; all of them are recorded in ``options``.

    Returns:
        types.SimpleNamespace with ``topic_model``, ``id2term``,
        ``bow_corpus``, ``doc_term_matrix``, ``processed``,
        ``perplexity_score``, ``coherence_score``, ``options`` and
        ``coherence_scores`` (always ``None``).

    Raises:
        ValueError: If ``method`` starts with neither ``'sklearn'`` nor
            ``'gensim_'``.
    """
    # Fail fast with a clear message. Previously an unsupported method fell
    # through both branches and crashed on the unbound name `model` only
    # after all terms had been extracted.
    if not method.startswith(('sklearn', 'gensim_')):
        raise ValueError(
            "Unsupported method {!r}: expected 'sklearn_<model>' or "
            "'gensim_<algorithm>'".format(method))

    tick()

    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args)

    # Materialise the term lists once; they are iterated several times below.
    terms = [list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args)]

    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None

    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):

        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(terms)

        # `tm_args` defaults to None, and `**None` raises TypeError.
        model = textacy.TopicModel(method.split('_')[1], **(tm_args or {}))
        model.fit(doc_term_matrix)
        tick()

        doc_topic_matrix = model.transform(doc_term_matrix)
        tick()

        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)

        # FIXME: perplexity and coherence are not computed for sklearn
        # models; both scores stay None.

    else:  # method starts with 'gensim_' (guaranteed by the guard above)

        algorithm = method.split('_')[1].upper()

        id2word = gensim.corpora.Dictionary(terms)
        bow_corpus = [id2word.doc2bow(tokens) for tokens in terms]

        # NOTE(review): the option key is spelled 'tfidf_weiging'; it is kept
        # unchanged because callers may already pass it under that name.
        if args.get('tfidf_weiging', False):
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [tfidf_model[d] for d in bow_corpus]

        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)

        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']

        model = engine(**engine_options)

        # Not every engine exposes log_perplexity.
        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))

        # Coherence is best-effort: a failure is logged and the score is
        # left as None rather than aborting the run.
        try:
            coherence_model_lda = gensim.models.CoherenceModel(
                model=model, texts=terms, dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None

    processed = topic_model_utility.compile_metadata(
        model,
        bow_corpus,
        id2word,
        documents,
        vectorizer=vectorizer,
        doc_topic_matrix=doc_topic_matrix,
        n_tokens=200
    )

    # doc_topic_matrix and vectorizer are not part of the bundle; they are
    # only handed to compile_metadata above.
    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None
    )

    tick(0)

    return model_data
# Script fragment: prints previously computed extraction results, then fits
# three topic-model types on a TF-IDF document-term matrix and prints the top
# terms of every topic.
# NOTE(review): `textrank`, `key_terms_from_semantic_network`, `matches` and
# `corpus` are defined outside this fragment. Line breaks and indentation were
# reconstructed from a collapsed source line -- confirm that print("\n")
# belongs after the `matches` loop rather than inside it.
for sgr in textrank:
    print(sgr)
print("==key_terms_from_semantic_network==")
for trip in key_terms_from_semantic_network:
    print(trip)
print("==matches==")
for match in matches:
    print(match)
print("\n")

vectorizer = textacy.Vectorizer(weighting='tfidf', normalize=True, smooth_idf=True, min_df=3, max_df=0.95)
# The generator feeds one term list per document: unigrams plus named
# entities, as plain strings.
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))
print(repr(doc_term_matrix))

# Fit each model type on the same matrix with 10 topics; `model` and
# `doc_topic_matrix` are rebound on every iteration, so after the loop they
# hold the results of the last entry ('lsa').
models = ['nmf', 'lda', 'lsa']
for m in models:
    model = textacy.TopicModel(m, n_topics=10)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    print("==", m, "==")
    print(doc_topic_matrix.shape)
    # Print the 10 top terms of each topic on one line.
    for topic_idx, top_terms in model.top_topic_terms(
            vectorizer.id_to_term, top_n=10):
        print('topic', topic_idx, ':', ' '.join(top_terms))
# Script: build one document per debate title from the HanDeSeT CSV, vectorize
# the documents with TF-IDF and print the top terms of a 10-topic NMF model.
nlp = spacy.load('en')

debates = []
# The context manager closes the CSV handle once all rows are read (it used to
# be left open), and newline='' is how the csv module expects its input file
# to be opened.
with open('../debate_csvs/HanDeSeT.csv', 'r', newline='') as csv_file:
    data = csv.reader(csv_file)
    for row in data:
        # adapted for handeset which features 7 columns of text per document (row)
        # NOTE(review): only six columns (6..11) are concatenated, and with no
        # separator between them -- confirm against the dataset layout.
        debates.append(
            [row[1], row[6] + row[7] + row[8] + row[9] + row[10] + row[11]])

df = pd.DataFrame(debates, columns=['title', 'text'])

# Merge all rows that share a title into a single newline-joined text.
chat_concat = (df.sort_values('title').groupby('title')['text'].agg(
    lambda col: '\n'.join(col.astype(str))))

# Run the spaCy pipeline over every merged text and wrap the results in a corpus.
docs = list(chat_concat.apply(lambda x: nlp(x)))
corpus = textacy.corpus.Corpus(nlp, docs=docs)

vectorizer = textacy.Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth', norm='l2', min_df=2, max_df=5)
# One term list per document: unigrams plus named entities, as plain strings.
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))

model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)

# Print the 10 top terms of each topic on one line.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print('topic', topic_idx, ':', ' '.join(top_terms))
records, 'text')
# NOTE(review): the line above is the tail of a call whose beginning is not
# visible here; presumably it produces `text_stream` and `metadata_stream`,
# which are used just below -- confirm. Line breaks in this fragment were
# reconstructed from a collapsed source line.
corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
# Bare expression: has no effect when run as a script (presumably left over
# from interactive/notebook use).
corpus
# Lazy generator of lower-cased, punctuation-free document texts; nothing in
# this fragment consumes it.
corpus_processed = (textacy.preprocess_text(doc.text, lowercase=True, no_punct=True) for doc in corpus)
vectorizer = textacy.Vectorizer(weighting='tf', normalize=True, smooth_idf=True, min_df=2, max_df=0.95)
# One term list per document: unigrams plus named entities, as plain strings.
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))
print(repr(doc_term_matrix))
model = textacy.TopicModel('lda', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
# Bare expression: the shape is evaluated but neither printed nor stored.
doc_topic_matrix.shape
# Print the 10 top terms of each topic on one line.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print('topic', topic_idx, ':', ' '.join(top_terms))
# Termite plot over all topics (topics=-1) showing 25 terms.
model.termite_plot(doc_term_matrix, vectorizer.id_to_term, topics=-1, n_terms=25, sort_terms_by='seriation')