import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.cbtm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary


def compute_values(self, kmin, kmax, kstep):
    # Vectorize the documents.
    vec = CountVectorizer()
    X = vec.fit_transform(self.docs).toarray()  # dense array, as vec_to_biterms expects

    # Get the vocabulary and the biterms from the documents.
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    # Create a BTM and pass the biterms to train it.
    btm = oBTM(num_topics=20, V=vocab)
    topics = btm.fit_transform(biterms, iterations=100)

    # topic_summuary is the (misspelled) coherence helper from biterm.utility.
    topic_summuary(btm.phi_wz.T, X, vocab, 10)
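# kmin, kmax and kstep are accepted by compute_values but never used, which
# suggests a sweep over candidate topic counts was intended. A minimal,
# hypothetical sketch of such a sweep (sweep_topic_counts is not part of the
# original code; it reuses the same CountVectorizer/oBTM setup as above):
def sweep_topic_counts(docs, kmin, kmax, kstep):
    vec = CountVectorizer()
    X = vec.fit_transform(docs).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)
    for k in range(kmin, kmax + 1, kstep):
        # Train one BTM per candidate k and print its coherence summary.
        btm = oBTM(num_topics=k, V=vocab)
        btm.fit_transform(biterms, iterations=100)
        print("--- num_topics = {} ---".format(k))
        topic_summuary(btm.phi_wz.T, X, vocab, 10)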
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.cbtm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary

# Note: "slo_feature_series" (the Tweet texts) and "log" (the logger) are
# assumed to be defined at module level.


def biterm_topic_model_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Biterm Topic Model (BTM).

    :return: None.
    """
    # BTM, like LDA, can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(f"\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info(f"\n.get_feature_names - Array mapping from feature integer indices to feature name.")
    log.info(f"{tf_feature_names}\n")

    # Convert the corpus of documents (vectorized text) to a numpy array.
    tf_array = tf.toarray()

    # Convert the dictionary of words (vocabulary) to a numpy array.
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())

    # Extract the biterms from the term-document matrix.
    biterms = vec_to_biterms(tf_array)

    # Create the Biterm Topic Model.
    btm = oBTM(num_topics=20, V=tf_feature_names)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process a chunk of 100 texts at a time
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    time.sleep(3)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(tf_array, axis=1),
    #                        tf_feature_names, np.sum(tf_array, axis=0))
    # pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    for i in range(1, 10):
        print("{} (topic: {})".format(slo_feature_series[i], topics[i].argmax()))
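# For intuition: a biterm is an unordered pair of distinct vocabulary indices
# that co-occur in the same short text. A toy sketch of what vec_to_biterms
# returns (the corpus and the printed values below are illustrative, not from
# the Tweet data):
toy_docs = ["solar panels cut operating costs"]
toy_vec = CountVectorizer()
toy_X = toy_vec.fit_transform(toy_docs).toarray()
print(np.array(toy_vec.get_feature_names()))
# e.g. ['costs' 'cut' 'operating' 'panels' 'solar']
print(vec_to_biterms(toy_X))
# e.g. [[(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]]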
if __name__ == "__main__": texts = open('./data/reuters.titles').read().splitlines() # vectorize texts vec = CountVectorizer(stop_words='english') X = vec.fit_transform(texts).toarray() # get vocabulary vocab = np.array(vec.get_feature_names()) # get biterms biterms = vec_to_biterms(X) # create btm btm = oBTM(num_topics=20, V=vocab) print("\n\n Train Online BTM ..") for i in range(0, len(biterms), 100): # prozess chunk of 200 texts biterms_chunk = biterms[i:i + 100] btm.fit(biterms_chunk, iterations=50) topics = btm.transform(biterms) print("\n\n Visualize Topics ..") vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) pyLDAvis.save_html(vis, './vis/online_btm.html') print("\n\n Topic coherence ..") topic_summuary(btm.phi_wz.T, X, vocab, 10)
df_cl.query('cluster == 1')

# %%
# Biterm topic model

# Get the biterms.
from biterm.utility import vec_to_biterms

vocab = np.array(count_vect.get_feature_names())
biterms = vec_to_biterms(doc_term_matrix[:1000, :])

# %%
from biterm.cbtm import oBTM

btm = oBTM(num_topics=3, V=vocab)
topics = btm.fit_transform(biterms, iterations=100)

# %%
topics.shape

# %%
# Find subjects of sentences
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
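# %%
# The loop above prints every token's attributes; to actually pull out the
# grammatical subjects, filter on the dependency label. A minimal sketch,
# assuming spaCy's standard labels for nominal subjects:
subjects = [token.text for token in doc if token.dep_ in ("nsubj", "nsubjpass")]
print(subjects)  # e.g. ['Apple'] for the sentence above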