def main(folder):
    word2idx = pickle.load(open(os.path.join(folder, "word_idx.p"), "rb"))
    print(word2idx)

    # Load seed topics
    seed_topics_dic, topics = seed_topics(word2idx)
    idx_to_word = {v: k for k, v in word2idx.items()}

    # Load data
    print("Starting training...")
    lda = guidedlda.GuidedLDA(n_topics=len(topics), n_iter=100, random_state=7, refresh=20)

    ## Concat data from every pickled chunk into one sparse matrix
    row, col, data = np.array(()), np.array(()), np.array(())
    matrix_data_list = glob.glob(os.path.join(folder, "matrix_data_*.p"))
    np.random.shuffle(matrix_data_list)
    for doc in tqdm.tqdm(matrix_data_list):
        print("Partial fitting", doc)
        res = pickle.load(open(doc, "rb"))
        row = np.append(row, np.int32(res["I"]))
        col = np.append(col, np.int32(res["J"]))
        data = np.append(data, np.int32(res["data"]))
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))
    lda.fit(X, seed_topics=seed_topics_dic, seed_confidence=0)
    print("Training done")

    def print_top_words(model, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #{} - {}: ".format(topic_idx, topics[topic_idx])
            message += " ".join([idx_to_word[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)

    def print_sentence_and_topic(sentence, topic):
        print(colored("Sentence:", "blue"), colored(sentence, "green"))
        print(colored("Topic: ", "blue"), colored(topic, "red"))

    print_top_words(lda, 20)
    np.save(open(os.path.join(folder, "guided_components.npy"), "wb"), lda.components_)

    ## Test for input sentences typed at the prompt
    stemmer = WordNetLemmatizer()
    while True:
        sentence = input()
        list_words = [w.lower() for w in sentence.split()]
        np_array = np.zeros([1, len(word2idx.keys())])
        for word in list_words:
            stemmed_word = stemmer.lemmatize(word)
            if stemmed_word in word2idx:
                print(stemmed_word)
                np_array[0, word2idx[stemmed_word]] += 1
        topic_dist = lda.transform(np.int32(np_array))
        print_sentence_and_topic(sentence, topics[np.argmax(topic_dist)])
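# The seed_topics() helper called in main() is not shown above. This is a minimal
# sketch of what it is assumed to return, based on how its outputs are consumed:
# guidedlda's fit() takes `seed_topics` as a {word_index: topic_index} dict, and
# `topics` is the parallel list of topic names. The seed-word lists here are
# placeholders, not the project's real ones.
def seed_topics(word2idx):
    topic_seeds = {  # hypothetical seed words per topic
        "economy": ["market", "trade", "tax"],
        "health": ["hospital", "doctor", "disease"],
    }
    topics = list(topic_seeds.keys())
    seed_topics_dic = {}
    for topic_id, topic_name in enumerate(topics):
        for word in topic_seeds[topic_name]:
            if word in word2idx:
                seed_topics_dic[word2idx[word]] = topic_id
    return seed_topics_dic, topics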
def run_lda_sklearn(self, n_topics):
    n_top_words = 12
    # sklearn's LatentDirichletAllocation takes n_components; the old n_topics
    # keyword was removed in later releases
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(self.tf)
    print("\nTopics in LDA model:")
    tf_feature_names = self.tf_vectorizer.get_feature_names()
    self.print_top_words(lda, tf_feature_names, n_top_words)
count_vectorizer = CountVectorizer(max_df=0.99, min_df=3,
                                   ngram_range=(1, 1), stop_words=new_stop_word)
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(full_text_list_processed)

# Visualise the 30 most common words
plot_30_most_common_ngrams(count_data, count_vectorizer)

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA implementations (sklearn and the `lda` package)
from sklearn.decomposition import LatentDirichletAllocation as LDA
import lda

# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Tweak the two parameters below
number_topics = 20
number_words = 10

# Create and fit the LDA model
lda = lda.LDA(n_topics=number_topics)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
# # Part 5: Topic Modeling - Latent Dirichlet Allocation

# In[28]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5, learning_method='online')
# LDA expects integer counts, so scale the tf-idf weights and truncate to ints
tfidf_matrix_lda = (tfidf_matrix * 100)
tfidf_matrix_lda = tfidf_matrix_lda.astype(int)

# In[29]:

lda.fit(tfidf_matrix_lda)

# In[30]:

# 5 topics, 44 selected words
topic_word = lda.components_
print(topic_word.shape)

# In[31]:

n_top_words = 10
topic_keywords_list = []
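# A sketch of the presumed next step: collect the top keywords of each topic into
# topic_keywords_list. The variable name `tfidf_vectorizer` is an assumption; use
# whatever vectorizer produced tfidf_matrix above.
feature_names = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(topic_word):
    top_keywords = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topic_keywords_list.append(top_keywords)
    print("Topic #%d: %s" % (topic_idx, " ".join(top_keywords)))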
X = np.array([[process(document, word) for word in all_words] for document in corpus])
bar.finish()
'''
print("Extracting tf features for LDA...")
n_features = 1000
n_topics = 50
n_top_words = 20
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(corpus)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
'''
rcParams['text.usetex'] = True
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(lda.components_[:, :n_top_words], interpolation='nearest', aspect='auto',
          cmap=plt.cm.bone_r)
artist.adjust_spines(ax)
ax.set_xticks(range(n_top_words))
ax.set_xticklabels(map(artist.format, tf_feature_names[:n_top_words]), rotation='vertical')
ax.set_ylabel(artist.format('Topic'))
plt.tight_layout()
plt.show()
'''
# preprocessor=None, stop_words=None, token_pattern=r"(?u)\b\w+\b", ngram_range=(1,1), max_features=None)
# tf = vectorizer.fit_transform(clean_content)

n_features = 1000
tf_vectorizer = TfidfVectorizer(strip_accents='unicode', max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(clean_content)

# Define the number of topics
n_topics = 1
# sklearn's LatentDirichletAllocation takes n_components (formerly n_topics)
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='online', learning_offset=50,
                                random_state=0)
lda.fit(tf)

# n_topics = 5
# model = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)
# model.fit(tf)
'''
# Topic-word distribution
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))
print(clean_content[:3])
print(topic_word[:, :3])
for n in range(5):
    sum_pr = sum(topic_word[n, :])
    print("topic: {} sum: {}".format(n, sum_pr))
def fit(self, params=solve_shared.Params(), callback=None):
    """Fits a model to this Corpus.

    params is a Params object from solve-shared.

    callback, if provided, should take two numbers - the first is the number of
    iterations done, the second the number of iterations that need to be done;
    used to report progress. Note that it will probably not be called for every
    iteration, for reasons of efficiency."""
    lda.fit(self, params, callback)
                 '一种','位于','之一','天空','没有','很多','有点','什么','五个',
                 '特别','微博','链接','全文','展开','网页','自己','今天','现在','视频'],
    max_df=0.99, min_df=0.002)  # Drop words whose document frequency is too high or too low

tf = tf_vectorizer.fit_transform(corpus)
print(tf.shape)
print(tf)

# ------------------------- Step 3: LDA analysis ------------------------
# Set the number of topics
n_topics = 1
lda = lda.LDA(n_topics=1, n_iter=1500, random_state=1)
lda.fit(tf.A.astype(np.int32))

# Show the topic-word matrix (model.topic_word_)
print(lda.components_)        # one row per topic
print(lda.components_.shape)  # as many columns as keywords

# Topic-keyword distribution
def print_top_words(model, tf_feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):  # lda.components_ is equivalent to model.topic_word_
        print('Topic #%d:' % topic_idx)
        print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print("")

# With the function defined, print the top 20 keywords for each topic
n_top_words = 20
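# The comment above says to print the top 20 keywords per topic once the function is
# defined, but the call itself is not shown; this is the assumed invocation, reusing
# the tf_vectorizer fitted earlier.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)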
cur = con.cursor()
cur.execute("select * from headlines")
results = cur.fetchall()

# tf-idf the articles
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform([*map(lambda x: x['text'], results)])
for item in X[0]:
    print(item)
# print(vectorizer.get_feature_names())

svd = TruncatedSVD(n_components=100, n_iter=100)
lda = LatentDirichletAllocation(n_components=10)
L = lda.fit(X)
S = svd.fit(X)

# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)
# X = lsa.fit_transform(X)

terms = vectorizer.get_feature_names()
for i, comp in enumerate(S.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:20]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")

# jaccard similarity on vector
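# The trailing comment points at Jaccard similarity between document vectors; the
# implementation is not shown, so here is a minimal sketch that treats each tf-idf
# row as the set of terms with non-zero weight (an assumption about the intent).
def jaccard_similarity(row_a, row_b):
    """Jaccard similarity between two sparse rows, compared as sets of non-zero terms."""
    a = set(row_a.nonzero()[1])
    b = set(row_b.nonzero()[1])
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

# Example: similarity between the first two headlines.
print(jaccard_similarity(X[0], X[1]))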
def fit(self, params=solve_shared.Params(), callback=None):
    """Fits a model to this Corpus.

    params is a Params object from solve-shared.

    callback, if provided, should take two numbers - the first is the number of
    iterations done, the second the number of iterations that need to be done;
    used to report progress. Note that it will probably not be called for every
    iteration, for reasons of efficiency."""
    lda.fit(self, params, callback)
matrix_data_list = glob.glob("ECJ_gendered/matrix_data_*.p")
np.random.shuffle(matrix_data_list)
# Accumulators must exist before the loop for the non-sklearn path, which keeps
# appending across chunks instead of resetting per document
row, col, data = np.array(()), np.array(()), np.array(())
for doc in tqdm.tqdm(matrix_data_list):
    if MODEL == "sklearn":
        row, col, data = np.array(()), np.array(()), np.array(())
    print("Partial fitting", doc)
    res = pickle.load(open(doc, "rb"))
    row = np.append(row, np.int32(res["I"]))
    col = np.append(col, np.int32(res["J"]))
    data = np.append(data, np.int32(res["data"]))
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))
    if MODEL == "sklearn":
        lda.partial_fit(X)
    if MODEL != "sklearn":
        lda.fit(X)
    # break
print("Training done")

def print_top_words(model, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([idx_to_word[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

def print_sentence_and_topic(sentence, topic):
    print(colored("Sentence:", "blue"), colored(sentence, "green"))
    print(colored("Topic: ", "blue"), colored(topic, "red"))

print_top_words(lda, 20)