def lda_tuner(ingroup_otu, best_models):
    """Grid-search LDA priors and record every run that sets a new best score.

    For each topic count, every (doc_topic_prior, topic_word_prior) pair from
    the fixed grids is fitted on one half of a new ``train_test_split`` (with
    ``test_size=0.5``) and scored on the other half. After the grid, one extra
    run uses ``1 / n_topics`` for both priors. Whenever a run beats the best
    held-out score seen so far, its settings and metrics are printed and
    appended to ``best_models``.

    Args:
        ingroup_otu: Object exposing the count matrix as ``.values``.
        best_models: List receiving one dict per improving run (keys ``n``,
            ``dtp``, ``twp``, ``score``, ``perp``). Mutated in place.

    Returns:
        The same ``best_models`` list.
    """
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values

    # A dict keeps the running state writable from the nested helper without
    # relying on ``nonlocal``.
    state = {'best_score': -np.inf, 'eval_counter': 0}

    def evaluate(topics, dtp, twp, report_format):
        """Fit and score one configuration; log it if it is the new best."""
        state['eval_counter'] += 1
        X_train, X_test = train_test_split(X, test_size=0.5)
        lda = LatentDirichletAllocation(n_components=topics,
                                        doc_topic_prior=dtp,
                                        topic_word_prior=twp,
                                        learning_method='batch',
                                        random_state=42,
                                        max_iter=20)
        lda.fit(X_train)
        this_score = lda.score(X_test)
        this_perplexity = lda.perplexity(X_test)
        if this_score > state['best_score']:
            state['best_score'] = this_score
            print("New Max Likelihood: {}".format(this_score))
            print(report_format.format(state['eval_counter'], topics, dtp,
                                       twp, this_score, this_perplexity))
            best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                'score': this_score,
                                'perp': this_perplexity})

    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                evaluate(topics, dtp, twp,
                         "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}")
        # Extra run after the grid with the symmetric 1/K priors.
        evaluate(topics, 1. / topics, 1. / topics,
                 "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}")
    return best_models
def test_perplexity_input_format():
    """Check that perplexity is the same for sparse and dense input."""
    n_components, sparse_X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method='batch',
        total_samples=100,
        random_state=0,
    )
    model.fit(sparse_X)

    sparse_perplexity = model.perplexity(sparse_X)
    dense_perplexity = model.perplexity(sparse_X.toarray())
    assert_almost_equal(sparse_perplexity, dense_perplexity)
def plot_perplexity_iter(A_tfidf, num_topics):
    """Plot perplexity of online LDA against the number of mini-batch updates.

    Fits a fresh model for ``max_iter`` = 1..5 on ``A_tfidf``, records the
    perplexity on the same data and the model's ``n_batch_iter_``, saves the
    perplexities to ``./data/perplexity_iter.npy`` and the figure to
    ``./figures/perplexity_iter.png``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
        num_topics: Number of LDA components.
    """
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        lda = LatentDirichletAllocation(n_components=num_topics,
                                        max_iter=sweep,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)

    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
def plot_perplexity_topics(A_tfidf):
    """Plot perplexity of online LDA against the number of topics K.

    Fits one model per K taken from five log-spaced values between 10 and 100
    and evaluates perplexity on the training matrix. Saves the perplexities to
    ``./data/perplexity_topics.npy``, the K values to
    ``./data/perplexity_topics2.npy`` and the figure to
    ``./figures/perplexity_topics.png``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
    """
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
    num_topics = np.logspace(1, 2, 5).astype(int)
    perplexity = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_components=int(k),
                                        max_iter=max_iter,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K= %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))

    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)

    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
def plot_perplexity_batch(A_tfidf, num_docs):
    """Plot perplexity of online LDA per mini-batch size and sweep count.

    For each batch size in 64..1024 (powers of two) and each ``max_iter`` in
    1..5, fits a fresh 10-topic model and records perplexity on the training
    matrix together with ``n_batch_iter_``. Both grids are saved under
    ``./data`` and the figure under ``./figures``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
        num_docs: Unused; kept for interface compatibility.
    """
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size), max_iter))
    em_iter = np.zeros((len(batch_size), max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_components=num_topics,
                                            max_iter=sweep,
                                            learning_method='online',
                                            batch_size=int(mini_batch),
                                            random_state=0,
                                            n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_

    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)

    f = plt.figure()
    for mb in range(len(batch_size)):
        # A random RGB triple gives each batch size its own line colour.
        plt.plot(em_iter[mb, :], perplexity[mb, :],
                 color=np.random.rand(3,), marker='o', lw=2.0,
                 label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
def test_lda_perplexity():
    """Check that more training iterations do not increase perplexity.

    Runs for both the 'online' and 'batch' learning methods, with and without
    sub-sampling. Uses ``n_components`` and the two-argument ``perplexity``
    call, because ``n_topics`` and the ``doc_topic_distr`` argument were
    removed from scikit-learn.
    """
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1,
                                          learning_method=method,
                                          total_samples=100,
                                          random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10,
                                          learning_method=method,
                                          total_samples=100,
                                          random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)

        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
def test_lda_score_perplexity():
    """Check that perplexity equals exp(-score / total word count)."""
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        random_state=0,
    )
    model.fit(X)

    reported_perplexity = model.perplexity(X, sub_sampling=False)

    log_likelihood = model.score(X)
    total_word_count = np.sum(X.data)
    derived_perplexity = np.exp(-1. * (log_likelihood / total_word_count))

    assert_almost_equal(reported_perplexity, derived_perplexity)
def test_lda_perplexity(method):
    """Check that ten iterations give perplexity no higher than one iteration.

    ``method`` is the ``learning_method`` handed to both models; the check is
    done with and without sub-sampling.
    """
    n_components, X = _build_sparse_mtx()
    shared_settings = dict(
        n_components=n_components,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    short_run = LatentDirichletAllocation(max_iter=1, **shared_settings)
    long_run = LatentDirichletAllocation(max_iter=10, **shared_settings)

    short_run.fit(X)
    short_perplexity = short_run.perplexity(X, sub_sampling=False)

    long_run.fit(X)
    long_perplexity = long_run.perplexity(X, sub_sampling=False)
    assert_greater_equal(short_perplexity, long_perplexity)

    short_subsampled = short_run.perplexity(X, sub_sampling=True)
    long_subsampled = long_run.perplexity(X, sub_sampling=True)
    assert_greater_equal(short_subsampled, long_subsampled)
def test_lda_fit_perplexity():
    """Check that ``bound_`` set during fit matches ``perplexity`` on the training data."""
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method='batch',
        random_state=0,
        evaluate_every=1,
    )
    model.fit(X)

    # Value stored by the fit method.
    perplexity_from_fit = model.bound_
    # Value recomputed on the training set.
    perplexity_from_method = model.perplexity(X)

    assert_almost_equal(perplexity_from_fit, perplexity_from_method)
def test_topic_ks(text, ck=80):
    """Print log-likelihood and perplexity for every topic count in [40, ck).

    Args:
        text: List of documents.
        ck: Exclusive upper bound on the number of topics tried.
    """
    doc_term_counts = CountVectorizer(stop_words='english').fit_transform(text)
    print("testing Ks...")
    for number_topics in range(40, ck):
        print("K =", number_topics)
        model = LDA(n_components=number_topics, n_jobs=-1)
        model.fit(doc_term_counts)
        # Log likelihood: higher is better.
        log_likelihood = model.score(doc_term_counts)
        print("---> Log Likelihood: ", log_likelihood)
        # Perplexity: lower is better.
        # Perplexity = exp(-1. * log-likelihood per word).
        perplexity = model.perplexity(doc_term_counts)
        print("---> Perplexity: ", perplexity)
def __init__(self, X, features, Klist=None, random_state=0):
    """Fit one LDA model per topic count and keep its perplexity and score.

    Args:
        X: Matrix passed to ``fit``, ``perplexity`` and ``score``.
        features: Stored as-is on the instance.
        Klist: Topic counts to try. Defaults to ``list(range(1, 10))``; a new
            list is created per call so instances never share a default.
        random_state: Seed forwarded to every model.
    """
    if Klist is None:
        Klist = list(range(1, 10))
    self.Klist = Klist
    self.features = features
    self.random_state = random_state
    self.X = X
    self.lda = []
    self.perplex = []
    self.score = []
    for k in Klist:
        lda = LatentDirichletAllocation(n_components=k,
                                        random_state=random_state)
        lda.fit(X)
        self.lda.append(lda)
        px = lda.perplexity(X)
        ll = lda.score(X)
        self.perplex.append(px)
        self.score.append(ll)
        print('K = %i, perplex = %f, log-like = %f' % (k, px, ll))
def lda_analysis(tf, tf_vectorizer):
    """Fit a two-topic online LDA model, print its perplexity and top words.

    :param tf: matrix passed to ``fit`` and ``perplexity``
    :param tf_vectorizer: vectorizer providing ``get_feature_names()``
    :return: the fitted model
    """

    def print_top_words(model, tf_feature_names, n_top_words):
        """Print the ``n_top_words`` highest-weighted feature names per topic."""
        # One row of components_ per topic, one column per term.
        for topic_idx, topic in enumerate(model.components_):
            print('Topic #%d:' % topic_idx)
            best_first = topic.argsort()[:-n_top_words - 1:-1]
            names = [tf_feature_names[i] for i in best_first]
            print(' '.join(names))
        print("")

    # Number of topics.
    n_topics = 2
    model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=100,
        learning_method='online',
        learning_offset=50,
        random_state=0,
    )
    model.fit(tf)

    # Perplexity on the training matrix.
    print(u'困惑度:')
    print(model.perplexity(tf, sub_sampling=False))

    # Show the top 20 keywords of every topic.
    n_top_words = 20
    feature_names = tf_vectorizer.get_feature_names()
    print_top_words(model, feature_names, n_top_words)
    return model
def lda(data):
    """Fit one batch LDA model per entry of ``n_topics`` and keep the best.

    Relies on the module-level names ``n_topics``, ``perplexityLst``,
    ``n_top_words`` and ``print_top_words``. The vectorizer, all fitted models
    and the lowest-perplexity model are dumped with joblib under ``model/``.

    Returns:
        Tuple ``(best_model, tf_vectorizer)``.
    """
    # Where the term-count vectorizer is saved.
    tf_ModelPath = os.path.join('model', 'tfVector.model')
    # Where the list of trained LDA models is saved.
    lda_ModelPath = os.path.join('model', 'ldaModels.model')
    bestModelPath = os.path.join('model', 'bestLDAModel.model')

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,)
    tf = tf_vectorizer.fit_transform(data)

    lda_models = []
    for idx, n_topic in enumerate(n_topics):
        model = LatentDirichletAllocation(
            n_components=n_topic,
            max_iter=8000,
            learning_method='batch',
            evaluate_every=200,
            perp_tol=0.01,
        )
        t0 = time()
        model.fit(tf)
        perplexityLst[idx] = model.perplexity(tf)
        lda_models.append(model)
        print("残差数组结果为:", perplexityLst)
        print("# of Topic: %d, " % n_topics[idx], end=' ')
        print("done in %0.3fs, N_iter %d, " % ((time() - t0), model.n_iter_),
              end=' ')
        print("Perplexity Score %0.3f" % perplexityLst[idx])

    # Report the best model (lowest perplexity).
    lowest_perplexity = min(perplexityLst)
    best_index = perplexityLst.index(lowest_perplexity)
    best_n_topic = n_topics[best_index]
    best_model = lda_models[best_index]
    print("Best # of Topic: ", best_n_topic)
    print("Best Model: ")

    # Persist the model for every topic count so they can be inspected later.
    joblib.dump(tf_vectorizer, tf_ModelPath)
    joblib.dump(lda_models, lda_ModelPath)
    joblib.dump(best_model, bestModelPath)

    # Print the topic-word distribution of the best model.
    print("#########Topic-Word Distribution#########")
    tf_vectorizer._validate_vocabulary()
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(best_model, tf_feature_names, n_top_words)
    return best_model, tf_vectorizer
def train_lda():
    """Fit a 13-topic batch LDA model and print its top words and perplexity.

    Loads a pickled vectorizer from a fixed path, re-fits it on the documents
    returned by ``get_docLst()`` and trains the model on the resulting matrix.
    """
    # ``sklearn.externals.joblib`` was removed from scikit-learn; prefer the
    # standalone package and fall back only on old installations.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    from sklearn.decomposition import LatentDirichletAllocation

    tf_ModelPath = r'E:\能搜\tf_model.pkl'
    docLst = get_docLst()
    tf_vectorizer = joblib.load(tf_ModelPath)
    tf = tf_vectorizer.fit_transform(docLst)

    n_topics = 13
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=300,
                                    learning_method='batch')
    lda.fit(tf)  # tf is the document-word sparse matrix

    n_top_words = 20
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    print('lda:', lda.perplexity(tf))
def lda_train():
    """Fit a 20-topic batch LDA model and print its outputs.

    Prints the document-topic matrix, the top 20 words per topic and the
    perplexity on the data returned by ``load_data_vector()``.
    """
    tf, count_vec = load_data_vector()

    n_topics = 20
    model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=10,
        learning_method='batch',
        random_state=0,
        perp_tol=0.01,
        topic_word_prior=0.2,
        n_jobs=-1,
    )
    model.fit(tf)

    print(model.transform(tf))

    n_top_words = 20
    feature_names = count_vec.get_feature_names()
    print_top_words(model, feature_names, n_top_words)

    print(model.perplexity(tf))
def lda_model(mat):
    """Fit a 50-topic batch LDA model and dump the document-topic matrix.

    Args:
        mat: Document-word matrix passed to ``fit_transform`` and ``perplexity``.

    Returns:
        The document-topic matrix, also written to ``doc_topic_result.dat``.
    """
    print('开始训练lda模型')
    n_topic = 50
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=1000,
                                    learning_method='batch')
    # Returns the document-topic matrix.
    docres = lda.fit_transform(mat)
    # Perplexity is a float; convert it before concatenating, otherwise the
    # ``+`` raises TypeError.
    print('困惑度为:' + str(lda.perplexity(mat)))
    print(len(docres))     # number of documents
    print(len(docres[0]))  # number of topics
    print('lda模型训练结束')
    # Save the document-topic matrix.
    docres.dump('doc_topic_result.dat')
    return docres
def k_grid_search(X, test_size=0.25, gridval=[10, 100, 10], n_iter=30, seed=23):
    """Score online LDA on a held-out split for a range of topic counts.

    Args:
        X: Matrix split once with ``train_test_split``.
        test_size: Held-out fraction forwarded to ``train_test_split``.
        gridval: ``[start, stop, step]`` defining the topic counts via ``range``.
        n_iter: ``max_iter`` for every model.
        seed: Random state for both the split and the models.

    Returns:
        Tuple ``(topic_counts, log_likelihoods, perplexities)`` of lists.
    """
    X_train, X_test = train_test_split(X, test_size=test_size,
                                       random_state=seed)
    start, stop, step = gridval[0], gridval[1], gridval[2]
    topic_counts = list(range(start, stop, step))

    loglik = []
    perplex = []
    for k in topic_counts:
        print("Estimating model at k: {}".format(k))
        model = LatentDirichletAllocation(
            n_components=k,
            max_iter=n_iter,
            learning_method='online',
            learning_offset=50.,
            random_state=seed,
            n_jobs=6,
        )
        model.fit(X_train)
        loglik.append(model.score(X_test))
        perplex.append(model.perplexity(X_test))
        # Drop the reference before the next model is built.
        del model
    return topic_counts, loglik, perplex
class Model:
    """Thin wrapper around a batch LDA model for one doc-term matrix."""

    def __init__(self, V, K=None, train=True):
        """
        V: doc-term matrix (n docs x n terms)
        K: number of topics (n topics)
        train: when False no estimator is created; call ``load`` instead
        """
        self.V = V
        if train:
            self.K = K
            # ``n_topics`` was removed from scikit-learn in favour of
            # ``n_components``.
            self.model = LatentDirichletAllocation(n_components=self.K,
                                                   max_iter=25,
                                                   learning_method='batch')

    def train(self):
        """
        V: doc-term matrix (n docs x n terms)
        W,H: factorization W*H, W is doc-topic, H is topic-term
        """
        self.W = self.model.fit_transform(self.V)
        self.H = self.model.components_

    def predict(self):
        """
        Store the doc-topic matrix (W) for V on the instance, where V = W*H
        """
        self.W = self.model.transform(self.V)

    def load(self, filename):
        """Load the model by unpickling and refresh H from it."""
        with open(filename, 'rb') as fid:
            self.model = pickle.load(fid)
        self.H = self.model.components_

    def save(self, filename):
        """Save the model by pickling."""
        with open(filename, 'wb') as fid:
            pickle.dump(self.model, fid)

    def calculate_perplexity(self):
        """
        :return: perplexity of model for this dataset
        """
        # ``perplexity`` no longer takes a doc-topic matrix; a second
        # positional argument would be read as ``sub_sampling``.
        return self.model.perplexity(self.V)
def plot_perplexity_batch(A_tfidf, num_docs):
    """Plot perplexity of online LDA per mini-batch size and sweep count.

    For each batch size in 64..1024 (powers of two) and each ``max_iter`` in
    1..5, fits a fresh 10-topic model and records perplexity on the training
    matrix together with ``n_batch_iter_``. Both grids are saved under
    ``./data`` and the figure under ``./figures``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
        num_docs: Unused; kept for interface compatibility.
    """
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    grid_shape = (len(batch_size), max_iter)
    perplexity = np.zeros(grid_shape)
    em_iter = np.zeros(grid_shape)
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_components=num_topics,
                                            max_iter=sweep,
                                            learning_method='online',
                                            batch_size=int(mini_batch),
                                            random_state=0,
                                            n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_

    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)

    f = plt.figure()
    for mb in range(len(batch_size)):
        # A random RGB triple gives each batch size its own line colour.
        plt.plot(em_iter[mb, :], perplexity[mb, :],
                 color=np.random.rand(3, ), marker='o', lw=2.0,
                 label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
def build_topics(use_spacy=True):
    """Fit a 100-topic online LDA model and pickle the model and count matrix.

    The data samples and stop list come from the project helpers
    ``build_data_samples``, ``build_greek_stoplist`` and ``build_gg_stoplist``.
    The fitted model is written to ``lda_model.pickle`` and the term-count
    matrix to ``tf.pickle`` in the current directory.

    Args:
        use_spacy: Forwarded to ``build_data_samples``.
    """
    greek_stopwords = build_greek_stoplist()
    data_samples, indices = build_data_samples(use_spacy=use_spacy)
    greek_stopwords, words = build_gg_stoplist(data_samples, greek_stopwords)

    # Initial parameters
    no_features = 1000              # Number of features
    n_samples = len(data_samples)   # Number of data samples
    no_top_words = 100              # Number of top words in each topic
    n_components = 100              # Number of topics
    # How many correlations under each topic
    no_top_data_samples = math.ceil(n_samples / n_components)

    # LDA is a probabilistic graphical model, so it is fed raw term counts.
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=no_features,
                                    stop_words=greek_stopwords)
    tf = tf_vectorizer.fit_transform(data_samples)
    tf_feature_names = tf_vectorizer.get_feature_names()

    lda_model = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        learning_method='online',
        learning_offset=50.,
        verbose=1,
        # cpu_count() - 1 is 0 on a single-core host, which is not a valid
        # worker count; always keep at least one worker.
        n_jobs=max(1, cpu_count() - 1),
        random_state=0)
    lda_model.fit(tf)
    print("Best Perplexity Score: ", lda_model.perplexity(tf))

    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_
    graph_lda, topics, top_doc_indices = process_topics(
        lda_H, lda_W, tf_feature_names, data_samples, no_top_words,
        no_top_data_samples, indices)

    # Context managers make sure both files are flushed and closed.
    with open('lda_model.pickle', 'wb') as model_file:
        pickle.dump(lda_model, model_file)
    with open('tf.pickle', 'wb') as tf_file:
        pickle.dump(tf, tf_file)
def run_perplexity_grid_search(self):
    """Fit one LDA model per topic count and tabulate training perplexity.

    Topic counts run from ``self.min_n_topics`` to ``self.max_n_topics``
    inclusive; each model is fitted and evaluated on ``self.tfid_vector``.

    Returns:
        DataFrame with columns ``n_topics`` and ``perplexity``.
    """
    topic_counts = list(range(self.min_n_topics, self.max_n_topics + 1))
    total_fits = len(topic_counts)
    perplexities = []
    for fit_number, topic_count in enumerate(topic_counts, start=1):
        print_timestamp_message(
            f'Starting lda fit iteration {fit_number} of {total_fits}')
        estimator = LatentDirichletAllocation(
            n_components=topic_count,
            max_iter=self.max_iter,
            learning_method=self.learning_method,
            random_state=self.random_state)
        fitted = estimator.fit(self.tfid_vector)
        perplexities.append(fitted.perplexity(self.tfid_vector))
    return pd.DataFrame({
        'n_topics': topic_counts,
        'perplexity': perplexities,
    })
def fit_topic_model(tweets: List[np.ndarray], n_components: int, n_words: int, vocab: List[str], trials: int) -> None:
    """Fit LDA ``trials`` times, keep the lowest-perplexity model, print its topics.

    Args:
        tweets: Data passed to ``fit`` and ``perplexity``.
        n_components: Number of topics.
        n_words: Number of top words printed per topic.
        vocab: Vocabulary indexed by feature column.
        trials: Number of independent fits; must be at least 1.

    Raises:
        ValueError: If ``trials`` is smaller than 1.
    """
    if trials < 1:
        raise ValueError('trials must be at least 1, got {0}'.format(trials))

    best_model = None
    best_perplexity = float('inf')
    for _ in range(trials):
        lda = LatentDirichletAllocation(n_components)
        lda.fit(tweets)
        perplexity = lda.perplexity(tweets)
        # Always keep the first model so a NaN or huge perplexity cannot leave
        # best_model unset.
        if best_model is None or perplexity < best_perplexity:
            best_perplexity = perplexity
            best_model = lda

    # Report the best value, not the one from the last trial.
    print('Best Perplexity: {0}'.format(best_perplexity))

    for index, component in enumerate(best_model.components_):
        top_indices = np.argsort(component)[::-1][:n_words]
        topic_words = [vocab[i] for i in top_indices]
        print('Topic {0}: {1}'.format(index, ' '.join(topic_words)))
def test_topic_ks(text, ck=20, number_words=10):
    """Clean the documents in place, then print perplexity for k = 1 .. ck-1.

    Cleaning normalises curly quotes, dashes and newlines, replaces ASCII
    punctuation with spaces, lower-cases and drops words found in the
    module-level ``stops``. Each cleaned document is written back into
    ``text``. For every k the perplexity on the training counts and its change
    from the previous k are printed.

    Args:
        text: List of documents; modified in place.
        ck: Exclusive upper bound on the number of topics tried.
        number_words: Unused.
    """
    print("cleaning and vectorizing....")
    character_fixes = (
        ('‘', '\''),
        ('’', '\''),
        ('“', '"'),
        ('”', '"'),
        ('—', '-'),
        ('\n', ' '),
    )
    punctuation_to_space = str.maketrans(string.punctuation,
                                         ' ' * len(string.punctuation))
    for position, document in enumerate(text):
        for odd_char, plain_char in character_fixes:
            document = document.replace(odd_char, plain_char)
        document = document.translate(punctuation_to_space).lower()
        kept_words = [token for token in document.split(" ")
                      if token not in stops]
        text[position] = ' '.join(kept_words)

    count_data = CountVectorizer(stop_words='english').fit_transform(text)

    print("Testing Numbers of Topics (k)")
    print("{:<3}\t{:<7}\t{:<7}".format('k:', 'perplexity:', 'delta:'))
    previous_perplexity = 0
    for number_topics in range(1, ck):
        model = LDA(n_components=number_topics, n_jobs=-1)
        model.fit(count_data)
        current_perplexity = model.perplexity(count_data)
        delta = current_perplexity - previous_perplexity
        print("{:<3}\t{:<7.3f}\t{:<7.3f}".format(number_topics,
                                                 current_perplexity, delta))
        previous_perplexity = current_perplexity
def lda_build(data, savepath, n_topic):
    """
    Train LDA on top of an existing, already fitted vectorizer.

    The vectorizer is unpickled from ``MODELS\\tfidf\\tfidf.pk``, the input is
    tokenised with ``fenci`` and transformed, and the fitted model is pickled
    to ``savepath``. The training perplexity is printed at the end.

    :param data: raw documents handed to ``fenci``
    :param savepath: file the fitted LDA model is pickled to
    :param n_topic: number of topics
    :return: None
    """
    # Close the vectorizer file as soon as it has been read.
    with open("MODELS\\tfidf\\tfidf.pk", "rb") as vectorizer_file:
        tv = pickle.load(vectorizer_file)
    data = fenci(data)
    data_tfidf = tv.transform(data)
    data_tfidf = data_tfidf.toarray()
    print(data_tfidf.shape)
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=1000,
                                    verbose=True)
    lda.fit(data_tfidf)
    with open(savepath, "wb") as f:
        pickle.dump(lda, f)
    print(lda.perplexity(data_tfidf))
def Proceeding_LDA(n_component, ngram_tf_train):
    """Fit an online LDA model and return it with its training outputs.

    Args:
        n_component: Number of topics.
        ngram_tf_train: Term-count matrix to train on.

    Returns:
        Tuple ``(lda, lda_train, lda_train_perplexity)``: the fitted model,
        the document-topic matrix and the perplexity on the training matrix.
    """
    print("Fitting LDA models with tf features,")
    print(" n_components = %d" % n_component)
    lda = LatentDirichletAllocation(
        n_components=n_component,
        learning_method='online',
        random_state=0,
    )
    # fit_transform already fits the model; a separate fit() beforehand would
    # train the same seeded model twice for the same result.
    lda_train = lda.fit_transform(ngram_tf_train)
    lda_train_perplexity = lda.perplexity(ngram_tf_train)
    print("lda_train:", type(lda_train), np.shape(ngram_tf_train))
    print("lda_train_perplexity:", lda_train_perplexity)
    return lda, lda_train, lda_train_perplexity
def lda_decomp(t, n_components, learning_method="online", learning_offset=10.0, max_iter=20, random_state=1):
    """Fit LDA on ``t`` and return the model with the transformed matrix.

    Args:
        t: Matrix to fit and transform.
        n_components: Number of topics.
        learning_method: Forwarded to ``LatentDirichletAllocation``.
        learning_offset: Forwarded to ``LatentDirichletAllocation``.
        max_iter: Forwarded to ``LatentDirichletAllocation``.
        random_state: Forwarded to ``LatentDirichletAllocation``.

    Returns:
        Tuple ``(lda, t_lda)``: the fitted model and ``lda.transform(t)``.
    """
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=max_iter,
                                    learning_method=learning_method,
                                    learning_offset=learning_offset,
                                    random_state=random_state).fit(t)
    t_lda = lda.transform(t)
    # The score and perplexity were computed here but never used or returned,
    # so those two extra passes over ``t`` have been dropped. Callers can
    # still get them from the returned model.
    return (lda, t_lda)
def train_topic_models(file):
    """Print top words for 5- and 10-topic LDA models and perplexity for K = 2..10.

    Args:
        file: Path handed to ``dtm`` to build the document-term matrix.
    """
    corpus = dtm(file, 10000)
    features = corpus.columns.values

    def fit_model(topic_count):
        """Fit an online LDA model with ``topic_count`` topics on the corpus."""
        return LatentDirichletAllocation(n_components=topic_count,
                                         max_iter=5,
                                         learning_method='online',
                                         learning_offset=50.,
                                         random_state=0).fit(corpus)

    def print_top_words(model, n_words=10):
        """Print the ``n_words`` highest-weighted feature names of each topic."""
        for weights in model.components_:
            # argsort is ascending; reverse it so the heaviest terms come first.
            top_indices = np.argsort(weights)[::-1][:n_words]
            # Append whole words. ``list += str`` would add single characters.
            print([str(features[idx]) for idx in top_indices])

    lda_5 = fit_model(5)
    lda_10 = fit_model(10)  # was fitted with 5 topics by mistake

    print("Top 10 word for 5 topic model")
    print_top_words(lda_5)
    print("Top 10 word for 10 topic model")
    print_top_words(lda_10)

    for i in range(2, 11):
        lda = fit_model(i)
        print("Perplexity{}:{}".format(i, lda.perplexity(corpus)))
def run_lda(documents, feature_names, saveFileDir, topic_nums=10, top_words_nums=20):
    """Fit an online LDA model and write its outputs as text files.

    All files share the prefix ``<saveFileDir>/LDA_TopWords_Topic<topic_nums>``:
    the top words (via ``save_topics``), the topic-word matrix, the
    document-topic matrix, the arg-max topic per document and the perplexity.

    Args:
        documents: Matrix to fit, transform and evaluate.
        feature_names: Forwarded to ``save_topics``.
        saveFileDir: Output directory.
        topic_nums: Number of topics.
        top_words_nums: Forwarded to ``save_topics``.
    """
    # ``n_topics`` was removed from scikit-learn in favour of ``n_components``.
    lda = LatentDirichletAllocation(n_components=topic_nums,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(documents)

    saveFileHeader = "%s/LDA_TopWords_Topic%s" % (saveFileDir, topic_nums)

    # Start from a clean top-words file.
    saveFile = "%s.txt" % (saveFileHeader)
    if os.path.exists(saveFile):
        os.remove(saveFile)

    # Topic top words
    save_topics(lda, feature_names, saveFile, topic_nums, top_words_nums)

    # Topic-words matrix
    np.savetxt("%s_Topic_Words_matrix.txt" % (saveFileHeader),
               lda.components_, fmt="%.6f")

    # Documents-topics matrix and the most likely topic per document
    documents_topics = lda.transform(documents)
    np.savetxt("%s_Document_Topics_matrix.txt" % (saveFileHeader),
               documents_topics, fmt="%.6f")
    np.savetxt("%s_Document_Topic.txt" % (saveFileHeader),
               np.argmax(documents_topics,
                         axis=1).reshape(len(documents_topics), 1),
               fmt="%d")

    # Perplexity, written after a leading -1 entry.
    np.savetxt("%s_perplexity.txt" % (saveFileHeader),
               [-1, lda.perplexity(documents)], fmt="%.6f")
def plot_perplexity_topics(A_tfidf):
    """Plot perplexity of online LDA against the number of topics K.

    Fits one model per K taken from five log-spaced values between 10 and 100
    and evaluates perplexity on the training matrix. Saves the perplexities to
    ``./data/perplexity_topics.npy``, the K values to
    ``./data/perplexity_topics2.npy`` and the figure to
    ``./figures/perplexity_topics.png``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
    """
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    # ``np.int`` was removed from NumPy; the builtin ``int`` is equivalent.
    num_topics = np.logspace(1, 2, 5).astype(int)
    perplexity = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_components=int(k),
                                        max_iter=max_iter,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K= %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))

    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)

    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
def LDA_SK(data_vectorized, vectorizer):
    """Fit a 10-topic online LDA model and print its log-likelihood and perplexity.

    Args:
        data_vectorized: Matrix to fit, transform and evaluate.
        vectorizer: Unused; kept for interface compatibility.

    Returns:
        The document-topic matrix from ``fit_transform``.
    """
    # Build the LDA model. Only ``n_components`` is passed: ``n_topics`` was
    # removed from scikit-learn, and passing both is rejected.
    lda_model = LatentDirichletAllocation(batch_size=128,
                                          doc_topic_prior=None,
                                          evaluate_every=-1,
                                          learning_decay=0.7,
                                          learning_method='online',
                                          learning_offset=10.0,
                                          max_doc_update_iter=100,
                                          max_iter=10,
                                          mean_change_tol=0.001,
                                          n_components=10,
                                          n_jobs=-1,
                                          perp_tol=0.1,
                                          random_state=100,
                                          topic_word_prior=None,
                                          total_samples=1000000.0,
                                          verbose=0)
    lda_output = lda_model.fit_transform(data_vectorized)

    # Log likelihood: higher is better.
    print("Log Likelihood: ", lda_model.score(data_vectorized))
    # Perplexity: lower is better.
    # Perplexity = exp(-1. * log-likelihood per word).
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    return lda_output
def cluster_sk_latent_dirichlet_allocation(content):
    """Fit scikit-learn LDA from a request dict and wrap the results as JSON.

    ``content`` supplies ``n_components``, ``learning_method``,
    ``learning_decay``, ``learning_offset``, ``mean_change_tol`` and ``data``.
    The response carries the document-topic matrix, the components, both
    iteration counters, the perplexity and the score.
    """
    estimator = LatentDirichletAllocation(
        n_components=content['n_components'],
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method=content['learning_method'],
        learning_decay=content['learning_decay'],
        learning_offset=content['learning_offset'],
        max_iter=10,
        batch_size=128,
        mean_change_tol=content['mean_change_tol'],
        n_jobs=-1)
    doc_topics = estimator.fit(content['data']).transform(content['data'])
    payload = {
        'result': doc_topics.tolist(),
        'components': estimator.components_.tolist(),
        'batchIter': estimator.n_batch_iter_,
        'nIter': estimator.n_iter_,
        'perplexity': estimator.perplexity(content['data']),
        'score': estimator.score(content['data']),
    }
    serialized = json.dumps(payload, ignore_nan=True)
    return httpWrapper(serialized)
class DMFVI(Model):
    """LDA-backed model wrapper around ``LatentDirichletAllocation``.

    NOTE(review): ``self.topic_dim`` and ``self.reader`` are not set in this
    class; presumably the base ``Model.__init__`` provides them. Confirm.
    """

    MODEL_NAME = "dmfvi"
    # Only these keys are forwarded from the config to the estimator.
    _default_cfg = {
        "learning_method": "batch",
        "max_iter": 10,
        "batch_size": 128,
        "perp_tol": 0.1,
        "evaluate_every": 10
    }

    def __init__(self, cfg, train_cfg):
        """Merge ``cfg`` over the defaults and build the estimator."""
        super(DMFVI, self).__init__(cfg, train_cfg)
        self.cfg = copy.deepcopy(self._default_cfg)
        self.cfg.update(cfg)
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        model_kwargs = {k: v for k, v in self.cfg.items()
                        if k in self._default_cfg}
        self.model = LatentDirichletAllocation(n_components=self.topic_dim,
                                               verbose=2,
                                               **model_kwargs)
        print("DMFVI: Use model configration:\n{}".format(
            "\n".join("\t{:30}: {}".format(k, v)
                      for k, v in sorted(model_kwargs.items(),
                                         key=lambda item: item[0]))))

    def perplexity(self, x):
        """Return the model perplexity on ``x`` converted to an array."""
        return self.model.perplexity(np.array(x))

    def topic_prop(self, x):
        """Return the model's ``transform`` output for ``x``."""
        return self.model.transform(x)

    @property
    def topic_components(self):
        """The fitted model's ``components_``."""
        return self.model.components_

    def train(self):
        """Fit the model on the one-hot encoding of all non-empty training items."""
        train_data = self.reader.get_data_from_type("train")
        self.model.fit(np.array([self.reader.onehot(data)
                                 for data in train_data if data != []]))
        print ("{}: trained for {} epochs; {} EM iterations.".format(
            datetime.now(), self.model.n_iter_, self.model.n_batch_iter_))

    def save(self, path):
        """Pickle the model to ``path``."""
        # Pickles are binary data; use binary mode and close the file.
        with open(path, "wb") as model_file:
            cPickle.dump(self.model, model_file)

    def load(self, path):
        """Replace the model with the one unpickled from ``path``."""
        with open(path, "rb") as model_file:
            self.model = cPickle.load(model_file)
def run_multiple_LDA(biom_data, file_name, n_com_list):
    '''Return list of LDA models with number of communities specified in n_com_list

    Takes ``biom_data.matrix_data``, transposes it and casts it to int. Runs
    scikit-learn LDA for each number of communities specified, prints the
    final perplexity on the training data and the run time, and pickles every
    fitted model, one after another, into ``file_name``.
    '''
    models = []
    SampleX = biom_data.matrix_data.transpose().astype('int')
    # The context manager flushes and closes the output file even if a fit
    # raises part-way through.
    with open(file_name, 'wb') as f:
        for i in n_com_list:
            starttime = time.time()
            model = LatentDirichletAllocation(n_components=i,
                                              learning_method='batch',
                                              max_iter=100,
                                              evaluate_every=10,
                                              max_doc_update_iter=100)
            model.fit(SampleX)
            print('perplexity', model.perplexity(SampleX))
            endtime = time.time()
            print(endtime - starttime)
            pickle.dump(model, f)
            models.append(model)
    return models
def plot_perplexity_iter(A_tfidf, num_topics):
    """Plot perplexity of online LDA against the number of mini-batch updates.

    Fits a fresh model for ``max_iter`` = 1..5 on ``A_tfidf``, records the
    perplexity on the same data and the model's ``n_batch_iter_``, saves the
    perplexities to ``./data/perplexity_iter.npy`` and the figure to
    ``./figures/perplexity_iter.png``.

    Args:
        A_tfidf: Document-term matrix passed to ``fit`` and ``perplexity``.
        num_topics: Number of LDA components.
    """
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        # ``n_topics`` was removed from scikit-learn in favour of
        # ``n_components``.
        lda = LatentDirichletAllocation(n_components=num_topics,
                                        max_iter=sweep,
                                        learning_method='online',
                                        batch_size=512,
                                        random_state=0,
                                        n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)

    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0,
             label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
def analyser(data):
    """Fit a 20-topic online LDA model on ``data`` and print diagnostics.

    Prints the document-topic matrix, the log-likelihood, the perplexity and
    the model parameters.
    """
    _, data_vectorized = get_vectorized_data(data)

    model_settings = dict(
        n_components=20,           # Number of topics
        max_iter=10,               # Max learning iterations
        learning_method='online',
        random_state=100,          # Random state
        batch_size=128,            # n docs in each learning iter
        evaluate_every=-1,         # Don't compute perplexity during fit
        n_jobs=-1,                 # Use all available CPUs
    )
    lda_model = LatentDirichletAllocation(**model_settings)

    doc_topic_matrix = lda_model.fit_transform(data_vectorized)
    print(doc_topic_matrix)

    # Log likelihood: higher is better.
    log_likelihood = lda_model.score(data_vectorized)
    print("Log Likelihood: ", log_likelihood)

    # Perplexity: lower is better.
    # Perplexity = exp(-1. * log-likelihood per word).
    perplexity = lda_model.perplexity(data_vectorized)
    print("Perplexity: ", perplexity)

    # See model parameters
    pprint(lda_model.get_params())
def Choosing_n_components(n_features, var_n_components, ngram_range, train_data, stop_words):
    """Fit one online LDA model per candidate topic count and collect perplexities.

    The whole of ``train_data`` is vectorized once with a ``CountVectorizer``
    (no hold-out split is made), and every model is fitted and scored on that
    same matrix, so the reported values are training-set perplexities.

    Args:
        n_features: ``max_features`` passed to the ``CountVectorizer``.
        var_n_components: iterable of topic counts to try.
        ngram_range: ``ngram_range`` passed to the ``CountVectorizer``.
        train_data: documents to vectorize and train on.
        stop_words: ``stop_words`` passed to the ``CountVectorizer``.

    Returns:
        list of perplexities, one per entry of ``var_n_components``, in order.
        An empty ``var_n_components`` yields an empty list without touching
        ``train_data``.
    """
    candidates = list(var_n_components)
    if not candidates:
        return []

    # The term-count matrix does not depend on the topic count, so it is
    # built once instead of once per candidate.
    ngram_tf = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=n_features)
    ngram_tf_train = ngram_tf.fit_transform(train_data)

    perplexities = []
    for n_components in candidates:
        print("\n Start LDA iteration with var_n_components")
        print("ngram_tf_train_fit_transformed:", type(ngram_tf_train), "np.shape:", np.shape(ngram_tf_train))
        print(
            "Fitting LDA models with tf features,",
            "n_components = %d, n_features = %d" % (n_components, n_features))
        lda = LatentDirichletAllocation(n_components=n_components, learning_method='online', random_state=0)
        # fit_transform already fits the model; a separate fit() beforehand
        # would train it a second time from the same seed for no benefit.
        lda_train = lda.fit_transform(ngram_tf_train)
        print("lda_train_data:", np.shape(lda_train))
        lda_train_perplexity = lda.perplexity(ngram_tf_train)
        perplexities.append(lda_train_perplexity)
        print("lda_train_perplexity:", lda_train_perplexity)
    return perplexities
# Count vectorizer
vectorizer = CountVectorizer(stop_words=stop_words, token_pattern='[a-zA-Z0-9]{3,}',)
# Use a list of the full documents as the input, not the tokens
data_vectorized = vectorizer.fit_transform(tlj['Reviews'])

# Build sklearn LDA model
skl_lda_model = LatentDirichletAllocation(
    n_components=20,          # Let's start on the higher end of topics
    max_iter=10,
    learning_method='batch',
    random_state=100,
    batch_size=128,           # only used by online learning; no effect with 'batch'
    evaluate_every=-1,        # Don't compute perplexity with every iteration
    n_jobs=-1,                # Use all available CPUs
)

# Fit model
start_time = time.time()
skl_lda_model.fit(data_vectorized)
end_time = time.time()

# Print metrics and params
print("Model Fit Time:", end_time - start_time)
print("Log-Likelihood: ", skl_lda_model.score(data_vectorized))
print("Perplexity: ", skl_lda_model.perplexity(data_vectorized))
# get_params must be called; printing the bare attribute only shows the
# bound-method repr, not the parameters.
pprint(skl_lda_model.get_params())

# Save results; each file is closed as soon as its object has been written.
with open('../../../data/pickles/lda/lda_skl_default_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(vectorizer, pkl_file)
with open('../../../data/pickles/lda/lda_skl_default_data_vectorized.pkl', 'wb') as pkl_file:
    pickle.dump(data_vectorized, pkl_file)
with open('../../../data/pickles/lda/lda_skl_default_model.pkl', 'wb') as pkl_file:
    pickle.dump(skl_lda_model, pkl_file)
# Bare expressions in this section are notebook-style inspections; they have
# no effect when the file is run as a script.
vectorizer.get_feature_names()
# Pass the feature names directly: wrapping them in an extra list makes
# pandas build MultiIndex columns instead of plain column labels.
vect_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
vect_df.shape
vect_df.head()

# Score an online LDA model for every topic count from 1 to 19.
lda_range = range(1, 20)
lda_eval = []
for n in lda_range:
    # ``n_components`` is the current name of the topic-count argument;
    # the older ``n_topics`` spelling is rejected by recent scikit-learn.
    lda = LatentDirichletAllocation(n_components=n, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(vect_df)
    score = lda.score(vect_df)
    perplexity = lda.perplexity(vect_df)
    # Single pre-formatted argument: prints the same on Python 2 and 3.
    print("%s %s %s" % (n, score, perplexity))
    lda_eval.append({'topics': n, 'score': score, 'perplexity': perplexity})

for item in lda_eval:
    print(item)

# Refit with 5 topics and inspect one document.
lda = LatentDirichletAllocation(n_components=5, n_jobs=-1)
topics = lda.fit_transform(vect_df)
lda.perplexity(vect_df)
lda.score(vect_df)
topics[2545]
# NOTE(review): ``.ix`` was removed in pandas 1.0. Switch to ``.loc`` or
# ``.iloc`` once it is confirmed whether 2545 is a label or a position.
df.ix[2545].text
n_features = 1000 n_topics = 10 n_top_words = 20 lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(corpusVect) tf_feature_names = vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words) lda.score(corpusVect) lda.perplexity(corpusVect) #### Titles corp2 = dataWeek.title CleanTextTransformer().fit(corp2) corpCTT2 = CleanTextTransformer().transform(corp2) corpCTTvect = vectorizer.fit_transform(corpCTT2) corpusTitlesVect = pd.DataFrame(corpCTTvect.todense(),columns=vectorizer.get_feature_names()) lda2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) for n in range(2,16):
# Learned topic matrix.
# matshow() opens its own figure when no fignum is given, so the handle is
# taken from the current figure afterwards; a figure created beforehand would
# stay empty and be the one written to disk.
plt.matshow(topics, cmap='gray')
f = plt.gcf()
plt.gca().set_aspect('auto')
plt.title('learned topic matrix')
plt.ylabel('topics')
plt.xlabel('dictionary')
plt.show()
f.savefig('./figures/topic.png')

# topic proportions matrix: D x K
# note: np.sum(H, axis=1) is not 1
H = lda_vb.transform(A_tfidf_sp)

plt.matshow(H, cmap='gray')
f = plt.gcf()
plt.gca().set_aspect('auto')
# Title and labels are set before show() so they appear on the displayed
# and on the saved figure.
plt.title('topic proportions')
plt.xlabel('topics')
plt.ylabel('documents')
plt.show()
f.savefig('./figures/proportions.png')

# compute perplexity
# Single pre-formatted argument: prints the same on Python 2 and 3.
print("perplexity: %.2f" % lda_vb.perplexity(A_tfidf_sp))

plot_perplexity_iter(A_tfidf_sp, num_topics)
plot_perplexity_topics(A_tfidf_sp)
plot_perplexity_batch(A_tfidf_sp, A_tfidf_sp.shape[0])

print("LDA topics:")
display_topics(lda_vb, tfidf_dict, 20)
# Incrementally train lda_model and record, for every pass, the score and
# perplexity on each train/test split.
for i in range(int(max_iter / valid_iter)):
    train_s = []
    test_s = []
    train_p = []
    test_p = []
    # The messages are pre-formatted into one string so they print the same
    # on Python 2 and 3; the doubled spaces reproduce the separators of the
    # former comma-separated print statement.
    print('\ntraining  %s -th iteration' % (i * valid_iter + 1))
    # NOTE(review): splited_index is iterated once per outer pass, so it has
    # to be re-iterable (e.g. a list, not a generator) - confirm where it is
    # built.
    for train_index, test_index in splited_index:
        train_data, test_data = dataset[train_index], dataset[test_index]
        lda_model.partial_fit(train_data)
        train_s.append(lda_model.score(train_data))
        test_s.append(lda_model.score(test_data))
        train_p.append(lda_model.perplexity(train_data))
        test_p.append(lda_model.perplexity(test_data))
    train_scores.append(train_s)
    test_scores.append(test_s)
    train_perplexities.append(train_p)
    test_perplexities.append(test_p)
    print("train_scores:  %s  test_scores:  %s  train_perplexities:  %s  test_perplexities:  %s"
          % (train_scores[i], test_scores[i], train_perplexities[i], test_perplexities[i]))

# Store the collected curves under a key derived from the topic count.
dict_num_topic[str(n_component) + '_topics'] = {
    "max_iter": max_iter,
    "valid_iter": valid_iter,
    "train_scores": train_scores,
    "test_scores": test_scores,
    "train_perplexities": train_perplexities,
    "test_perplexities": test_perplexities
}
max_iter=10, # Max learning iterations random_state=100, # Random state (seed) learning_method='online', batch_size=128, # No of docs in each iter evaluate_every=-1, # Compute perplexity every n iters n_jobs=-1) # Use all available CPUs lda_output = lda_model.fit_transform(samples) print(lda_model) # Diagnose model performance with perplexity and log-likelihood # Log Likelyhood: Higher the better print "Log Likelihood: ", lda_model.score(samples) # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word) print("Perplexity: ", lda_model.perplexity(samples)) # See model parameters pprint(lda_model.get_params()) # Perform GridSearch for the best LDA model # Define Search Param search_params = { 'n_components': [6, 7, 8, 9], # take 10 topics 'learning_decay': [0.5, 0.7, 0.9], 'max_iter': [6, 7, 8, 9], 'random_state': [2018] } # Init the Model lda = LatentDirichletAllocation()
tf = tf_vectorizer.fit_transform(blogs.article_body)

# Score an online LDA model for each candidate topic count.
lda_eval2 = []
ldaRANGE = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300]
for n in ldaRANGE:
    # ``n_components`` is the current name of the topic-count argument;
    # the older ``n_topics`` spelling is rejected by recent scikit-learn.
    lda = LatentDirichletAllocation(n_components=n, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    score = lda.score(tf)
    perplexity = lda.perplexity(tf)
    # Single pre-formatted argument: prints the same on Python 2 and 3.
    print("%s %s %s" % (n, score, perplexity))
    lda_eval2.append({'topics': n, 'score': score, 'perplexity': perplexity})

for item in lda_eval2:
    print(item)

# Bare ``lda_eval22`` expressions are notebook-style inspections; they have
# no effect when the file is run as a script.
lda_eval22 = pd.DataFrame(lda_eval2)
lda_eval22

import matplotlib.pyplot as plt
lda_eval22
plt.style.use('ggplot')
# Perplexity against number of topics.
plt.scatter(lda_eval22['topics'], lda_eval22['perplexity'])