def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15

    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, max_features=n_features,
                                 min_df=2, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    lda = LDA(n_topics=n_topics, kappa=0.7, tau0=1024., n_jobs=4,
              random_state=0)
    feature_names = vectorizer.get_feature_names()
    # print(feature_names[:10])

    start_time = time.time()  # time.clock() was removed in Python 3.8
    lda.fit(doc_word_count)
    end_time = time.time()

    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print('run time = %.3f seconds' % (end_time - start_time))

def fit_reuters():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, n_iter=50)
    model.save_model(protocol=2)

def main():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, valid_split=0.1, n_iter=10)
    perplexity = model.perplexity(corpus.docs)
    print(perplexity)

class TopicModelingLDA(object):
    # Wrapper around the lda library.
    # Characterizes topics using several scores found in the literature.

    def __init__(self, corpus, metrics_criteria='simple'):
        super(TopicModelingLDA, self).__init__()
        self.corpus = corpus
        self.select_metric_criteria(metrics_criteria)
        self.model = None
        self.topic_words = None
        self.top_words = None
        self.all_words = []

    def fit(self, num_topic=5, n_iter=1500):
        count_vect = CountVectorizer()
        x_train_counts = count_vect.fit_transform(self.corpus)
        self.model = LDA(n_topics=num_topic, n_iter=n_iter, random_state=1)
        self.model.fit(x_train_counts)
        self.topic_words = self.model.topic_word_
        self.vocabulary = count_vect.get_feature_names()

    def select_metric_criteria(self, metrics_criteria):
        if metrics_criteria == 'term_score':
            self.metrics = TopicTermScore()
        else:
            self.metrics = TopicSimpleScore()

    def get_highest_scores(self, k_top=10):
        # topic_words is a (number of topics, vocabulary size) matrix;
        # row k holds the word distribution of topic k.
        num_topics = len(self.topic_words)
        print("Number of topics", num_topics)
        self.top_words = {}
        for topic_k in range(num_topics):
            scores = []
            for v, word in enumerate(self.vocabulary):
                score = self.metrics.calculate(self.topic_words, topic_k, v)
                scores.append((word, score))
            scores.sort(key=lambda tup: tup[1])
            scores = scores[-k_top:]
            print("Topic %d" % topic_k)
            for word, score in scores:
                print("%s,%.4f" % (word, score))
            print("")
            self.top_words[topic_k] = [{'word': word, 'score': score}
                                       for word, score in scores]
            self.all_words += [word for word, score in scores]
        return self.top_words

    def get_all_words(self):
        return self.all_words

def generate_topics(self):
    file_to_tokens = self._get_normalized_corpus(self.files)
    np_matrix = self._get_document_term_matrix(file_to_tokens)
    model = LDA(n_topics=self.n_topics, n_iter=self.n_iter,
                random_state=self.random_state)
    model.fit(np_matrix)
    self._lda_model = model

def _getLDA(text, label, n_topic_words):
    vectorizer = CountVectorizer(min_df=100, max_df=5000)
    transformer = TfidfTransformer()
    df = vectorizer.fit_transform(text)
    tfidf_word_name = vectorizer.get_feature_names()

    model = LDA(n_topics=20, n_iter=1000, random_state=1)
    model.fit(df)
    Dump(model, 'LDA_model', 'joblib')

    topic_word = model.topic_word_
    doc_topic = model.doc_topic_
    with open('topic_word.txt', 'w') as f:
        n_top_words = 300
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tfidf_word_name)[
                np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {}: {}'.format(i, ' '.join(topic_words)) + '\n')
    return topic_word, doc_topic

def exampleLDAExecution():
    # document-term matrix
    X = data.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))

    # the vocab
    vocab = data.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))

    # titles for each story
    titles = data.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))

    doc_id = 0
    word_id = 3117
    print("doc id: {} word id: {}".format(doc_id, word_id))
    print("-- count: {}".format(X[doc_id, word_id]))
    print("-- word : {}".format(vocab[word_id]))
    print("-- doc  : {}".format(titles[doc_id]))

    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

def RunLDA(FileLocation, NumDocs, NumTopics):
    # To create a term-document matrix, we read in every file and build a
    # list containing the body of all of the articles.
    fin = open(FileLocation, 'r')

    # We also need to store the URLs when we make the TDM.
    UrlArray = []

    # Create the TDM object; it also removes stopwords.
    TDM = TermDocumentMatrix(simple_tokenize_remove_stopwords)

    # Add each article to the TDM object and record its URL.
    # This is a massive corpus, so we only process NumDocs articles.
    for i in range(NumDocs):
        Article = next(fin)
        UrlArray.append(re.split(r'\t', Article)[0])
        TDM.add_doc(re.split(r'\t', Article)[1])

    # TDM.rows() is an iterable; numpy needs a list.
    X = list(TDM.rows())

    # Oddly enough, the first row of TDM.rows() is the list of all words
    # used -- think of it as a header row.
    Vocab = X[0]

    # Create a 2-D list containing the remaining rows of the document matrix.
    Y = []
    for i in range(len(X) - 1):
        Y.append(X[i + 1])

    # Create the LDA model object with the requested number of topics.
    model = LDA(n_topics=NumTopics, n_iter=1500, random_state=1)

    # Make a numpy array to use as input.
    Yarray = np.asarray(Y)

    # Fit the model. This process is similar to scikit-learn's estimators.
    model.fit(Yarray)

    TopicWords = []
    topic_word = model.topic_word_
    n_top_words = 50
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(Vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        TopicWords.append(topic_words)
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

def appDescriptionsLDA():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()
    print(X)
    print(vocab)
    print(titles)
    print(X.shape)
    print(X.sum())

    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

from data import Data
from lda import LDA

data = Data()
data.load()
data.textPre('r')
tf = data.saveModel('r')

model = LDA()
model.fit(tf)
# model.print_top_words(data.tf_vectorizer.get_feature_names())

def fit_model(self, data, params):
    lda_instance = LDA(**params)
    lda_instance.fit(data)
    return lda_instance

import matplotlib.pyplot as plt
import numpy as np
from lda import LDA
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

if __name__ == '__main__':
    x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    y = y.astype(int)
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    lda = LDA()
    lda.fit(x_train, y_train)

    train_acc = (lda.predict(x_train).argmax(1) == y_train.squeeze()).mean() * 100
    test_acc = (lda.predict(x_test).argmax(1) == y_test.squeeze()).mean() * 100
    print(f'Train accuracy : {train_acc}%')
    print(f'Test accuracy : {test_acc}%')

    # plot generated samples
    images = []
    for i in range(10):  # each class
        temp = []
        for j in range(10):  # 10 samples
            temp.append(lda.generate(i).reshape(28, 28))
        images.append(temp)
    images = np.array(images)

plt.scatter(x1, x2, c=y, edgecolor='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()

# Project the data onto two linear discriminants
from lda import LDA

lda = LDA(2)
lda.fit(X_min_max, y)
X_projected = lda.transform(X_min_max)
print('Min-max normalized X:', X_min_max.shape)  # (150, 4)
print('X after LDA:', X_projected.shape)  # (150, 2)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]
plt.scatter(x1, x2, c=y, edgecolor='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))

y_train = eval("y_%s_train" % l)
X_test = eval("X_%s_test" % l)
y_test = eval("y_%s_test" % l)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 18))
ax = ax.flatten()
print(l)

""" Run LDA """
LDA_clf = LDA()
LDA_clf.fit(X_train, y_train)
lda_train_error = np.mean(LDA_clf.predict(X_train).flatten() != y_train)
lda_test_error = np.mean(LDA_clf.predict(X_test).flatten() != y_test)

plot_decision_regions(X=X_combined, y=y_combined, classifier=LDA_clf,
                      test_idx=range(X_train.shape[0],
                                     X_train.shape[0] + X_test.shape[0]),
                      ax=ax[0])
ax[0].set_xlabel("x1", fontsize="large")
ax[0].set_ylabel("x2", fontsize="large")
ax[0].legend(loc="upper right", fontsize="large")
ax[0].set_title("Generative model (LDA) on dataset %s" % l, fontsize="x-large",

import pickle
from lda import LDA
from data.datafile import AADataFile

dfile = pickle.load(open("data/datafile.pkl", "rb"))
dt = dfile.DT
te = dfile.TE

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)
perp, b, g = lda.fit(dt)

from lda import LDA

train_corpus = 'data/worldnews_train.csv'
test_corpus = 'data/worldnews_test.csv'

alpha = 0.01
beta = 0.01
topics = 5

model = LDA(topics, alpha, beta)
model.fit(train_corpus, n_iters=10000, burn=8000)
model.print_topics()

x = input('Press key to start evaluation')
model.predict(test_corpus, n_iters=1000, burn=300)
model.print_eval_results()

from lda import LDA, LdaType
import pandas as pd

data = pd.read_csv("./data/fisher.csv")

lda = LDA()
lda.fit(data=data, target_column_name='target')
conversion_data = lda.conversion(LdaType.Two)
print(conversion_data)

# In[ ]:

# scikit-learn LDA implementation
# 201
# 1121
# 4617
# model = LatentDirichletAllocation(n_topics=num_topics, max_iter=100,
#                                   learning_method='batch', random_state=201)
#                                   # doc_topic_prior=50.0/num_topics,
#                                   # topic_word_prior=200.0/num_topics
# model.fit(bag_of_words)


# In[ ]:

# lda implementation from https://github.com/ariddell/lda using collapsed Gibbs sampling
model = LDA(n_topics=num_topics, n_iter=1000, random_state=201, refresh=100)
model.fit(bag_of_words)  # model.fit_transform(X) is also available
# topic_word = model.topic_word_  # model.components_ also works


# In[ ]:

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))


# In[ ]:

feature_names = vectorizer.get_feature_names()

elif num_args == 3:
    params = {'idx_dataset': sys.argv[1], 'train': sys.argv[2] == 'True'}
elif num_args == 4:
    params = {
        'idx_dataset': sys.argv[1],
        'train': sys.argv[2] == 'True',
        'test': sys.argv[3] == 'True'
    }

# load dataset
X_train, y_train, X_test, y_test = read_data(**params)

if X_train is not None:
    # Linear Discriminant Analysis (LDA)
    lda = LDA()
    lda.fit(X_train, y_train)
    plot_results(lda, params['idx_dataset'], X_train, y_train, X_test, y_test)
    print('The accuracy on train (test) dataset {} for LDA: {} ({})'.format(
        params['idx_dataset'], lda.score(X_train, y_train),
        lda.score(X_test, y_test)))

    # Logistic regression
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    plot_results(log_reg, params['idx_dataset'], X_train, y_train,
                 X_test, y_test)
    print('The accuracy on train (test) dataset {} for LogReg: {} ({})'.format(
        params['idx_dataset'], log_reg.score(X_train, y_train),
        log_reg.score(X_test, y_test)))

utt2class = {}
for l in lines:
    utt2class[l.split()[0]] = l.split()[1]
labelled_egs = set(utt2class.keys())

mean = np.load(args.mean)
print('INFO:: Data loaded. Normalizing...')

reader = script_reader(args.scp)
X = []
Y = []
for i, data in enumerate(reader):
    utt, xvec = data
    if utt not in utt2class:
        continue
    X.append(xvec - mean)
    Y.append(label2idx[utt2class[utt]])

print('INFO:: Normalized ' + str(len(Y)) + ' utterances. Now computing LDA...')

from lda import LDA

lda = LDA()
lda.fit(np.array(X), np.array(Y))

print('INFO:: LDA done. Saving Model...')
with open(args.out + '/lda.pkl', 'wb') as output:
    pickle.dump(lda, output, pickle.HIGHEST_PROTOCOL)

def fit_model(self, data, params):
    from lda import LDA

    lda_instance = LDA(**params)
    lda_instance.fit(data)
    return lda_instance

from lda import LDA, _doc_update, _slice_doc_update
import pickle
import numpy as np

np.seterr(divide="raise")

from data.datafile import AADataFile

dfile = pickle.load(open("data/datafile.pkl", "rb"))
dt = dfile.DT
te = dfile.TE
f = te.toarray().argmax(axis=1)

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)
perp, b, g, e = lda.fit(dt, f)

from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from lda import LDA

data = datasets.load_iris()
X = data.data
y = data.target

lda = LDA(n_components=2)
lda.fit(X, y)
X_projected = lda.transform(X)

print("shape of X:", X.shape)
print("shape of transformed X:", X_projected.shape)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
plt.colorbar()
plt.show()

def clustering_measure(self, n_cluster):
    km = KMeans(n_cluster)
    km.fit(self.doc_features)
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(self.doc_class, km.labels_))

def cross_validation(self):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        self.doc_features, self.doc_class, test_size=0.4, random_state=0)
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))

if __name__ == '__main__':
    # load dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')

    # train lda
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    # lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)

    # evaluate lda
    eval = Evaluator(dataset, lda)
    eval.clustering_measure(n_cluster=5)
    eval.cross_validation()

print('input tokens from preprocessing pipeline %d' % toks)

print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output

print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_, model.doc_topic_,
                               doc_labels, vocab, dtm=dtm)

#%%

from lda import LDA
from dataset import TwentyNewsDataset
import time

dataset = TwentyNewsDataset()
dataset.load_data()

n_topics = 20
lda = LDA(n_topics)
lda.initialize(dataset.data_matrix)
lda.load_label('labels.txt', dataset.dictionary)
print(lda.print_labels())

for _ in range(100):
    lda.fit()

lda.get_topic_word()
lda.get_doc_topic()
lda.print_top_words(dataset.dictionary, 10)

train_re_path = '../data/train/relevant.txt'
train_ir_path = '../data/train/irrelevant.txt'
test2_ir_path = '../data/test2/irrelevant.txt'
test2_re_path = '../data/test2/relevant.txt'
test1_ir_path = '../data/test1/irrelevant.txt'
test1_re_path = '../data/test1/relevant.txt'

words_dict, idx_dict = create_dict(full_path, stop_words)

train_X = load_data(train_path)
train_X = word_to_idx(train_X, words_dict)

lda = LDA(5)
lda.fit(train_X, words_dict.items())

test1_re_X = load_data(test1_re_path)
test1_re_X = word_to_idx(test1_re_X, words_dict)
test1_ir_X = load_data(test1_ir_path)
test1_ir_X = word_to_idx(test1_ir_X, words_dict)
test2_re_X = load_data(test2_re_path)
test2_re_X = word_to_idx(test2_re_X, words_dict)
test2_ir_X = load_data(test2_ir_path)
test2_ir_X = word_to_idx(test2_ir_X, words_dict)

target_X = load_data(target_path)
target_X = word_to_idx(target_X, words_dict)

train_re_X = load_data(train_re_path)

n = 10000  # number of samples

data1 = Data2D(mu1, cov, n)
data2 = Data2D(mu2, cov, n)
X1 = data1.get_data()
X2 = data2.get_data()
X = np.vstack([X1, X2])

# PCA
pca = PCA()
pca.fit(X)
pca_vec = pca.get_vec()
show_hist(project(X1, pca_vec), project(X2, pca_vec))

# LDA
lda = LDA()
lda.fit(X1, X2)
lda_vec = lda.get_vec()
show_hist(project(X1, lda_vec), project(X2, lda_vec))

# Draw the graph
# Use a white background
plt.figure(facecolor="w")
axis_x = np.linspace(-10, 10)
pca_y = (pca_vec[1] / pca_vec[0]) * axis_x
lda_y = (lda_vec[0] / lda_vec[1]) * axis_x
plt.plot(axis_x, pca_y, "c-", label="PCA")
plt.plot(axis_x, lda_y, "m-", label="LDA")

# Plot the scatter diagram
plt.scatter(data1.x, data1.y, color='r', marker='x')