def fit_topics(data, embeddings, vocab, K):
    """Fit a topic model to bag-of-words data."""
    tic = time.time()
    model = lda(n_components=K, max_iter=100, learning_method='online',
                learning_offset=50., doc_topic_prior=1., random_state=0,
                verbose=1)
    model.fit(data)
    topics = model.components_
    lda_centers = np.matmul(topics, embeddings)
    # scikit-learn's LDA uses (online) variational Bayes, not Gibbs sampling
    print('LDA topics (online variational Bayes)')
    n_top_words = 20
    print_top_words(model, vocab, n_top_words)
    topics_words = []
    for i, topic_dist in enumerate(topics):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        topics_words.append(topic_words)
    topic_proportions = model.transform(data)
    print("LDA fit done in %0.3fs." % (time.time() - tic))
    return topics, lda_centers, topic_proportions, topics_words
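# --- Hedged usage sketch for fit_topics above (not from the original source). ---
# It assumes the module already imports time, numpy as np, the lda alias, and a
# print_top_words helper (as elsewhere in this collection); the toy corpus and the
# `glove` word -> 300-d vector lookup are hypothetical placeholders.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

docs = ["the cat sat on the mat", "dogs and cats make good pets"]
cv = CountVectorizer(stop_words='english')
data = cv.fit_transform(docs).toarray()       # (n_docs, n_words) word counts
vocab = list(cv.get_feature_names())          # words aligned with the columns of data
glove = {}                                    # hypothetical word -> embedding lookup, loaded elsewhere
embeddings = np.array([glove.get(w, np.zeros(300)) for w in vocab])  # (n_words, 300)
topics, lda_centers, topic_proportions, topics_words = fit_topics(data, embeddings, vocab, K=2)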
def lda_cluster(train_text):
    vectorizer = TfidfVectorizer(input='content', stop_words='english',
                                 lowercase=True, encoding='UTF-8',
                                 strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 2), max_features=100)
    # feature_matrix is sparse; use "feature_matrix.toarray()" to get a dense array
    feature_matrix = vectorizer.fit_transform(train_text)
    corpus_vocab = vectorizer.get_feature_names()
    corpus_vocab = np.array(corpus_vocab)

    # Now we want to find the most important words in each document.
    feature_matrix = feature_matrix.toarray()
    # Since the TF-IDF values lie between 0 and 1, for each document we only count
    # a word as part of the document if its weight exceeds the cutoff, i.e. it is
    # of particular importance:
    feature_matrix = feature_matrix > TFID_cutoff
    feature_matrix = feature_matrix.astype(int)
    # feature_matrix is now a 0/1 matrix (bag of words); corpus_vocab maps
    # column indices back to the actual words.

    my_lda = lda(n_components=num_clusters, random_state=0)
    # run the LDA algorithm (the document-topic output is not needed here)
    my_lda.fit(feature_matrix)
    # topic-word weights of size (num_topics, num_words); each cell is an
    # unnormalised pseudo-count (normalise a row to get word probabilities)
    word_topics = my_lda.components_

    top_words_list = []
    top_words_group = []
    # now see what the top words are in each category
    for idx in range(num_clusters):
        category = word_topics[idx][:]
        # argsort the word weights by index; the result goes from smallest to largest
        sorted_words = np.argsort(category)
        top_words = sorted_words[-1 * num_LDA_words:]
        print('###########')
        print('For Category number {0}'.format(idx))
        for word in top_words:
            top_words_list.append(corpus_vocab[word])
            top_words_group.append(idx)
            print(corpus_vocab[word])
    imp_word_list = pd.DataFrame(list(zip(top_words_list, top_words_group)),
                                 columns=['word', 'group'])
    return imp_word_list, num_clusters
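# --- Hedged usage sketch for lda_cluster above (illustrative only). ---
# The original reads TFID_cutoff, num_clusters and num_LDA_words as module-level
# globals without defining them, so the values below are assumptions; the imports
# used by lda_cluster (TfidfVectorizer, numpy, pandas, the lda alias) must be in scope.
TFID_cutoff = 0.1      # TF-IDF weight above which a word counts as present in a document
num_clusters = 3       # number of LDA topics
num_LDA_words = 10     # top words reported per topic

train_text = ["markets rallied on strong earnings",
              "the team won the championship game",
              "the new phone features a faster processor"]
imp_word_list, k = lda_cluster(train_text)
print(imp_word_list.head())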
def run(self):
    # n_topics was renamed to n_components in scikit-learn 0.19
    self.model = lda(n_components=100, learning_method='online')
    self.model.fit(self.doc_terms)
    feature_names = self.cv.get_feature_names()
    topic_matrix = []
    for topic_idx, topic in enumerate(self.model.components_):
        row = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
        topic_matrix.append(row)
    self.topics = np.array(topic_matrix)
def fitlda(x):
    from sklearn.decomposition import LatentDirichletAllocation as lda
    from pandas import DataFrame as df
    # l = lda(n_components=100)
    # n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21
    l = lda(n_components=ntopic)
    if isinstance(x, df):
        x = x.fillna(0)
    l.fit(x)
    return l
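# --- Hedged usage sketch for fitlda above (illustrative only). ---
# ntopic must exist as a global before the call; the toy count DataFrame is invented here.
import pandas as pd
ntopic = 2
counts = pd.DataFrame([[1, 0, 2], [0, 3, 1], [2, 1, 0]],
                      columns=['apple', 'bat', 'cat'])
model = fitlda(counts)
print(model.components_.shape)   # (ntopic, n_words)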
def fit_topics(data, embeddings, vocab, K):
    """Fit a topic model to bag-of-words data."""
    model = lda(n_components=K, max_iter=1500, random_state=1)
    model.fit(data)
    topics = model.components_
    lda_centers = np.matmul(topics, embeddings)
    # scikit-learn's LDA uses variational Bayes, not Gibbs sampling
    print('LDA topics')
    n_top_words = 20
    topics_words = []
    for i, topic_dist in enumerate(topics):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        topics_words.append(topic_words)
    print('\n')
    topic_proportions = model.transform(data)
    return topics, lda_centers, topic_proportions, topics_words
def train_topic_model(self, docs, n_clusters):
    lda_random_state = 100
    lda_n_iter = 100
    n_top_words = 20
    print("Topic modeling using LDA...")
    d2w_vect = TfidfVectorizer(stop_words='english', max_df=0.30)
    d2w = d2w_vect.fit_transform(docs)
    model = lda(n_components=n_clusters, max_iter=lda_n_iter,
                random_state=lda_random_state)
    model.fit(d2w)
    print("\nTopical words:")
    print("-" * 20)
    # vocabulary_ maps word -> column index in arbitrary order, so sort by index
    # to make words[i] the word for feature column i
    words = [w for w, i in sorted(d2w_vect.vocabulary_.items(), key=lambda wi: wi[1])]
    for i, topic_dist in enumerate(model.components_):
        top_word_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = [words[id_] for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ', '.join(topic_words)))
    # the model is already fitted above, so transform() is enough here
    topic_values = model.transform(d2w)
    return topic_values
filename = "labeled_data.csv" #input("enter .csv: ") texts = "original_post" #input("enter text field name: ") label = "5CAT" #input("enter label field name: ") texts, labels = read_file(filename, texts, label) filtered_texts = pre_clean(texts) filtered_texts = number_filter(filtered_texts) filtered_texts = drop_filter(filtered_texts) texts = untokenize(filtered_texts) #tf,voc = tf_idf(filtered_texts) #print(tf.head(2)) tf, voc = tf(texts) clf = lda(n_components=8) model = clf.fit(tf) ##print(model.components_) ##print(len(model.components_[0])) ##for i in (model.components_): ## print(max(i)) ##print(len(model.components_)) print_top_words(model, voc, 20) #trans = model.transform(texts) #print(trans)
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as lda
from scipy.sparse import csr_matrix

df2 = pd.read_pickle('all_comments_per_writer_df.pkl')
df2['count'] = 1
print(df2.shape)
print(df2.name.nunique())

article_id_u = sorted(df2.article_id.unique())
name_u = sorted(df2.name.unique())
data = df2['count'].tolist()
# map writers and articles to integer row/column indices
# (astype('category', categories=...) was removed from pandas; use CategoricalDtype)
row = df2.name.astype(pd.CategoricalDtype(categories=name_u)).cat.codes
col = df2.article_id.astype(pd.CategoricalDtype(categories=article_id_u)).cat.codes
sparse_matrix = csr_matrix((data, (row, col)),
                           shape=(len(name_u), len(article_id_u)))

model = lda(n_components=15)
res = model.fit_transform(sparse_matrix)
df1 = pd.DataFrame(res)
df1.index = name_u
df1.to_csv('lda_15.csv')
""" Speeches III """ import pickle import glob import pandas as pd from sklearn.decomposition import LatentDirichletAllocation as lda from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer n_features = 1000 n_components = 15 n_top_words = 10 n_topics = 2 count_m = pickle.load(open('./output/speech_matrix.pk', 'rb')) lda_m = lda(n_components=n_topics, random_state=0) topics = lda_m.fit_transform(count_m) files = glob.glob('.\data\speeches\R0*') corpus = [] text = open('.\data\speeches\R021028A', encoding='utf-8') for name in files: try: f = open(name, encoding='utf-8') text = f.read() corpus.append(text) except UnicodeDecodeError: print(name) text = "".join(corpus) def print_top_words(model, feature_names, n_top_words): for topic_idx, topic in enumerate(model.components_):
def iptdata(data_path, embeddings_path, T=70, glove_embeddings=True, stemming=True):
    data_all = sio.loadmat(data_path, squeeze_me=True, chars_as_strings=True)  # dict
    if 'Y' in data_all:
        y_all = data_all['Y'].astype(int)
    else:
        y_all = np.concatenate(
            (data_all['yte'].astype(int), data_all['ytr'].astype(int)), axis=1)
    if 'X' in data_all:
        embed_all = data_all['X']
    else:
        embed_all = np.concatenate((data_all['xte'], data_all['xtr']), axis=1)
    if 'BOW_X' in data_all:
        BOW_all = data_all['BOW_X']
    else:
        BOW_all = np.concatenate((data_all['BOW_xte'], data_all['BOW_xtr']), axis=1)
    if 'words' in data_all:
        words_all = data_all['words']
    else:
        words_all = np.concatenate(
            (data_all['words_tr'], data_all['words_te']), axis=1)

    # build the vocabulary and a word -> embedding lookup
    vocab = []
    vocab_embed = {}
    l = len(words_all)
    for i in range(l):
        word_i = words_all[i]
        embed_i = embed_all[i]
        bow_i = BOW_all[i]
        w = len(word_i)
        for j in range(w):
            if type(word_i[j]) == str:
                if word_i[j] not in vocab:
                    vocab.append(word_i[j])
                    vocab_embed[word_i[j]] = embed_i[:, j]
            else:
                break

    # document-term count matrix over that vocabulary
    vocab_BOW = np.zeros((l, len(vocab)), dtype=int)
    for i in range(l):
        word_i = words_all[i]
        bow_i = BOW_all[i]
        w = len(word_i)
        words_idx = []
        for j in range(w):
            if type(word_i[j]) == str:
                words_idx.append(vocab.index(word_i[j]))
            else:
                break
        vocab_BOW[i, words_idx] = bow_i.astype(int)

    if glove_embeddings:
        vocab, vocab_embed, vocab_BOW = embeddings_new(vocab, vocab_BOW, embeddings_path)
    if stemming:
        vocab, vocab_embed, vocab_BOW = stem_vocab(vocab_BOW, vocab, vocab_embed)

    ####################################################
    # per-document lists of word embeddings, each word repeated by its count
    # (kept from the original; not used in the outputs below)
    l1_BOW, l2_BOW = vocab_BOW.shape
    embed_dat = [[] for _ in range(l1_BOW)]
    for i in range(l2_BOW):
        for d in range(l1_BOW):
            if vocab_BOW[d, i] > 0:
                for _ in range(vocab_BOW[d, i]):
                    embed_dat[d].append(vocab_embed[vocab[i]])
    doc_embeddings = [np.array(doc_i) for doc_i in embed_dat]

    # Matrix of word embeddings, one row per vocabulary word
    embeddings = np.array([vocab_embed[w] for w in vocab])

    model = lda(n_components=T, random_state=1)
    model.fit(vocab_BOW)
    topics = model.components_
    n_top_words = 20
    topic_dict = {}
    topic_proportions = model.transform(vocab_BOW)

    #cost_embeddings_cos = cosine_similarity(embeddings, embeddings)
    cost_embeddings = euclidean_distances(embeddings, embeddings) ** 1
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    cost_m = np.zeros((topics.shape[0], topics.shape[0]))
    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            # An earlier variant restricted each topic to its top words:
            # i_list = topic_dict[i].astype(bool)
            # j_list = topic_dict[j].astype(bool)
            # topic_i = topics[i][i_list]
            # topic_j = topics[j][j_list]
            # cost_e = np.ascontiguousarray(cost_embeddings[i_list][:, j_list])
            # cost_m[i, j] = ot.emd2(topic_i, topic_j, cost_e, numItermax=10000)
            #
            # ot.emd2 expects histograms of equal total mass, so normalise the
            # topic-word weights into distributions before computing the cost
            cost_topics[i, j] = ot.emd2(topics[i] / topics[i].sum(),
                                        topics[j] / topics[j].sum(),
                                        cost_embeddings, numItermax=10000)
    cost_topics = cost_topics + np.transpose(cost_topics)

    outputs = {
        'BOW': vocab_BOW,
        'class': y_all - 1,
        'topic_proportions': topic_proportions,
        'cost_embeddings': cost_embeddings,
        'cost_topics': cost_topics
    }
    return outputs
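# --- Hedged usage sketch for iptdata above (illustrative only). ---
# The .mat and GloVe paths are placeholders; the function also relies on sio (scipy.io),
# ot (POT), euclidean_distances, and the embeddings_new / stem_vocab helpers being defined.
outputs = iptdata('path/to/dataset.mat', 'path/to/glove.6B.300d.txt', T=70)
print(outputs['topic_proportions'].shape)   # (n_docs, T)
print(outputs['cost_topics'].shape)         # (T, T) topic-to-topic transport costs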
import csv
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation as lda
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

COMPILATION = 'polytics'


def read(name, sign=','):
    with open(name + '.csv', 'r') as file:
        return [i for i in csv.reader(file, delimiter=sign, quotechar=' ')]


vectors = read('data/history/{}/vectors'.format(COMPILATION))
dataset = np.array(vectors, dtype='float')

model = lda(n_components=2)  # 60, max_iter=30, n_jobs=6, learning_method='batch', verbose=1
model.fit(dataset)
for i in model.components_:
    print(i)
# model.show_topic(0, topn=10)  # show_topic is a gensim method; sklearn's LDA has no equivalent

joblib.dump(model, 'data/history/{}/lda.txt'.format(COMPILATION))
mod.components_.shape
mod.transform(matrix)
topics = pd.DataFrame({'topic1': mod.components_[0], 'topic2': mod.components_[1]},
                      index=tf.get_feature_names())
mod.transform(matrix)
topics['topic1'].sort_values(ascending=False).head()
topics['topic2'].sort_values(ascending=False).head()

## Using lda
from sklearn.decomposition import LatentDirichletAllocation as lda
mod1 = lda(n_components=2)  # n_topics was renamed to n_components in scikit-learn 0.19
mod1.fit(matrix)
mod1.components_
topics_lda = pd.DataFrame({'topic1': mod1.components_[0], 'topic2': mod1.components_[1]},
                          index=tf.get_feature_names())
mod1.transform(matrix)
topics_lda['topic1'].sort_values(ascending=False).head()
topics_lda['topic2'].sort_values(ascending=False).head()

## Visualising lda model
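# --- Hedged setup sketch for the notebook cells above (not from the original). ---
# `tf` and `matrix` are assumed to come from a CountVectorizer over a small corpus,
# and `mod` from some previously fitted two-component model; NMF stands in here for
# whatever the earlier cells actually used.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
import pandas as pd

corpus = ["economy jobs growth", "election votes polls", "economy and votes"]
tf = CountVectorizer()
matrix = tf.fit_transform(corpus)
mod = NMF(n_components=2).fit(matrix)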
def run(self):
    self.model = lda(learning_method='online')
    self.model.fit(self.doc_terms)