def nmf_analysis_comparison(self):
    print('Running NMF component comparisons')
    fig = plt.figure(figsize=(10, 2 * len(self.values)))
    columns = self.n_classes + 1
    rows = len(self.values)
    for indx, val in enumerate(self.values):
        self.nmf = decomposition.NMF(n_components=val)
        self.nmf.fit(self.x_train_flat)
        # First column: mean of the learned NMF components over the full training set.
        fig.add_subplot(rows, columns, 1 + (indx * columns))
        plt.imshow(self.nmf.components_.mean(0).reshape(48, 48), cmap=plt.cm.bone)
        plt.gca().set_title('Overall')
        plt.axis('off')
        # Remaining columns: mean NMF components fit per emotion class.
        for emo_val, i in enumerate(range(2, columns + 1)):
            t1 = self.df[self.df['emotion'] == emo_val]
            temp_x_train = np.stack(t1.pop('img_array').values)
            temp_x_train_flat = temp_x_train.reshape(temp_x_train.shape[0], -1)
            # print(temp_x_train_flat.shape)
            fig.add_subplot(rows, columns, i + (indx * columns))
            nmf = decomposition.NMF(n_components=val)
            nmf.fit(temp_x_train_flat)
            plt.imshow(nmf.components_.mean(0).reshape(48, 48), cmap=plt.cm.bone)
            plt.gca().set_title(self.emo_list[emo_val])
            plt.axis('off')
    plt.savefig('../images/nmf_images_comparison.png')
    # plt.show()
    plt.close()
def build_embedding(path, embedding=None):
    """
    Build the desired embedding and save to specified path.

    Available embeddings:
        * Interaction Matrix
        * t-SNE (unused)
        * Spectral Embedding
        * Locally Linear Embedding
        * Non-negative Matrix Factorisation
        * Factor Analysis
    """
    if embedding == 'spectral':
        mat = np.load(path)
        u_spectral = manifold.SpectralEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat)
        i_spectral = manifold.SpectralEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat.T)
        return u_spectral, i_spectral
    elif embedding == 'lle':
        mat = np.load(path)
        u_lle = manifold.LocallyLinearEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat)
        i_lle = manifold.LocallyLinearEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat.T)
        return u_lle, i_lle
    elif embedding == 'fa':
        mat = np.load(path)
        u_fa = decomposition.FactorAnalysis(n_components=64, random_state=0).fit_transform(mat)
        i_fa = decomposition.FactorAnalysis(n_components=64, random_state=0).fit_transform(mat.T)
        return u_fa, i_fa
    elif embedding == 'nmf':
        mat = np.load(path)
        u_nmf = decomposition.NMF(n_components=64, random_state=0).fit_transform(mat)
        i_nmf = decomposition.NMF(n_components=64, random_state=0).fit_transform(mat.T)
        return u_nmf, i_nmf
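# Hypothetical usage sketch for build_embedding (not part of the original code).
# It assumes `np`, `manifold`, and `decomposition` are already imported at module
# level, and stands in a random non-negative interaction matrix for the real data.
import numpy as np

np.save('interactions.npy', np.random.rand(200, 100))      # assumed dummy user-item matrix
user_vecs, item_vecs = build_embedding('interactions.npy', embedding='nmf')
print(user_vecs.shape, item_vecs.shape)                    # (200, 64), (100, 64)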
def nmf_step(spectra, n_components, sparsity='components', beta=1e-5, **kwargs):
    """
    Performs the non-negative matrix factorization of the spectra into a
    partial spectra aka components matrix, and a mixing coefficients matrix.

    kwargs are passed to decomposition.NMF.

    Parameters
    ----------
    spectra : numpy.ndarray, n_spectra * n_features
        Clean input spectra
    n_components : int
        Number of significant components
    sparsity : {'data', 'components', None}, default 'components'
        Where to enforce sparsity in the model.
    beta : double, default 1e-5
        Degree of sparseness, if sparseness is not None. Larger values mean
        more sparseness.

    Returns
    -------
    components : numpy.ndarray, n_components * n_features
        Resulting components, aka H matrix
    mixing_matrix : numpy.ndarray, n_samples * n_components
        Resulting mixing coefficients, aka W matrix
    reconstruction_error : float
        Frobenius norm of (S - WH)
    """
    init = kwargs.pop('init', 'nndsvda')
    max_iter = kwargs.pop('max_iter', 1000)
    nls_max_iter = kwargs.pop('nls_max_iter', 10000)
    try:
        # Old versions of sklearn.decomposition.NMF accept the sparseness controls.
        nmf = decomposition.NMF(n_components=n_components, init=init,
                                sparseness=sparsity, beta=beta, tol=1e-5,
                                max_iter=max_iter, nls_max_iter=nls_max_iter,
                                **kwargs)
    except TypeError:
        # Newer versions of sklearn.decomposition.NMF no longer accept `beta`,
        # `sparseness`, or `nls_max_iter` as arguments.
        nmf = decomposition.NMF(n_components=n_components, init=init, tol=1e-5,
                                max_iter=max_iter, **kwargs)
    mix = nmf.fit_transform(spectra)
    components = nmf.components_
    print('Reconstruction error: {:.3e}'.format(nmf.reconstruction_err_))
    return components, mix, nmf.reconstruction_err_
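# Hypothetical usage sketch for nmf_step (not part of the original code).
# It assumes `np` and sklearn's `decomposition` are imported at module level and
# uses a random non-negative matrix in place of real spectra.
import numpy as np

rng = np.random.RandomState(0)
spectra = rng.rand(50, 300)                      # 50 spectra, 300 spectral channels
components, mix, err = nmf_step(spectra, n_components=4)
print(components.shape, mix.shape, err)          # (4, 300), (50, 4), reconstruction error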
def get_matrix_nmf(features, num_topics):
    """
    Performs non-negative matrix factorization.

    Parameters
    ----------
    features: pandas.DataFrame of features.
    num_topics: number of topics to return.

    Returns
    -------
    W: pandas.DataFrame (shape= # participants by num_topics)...
        relates the participants to the topics.
    H: pandas.DataFrame (shape= num_topics by # questions)...
        relates the topics to the questions.
    """
    mat = pd.DataFrame(features)
    nmf = decomposition.NMF(n_components=num_topics)
    nmf.fit(mat)
    W = nmf.transform(mat)
    H = nmf.components_
    W = pd.DataFrame(W)
    H = pd.DataFrame(H)
    W, H = (np.around(x, 2) for x in (W, H))
    # this shows the components
    print(W.head(30), '\n\n', H.head(num_topics))
    return (W, H)
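# Hypothetical usage sketch for get_matrix_nmf (not part of the original code).
# It assumes `pd`, `np`, and `decomposition` are imported, and fabricates a small
# non-negative participants-by-questions response matrix.
import numpy as np
import pandas as pd

responses = pd.DataFrame(np.random.randint(0, 5, size=(30, 12)))  # 30 participants, 12 questions
W, H = get_matrix_nmf(responses, num_topics=3)
print(W.shape, H.shape)                                            # (30, 3), (3, 12)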
def nmf_topic_modeling(self):
    corpus = []
    client = MongoDB()
    tweetsCollection = client.get_collection('illinois')
    findCorpusQuery = tweetsCollection.find()
    for fileid in findCorpusQuery:
        corpus.append(fileid['text'])
    cachedStopWords = set(STOPWORDS)
    cachedStopWords.update(
        ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make',
         'says', 'new', 'will', 'said', 'take', 'amp', 'one', 'go', 'know',
         'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48',
         'gtwkxla6bn', 'cia', 'elizasoul80', '5febckimwt', 'af', 'ex', 'ahca',
         'heioaednwr', 'realdonaldtrump', 'mitchellvii', 'f0mei9xcwv', 're',
         '63', 'went', 'still', 'thanks', 'vp', 'olson_micki', 'going', 'il',
         'potus', 'housegop'))
    vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
    dtm = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names()
    num_topics = 5
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)
    topic_words = []
    num_top_words = 5
    for topic in clf.components_:
        # get indexes with highest weights
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    for t in range(len(topic_words)):
        print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))
def topic_model_nmf(chunks, num_of_topics=20, topic_size=10, max_iterations=200):
    from sklearn.feature_extraction import DictVectorizer
    from sklearn import decomposition
    import numpy as np

    v = DictVectorizer(sparse=True)
    D = chunks
    X = v.fit_transform(D)
    num_terms = len(v.vocabulary_)
    terms = [""] * num_terms
    for term in v.vocabulary_.keys():
        terms[v.vocabulary_[term]] = term
    model = decomposition.NMF(init="nndsvd", n_components=num_of_topics,
                              max_iter=max_iterations)
    W = model.fit_transform(X)
    H = model.components_
    topics = []
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index, :])[::-1][0:topic_size]
        term_ranking = [terms[i] for i in top_indices]
        # print(term_ranking)
        topics += [", ".join(term_ranking)]
    return topics
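# Hypothetical usage sketch for topic_model_nmf (not part of the original code).
# Each chunk is assumed to be a dict of term counts, as expected by DictVectorizer;
# the corpus below is made up.
chunks = [
    {'cat': 3, 'dog': 1, 'pet': 2},
    {'stock': 2, 'market': 3, 'price': 1},
    {'dog': 2, 'pet': 1, 'vet': 1},
    {'market': 1, 'price': 2, 'trade': 2},
]
topics = topic_model_nmf(chunks, num_of_topics=2, topic_size=3)
for t in topics:
    print(t)    # e.g. "market, price, trade" and "cat, pet, dog"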
def build_stable_topic_model(self):
    matrices = []
    for i in range(Constants.TOPIC_MODEL_PASSES):
        topic_term_matrix = self.build_single_topic_model().transpose()
        matrices.append(topic_term_matrix)
    stack_matrix = numpy.hstack(matrices)
    stack_matrix = normalize(stack_matrix, axis=0)
    stack_matrix = stack_matrix.transpose()
    print("Stack matrix M of size %s" % str(stack_matrix.shape))

    self.topic_model = decomposition.NMF(
        init="nndsvd", n_components=self.num_topics,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
        alpha=Constants.NMF_REGULARIZATION,
        l1_ratio=Constants.NMF_REGULARIZATION_RATIO)
    self.document_topic_matrix = \
        self.topic_model.fit_transform(stack_matrix)
    self.topic_term_matrix = self.topic_model.components_

    # Normalize each topic's term distribution so that every row sums to one.
    row_sums = self.topic_term_matrix.sum(axis=1)
    self.topic_term_matrix /= row_sums[:, numpy.newaxis]

    print("Generated factor W of size %s and factor H of size %s" % (
        str(self.document_topic_matrix.shape),
        str(self.topic_term_matrix.shape)))
def NMF(arquivo, nt):
    vector = text.CountVectorizer(input='arquivo', stop_words='english',
                                  min_df=1, strip_accents='unicode')
    arqArray = vector.fit_transform(arquivo).toarray()
    vocabulario = np.array(vector.get_feature_names())
    ntw = arqArray.shape[0]
    # NMF
    num_topics = nt
    num_top_words = ntw
    # Decomposition
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(arqArray)
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocabulario[i] for i in word_idx])
    with np.errstate(invalid='ignore'):
        doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)
    # print(doctopic)
    FechaDocumento(arquivo)
    return doctopic
def NMF(self, N_component):
    # Non-Negative Matrix Factorization
    NMF_calculator = skdecomp.NMF(N_component, max_iter=1500, tol=0.0001)
    self.NMFed_data = NMF_calculator.fit(self.vector)
    all_NMFs = self.NMFed_data.components_
    pp.save_variable(all_NMFs, save_folder + r'\\NMFed_Data.pkl')
    print('NMF calculation done, generating graphs')
    self.cell_graph_plot('NMF', all_NMFs)
def fast_ai_nlp_nmf(content):
    nltk.download('stopwords')
    stop_words = stopwords.words('russian')

    # class LemmaTokenizer(object):
    #     def __init__(self):
    #         self.wnl = stem.WordNetLemmatizer()
    #
    #     def __call__(self, doc):
    #         return [self.wnl.lemmatize(word) for word in word_tokenize(doc)]

    vectorizer = CountVectorizer(stop_words=stop_words)  # , tokenizer=LemmaTokenizer()
    vectors = vectorizer.fit_transform(content).todense()  # (documents, vocab)
    vocab = np.array(vectorizer.get_feature_names())
    m, n = vectors.shape
    d = 5  # num topics
    clf = decomposition.NMF(n_components=d, random_state=1)
    W1 = clf.fit_transform(vectors)
    H1 = clf.components_
    print(show_topics(H1, vocab))

    vectorizer_tfidf = TfidfVectorizer(stop_words=stop_words)
    vectors_tfidf = vectorizer_tfidf.fit_transform(content)  # (documents, vocab)
    W1 = clf.fit_transform(vectors_tfidf)
    H1 = clf.components_
    print(show_topics(H1, vocab))
def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
    if k == -1:
        k = self.num_cluster
    X = self.pre_processing()
    fixed_W = pd.get_dummies(self.labels)
    fixed_W_t = fixed_W.T
    # Interpret W as H (transpose): the solver can only fix H while optimizing W,
    # so we simply switch those matrices (invert their roles).
    learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
        X.astype(np.float), n_components=k, init='custom', random_state=0,
        update_H=False, H=fixed_W_t.astype(np.float), alpha=alpha, l1_ratio=l1,
        max_iter=max_iter, shuffle=True, solver='cd', tol=rel_err, verbose=0)

    init_W = fixed_W_t_same.T
    init_H = learned_H_t.T
    nmf = decomp.NMF(alpha=alpha, init='custom', l1_ratio=l1, max_iter=max_iter,
                     n_components=k, random_state=0, shuffle=True, solver='cd',
                     tol=rel_err, verbose=0)
    W = nmf.fit_transform(X.T, W=init_W, H=init_H)
    H = nmf.components_
    self.cluster_labels = np.argmax(W, axis=1)

    if np.any(np.isnan(H)):
        raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
            alpha, k, l1, X.shape[0], X.shape[1]))
    if np.any(np.isnan(W)):
        raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
            alpha, k, l1, X.shape[0], X.shape[1]))

    # self.print_reconstruction_error(X, W, H)
    self.dictionary = H.T
    self.data_matrix = W.T
def __get_implementation(self, method: Method, n_components):
    # decomposition
    if method == self.Method.PCA:
        return self.Implementation(
            'pca',
            decomposition.PCA(n_components=n_components,
                              svd_solver='randomized', whiten=True))
    elif method == self.Method.ICA:
        return self.Implementation(
            'ica',
            decomposition.FastICA(n_components=n_components, whiten=True,
                                  max_iter=1000))
    elif method == self.Method.FA:
        return self.Implementation(
            'fa', decomposition.FactorAnalysis(n_components=n_components))
    elif method == self.Method.TSVD:
        return self.Implementation(
            'tsvd', decomposition.TruncatedSVD(n_components=n_components))
    elif method == self.Method.NMF:
        return self.Implementation(
            'nmf', decomposition.NMF(n_components=n_components))
    # clustering
    elif method == self.Method.KMEANS:
        return self.Implementation(
            'kmeans',
            cluster.MiniBatchKMeans(n_clusters=n_components, tol=1e-3))
    else:
        raise Exception('Error creating estimator. Invalid Type specified.')
def apply(self, k=-1, mix=0.0, reject_ratio=0., alpha=1.0, l1=0.75,
          max_iter=100, rel_err=1e-3, calc_transferability=False):
    if k == -1:
        k = self.num_cluster
    mixed_data, new_trg_data, trg_data = self.get_mixed_data(
        mix=mix, reject_ratio=reject_ratio, max_iter=max_iter,
        rel_err=rel_err, calc_transferability=calc_transferability)
    nmf = decomp.NMF(alpha=alpha, init='nndsvdar', l1_ratio=l1,
                     max_iter=max_iter, n_components=k, random_state=0,
                     shuffle=True, solver='cd', tol=1e-6, verbose=0)
    W = nmf.fit_transform(mixed_data)
    H = nmf.components_
    self.dictionary = W
    self.data_matrix = H
    self.cluster_labels = np.argmax(nmf.components_, axis=0)
    self.mixed_data = mixed_data
def create_model(self, k):
    A, terms = self.vectorize()
    model = decomposition.NMF(init="nndsvd", n_components=k)
    # apply the model and extract the two factor matrices
    W = model.fit_transform(A)
    H = model.components_
    return W, H
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words): """ 找出唐诗语料库中的所有topics :param ftopics: :param fwords: :param ftopics_words: :param poem_words: :param n_topic: :param n_topic_words: :return: """ count_matrix = count_vect.fit_transform(poem_words) tfidf = TfidfTransformer().fit_transform(count_matrix) nmf = decomposition.NMF(n_components=n_topic).fit(tfidf) feature_names = count_vect.get_feature_names() fw = codecs.open(ftopics, 'w', 'utf-8') for topic in nmf.components_: fw.write(' '.join([ feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1] ]) + '\n') fw.close() print('Write topics done.') fw = codecs.open(fwords, 'wb') pickle.dump(feature_names, fw) fw.close() print('Write words done.') fw = codecs.open(ftopics_words, 'wb') pickle.dump(nmf.components_, fw) fw.close() print('Write topic_words done.')
def nmf_topic_modeling(self):
    corpus = []
    client = MongoDB()
    tweetsCollection = client.get_collection('florida')
    findCorpusQuery = tweetsCollection.find()
    for fileid in findCorpusQuery:
        corpus.append(fileid['text'])
    cachedStopWords = set(STOPWORDS)
    cachedStopWords.update(
        ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make',
         'says', 'new', 'will', 'said', 'take', 'amp', 'one', 'go', 'know',
         'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48',
         'gtwkxla6bn', 'cia', 'elizasoul80', '5febckimwt', 'af', 'ex',
         'realdonaldtrump', 'judgejeanine', 'epucokw6b3', 'vqpgqgfwye', 'mr',
         '12', '61m', 'putin', 're', 'donthecon', 'let', 'ass', 'rep', 'still',
         'much', 'steph93065', 'dems'))
    vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
    dtm = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names()
    num_topics = 5
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)
    topic_words = []
    num_top_words = 5
    for topic in clf.components_:
        # get indexes with highest weights
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    for t in range(len(topic_words)):
        print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))
def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=4000, rel_err=1e-3):
    if k == -1:
        k = self.num_cluster
    X = self.pre_processing()

    nmf = decomp.NMF(alpha=alpha, init='nndsvdar', l1_ratio=l1, max_iter=1000,
                     n_components=k, random_state=0, shuffle=True, solver='cd',
                     tol=0.00001, verbose=0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    self.cluster_labels = np.argmax(nmf.components_, axis=0)

    if np.any(np.isnan(H)):
        raise Exception(
            'H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))
    if np.any(np.isnan(W)):
        raise Exception(
            'W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))

    self.print_reconstruction_error(X, W, H)
    self.dictionary = W
    self.data_matrix = H
def code_sklearn(arr, method=None, init=None, c=None, params=None, transform=True):
    """Non-negative matrix factorization using scikits-learn.

    - arr(``path``) input array (loci x scaled features) see: ``scale_features``.
    - c(``int``) number of expected histone codes (factorization rank).
    - init(``str``) matrix factorization initialization method.
    """
    chk_exit(*inp_file(path(arr)))
    with open(arr) as fh:
        bam_names = fh.readline().strip().split("\t")
        bam_scaled = np.loadtxt(fh, delimiter="\t")
    kwargs = parse_params(params, {"max_iter": 1000})
    nmf = decomposition.NMF(n_components=c, init=init,
                            sparseness='components', **kwargs)
    nmf.fit(bam_scaled)
    ofn_epi = arr.replace(
        ".arr", "_%s-c#%s-i#%s-p#%s.epi" % ("pgnmf", c, init, (params or "")))
    ofn_arr = arr.replace(
        ".arr", "_%s-c#%s-i#%s-p#%s.arr" % ("pgnmf", c, init, (params or "")))
    write_codes(ofn_epi, nmf.components_, bam_names)
    if transform:
        bam_transformed = nmf.transform(bam_scaled)
        write_values(ofn_arr, bam_transformed, c)
    return ofn_epi, ofn_arr
def nmf(self, n_components, transform_df=False, transform_test_df=False,
        seed=1, init='nndsvdar', tol=5e-3):
    self.nmf_fit = decomposition.NMF(n_components=n_components, init=init,
                                     tol=tol, random_state=seed)
    self.nmf_df = self.nmf_fit.fit_transform(self.df)
    colnames = ['nmf_{}'.format(x) for x in range(n_components)]
    self.nmf_df = pd.DataFrame(self.nmf_df, index=self.df.index,
                               columns=colnames)
    self.nmf_weights = pd.DataFrame(self.nmf_fit.components_,
                                    columns=self.df.columns, index=colnames)
    if transform_df:
        out_df = self.nmf_fit.transform(self.df)
        return out_df
    if transform_test_df:
        self.nmf_test_df = self.nmf_fit.transform(self.test_df)
def extractTopic(self):
    """
    * Tokenizes all the words
    * Eliminates any word with less than two letters
    * Forms term frequency-inverse document frequency for each word
    * Generates the document-term matrix
    * Classifies topics based on the document-term matrix and the term
      frequency-inverse document frequency weights
    * Gathers the first "N" words from each topic
    """
    self.vectorizer = text.CountVectorizer(input='filename',
                                           stop_words='english', min_df=2)
    for x in range(len(self.fileNames)):
        temp = self.fileNames[x]
        self.fileNames[x] = self.baseDirectory + temp
    self.dtm = self.vectorizer.fit_transform(self.fileNames).toarray()
    self.vocab = np.array(self.vectorizer.get_feature_names())
    self.clf = decomposition.NMF(n_components=self.num_topics, random_state=1)
    self.doctopic = self.clf.fit_transform(self.dtm)
    for topic in self.clf.components_:
        word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
        self.topic_words.append([self.vocab[i] for i in word_idx])
    for t in range(len(self.topic_words)):
        print("Topic {}: {}".format(t, ' '.join(self.topic_words[t][:15])))
def transform(self, df, y=None):
    pca = decomposition.PCA(n_components=self.n_components,
                            random_state=self.random_state)
    pca_train = pca.fit_transform(df)

    ica = decomposition.FastICA(n_components=self.n_components,
                                random_state=self.random_state)
    ica_train = ica.fit_transform(df)

    tsvd = decomposition.TruncatedSVD(n_components=self.n_components,
                                      random_state=self.random_state)
    tsvd_train = tsvd.fit_transform(df)

    nmf = decomposition.NMF(n_components=self.n_components,
                            random_state=self.random_state)
    nmf_train = nmf.fit_transform(df)

    for i in range(1, self.n_components + 1):
        df['pca_' + str(i)] = pca_train[:, i - 1]
        df['ica_' + str(i)] = ica_train[:, i - 1]
        df['tsvd_' + str(i)] = tsvd_train[:, i - 1]
        df['nmf_' + str(i)] = nmf_train[:, i - 1]
    return df
def find_NMF_topics(self):
    """
    Fit an NMF topic model on the document collection and store a list of
    top words for each topic in self.topic_words.
    """
    vectorizer = text.CountVectorizer(input='filename', stop_words='english',
                                      min_df=self.min_df, max_df=self.max_df)
    dtm = vectorizer.fit_transform(self.all_documents).toarray()
    vocab = np.array(vectorizer.get_feature_names())
    clf = decomposition.NMF(n_components=self.num_topics, random_state=1)
    # doctopic shows how strongly each document is related to each topic
    self.doctopic = clf.fit_transform(dtm)
    self.topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
        self.topic_words.append([vocab[i] for i in word_idx])
    return
def extract_topics_nmf(data_samples, preprocessor, n_features, n_topics,
                       n_top_words, n_gram_range=(1, 1), more_stopwords=None):
    nmf = decomposer.NMF(n_components=n_topics, random_state=1, alpha=.1,
                         l1_ratio=.5)
    topics_words = _extract_topics_decomposer(data_samples, preprocessor, nmf,
                                              n_features, n_topics,
                                              n_top_words, n_gram_range,
                                              more_stopwords)
    return topics_words
def get_cluster_membership(self):
    """
    Determine the cluster number that each sample is associated with.
    """
    model = skld.NMF(n_components=self._num_clusters, init='random', beta=.3,
                     eta=.5, max_iter=5000, nls_max_iter=10000)
    w = model.fit_transform(self._matrix)
    h = model.components_

    # convert the 'H' matrix, which represents weights for our data matrix W,
    # into an array representing cluster membership. Index of biggest value in
    # each col of matrix H is the cluster.
    clusters = []
    model_width = len(h[0])
    for col_idx in range(model_width):
        # convert column into an array
        col_vals = h[:, col_idx]
        # determine biggest row index and its value from the array
        (row_idx, max_val) = max(enumerate(col_vals), key=lambda x: x[1])
        clusters.append(row_idx)

    # clusters array, w, h
    return (clusters, w, h)
def init_params(X, S, Kpart, nmf_noise=1e-2):
    F, N, J = S.shape
    I = X.shape[2]
    W = []
    H = []
    for j in range(J):
        model = skd.NMF(n_components=Kpart[j], init='random', random_state=0)
        s_2 = squared_module(S[:, :, j])
        W.append(model.fit_transform(s_2) +
                 np.random.uniform(0, nmf_noise, (F, Kpart[j])))
        H.append(model.components_ +
                 np.random.uniform(0, nmf_noise, (Kpart[j], N)))
    W = np.concatenate(tuple(W), axis=1).astype(complex)
    H = np.concatenate(tuple(H), axis=0).astype(complex)

    A = np.zeros((F, I, J), dtype=complex)
    sigma_b = np.zeros(F, dtype=complex)
    Rxx = np.zeros((F, I, I), dtype=complex)
    Rxs = np.zeros((F, I, J), dtype=complex)
    Rss = np.zeros((F, J, J), dtype=complex)
    for f in range(F):
        Rxx[f] = r_hat(X[f])
        Rxs[f] = r_hat(X[f], S[f])
        Rss[f] = r_hat(S[f])
        A[f] = Rxs[f].dot(np.linalg.inv(Rss[f]))
        sigma_b[f] = np.mean(
            np.real(
                np.diagonal(Rxx[f] - A[f].dot(np.matrix(Rxs[f]).getH()) -
                            Rxs[f].dot(np.matrix(A[f]).getH()) +
                            A[f].dot(Rss[f].dot(np.matrix(A[f]).getH())))))
    return A, W, H, sigma_b, Rxx, Rxs, Rss
def nmf(bag_of_words, vocab, topics, top_words):
    dtm = bag_of_words.toarray()
    from sklearn import decomposition
    num_topics = topics
    num_top_words = top_words
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)

    # print words associated with topics
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    # make visualization easier
    # doctopic = doctopic/np.sum(doctopic, axis=1, keepdims=True)
    # print "doctopic matrix"
    # print doctopic[0:5]

    print("show top 15 words")
    for t in range(len(topic_words)):
        print("topic {}: {}".format(t, ' '.join(topic_words[t][:15])))
    return doctopic
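# Hypothetical usage sketch for nmf (not part of the original code). It assumes
# `np` is imported at module level and builds the bag-of-words matrix and
# vocabulary with a CountVectorizer over a made-up corpus.
from sklearn.feature_extraction.text import CountVectorizer

docs = ['cats and dogs are pets', 'stocks and bonds are investments',
        'dogs chase cats', 'bond prices follow stocks']
vectorizer = CountVectorizer(stop_words='english')
bow = vectorizer.fit_transform(docs)
vocab = np.array(vectorizer.get_feature_names())   # get_feature_names_out() on newer sklearn
doctopic = nmf(bow, vocab, topics=2, top_words=3)
print(doctopic.shape)                               # (4, 2): one topic weight vector per document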
def generate_topics_NMF(pickle_file, state, no_of_topics):
    corpus = []
    tweet_list = pickle.load(open(pickle_file, 'rb'))
    tweet_corpus = []
    # Removing stopwords
    for tweet in tweet_list:
        cln_tweet = ''
        for wrd in tweet.split():
            if wrd not in stopwords:
                cln_tweet += ' ' + wrd
        tweet_corpus.append(cln_tweet)

    # NMF
    vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
    dtm = vectorizer.fit_transform(tweet_corpus)
    vocab = vectorizer.get_feature_names()
    num_topics = no_of_topics
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)
    topic_words = []
    num_top_words = 6
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    # Saving data to text file
    with open('op4_topics_{}_{}.txt'.format(pickle_file.split('.')[0], state),
              'w') as outfile:
        outfile.write('{}\nPossible topics in the given data are\n{}\n'.format(
            '==' * 50, '==' * 50))
        for t in range(len(topic_words)):
            outfile.write('Topic {}: {}\n'.format(
                t + 1, ' '.join(topic_words[t][:15])))
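# Hypothetical usage sketch for generate_topics_NMF (not part of the original
# code). It assumes the module-level `stopwords` name is a collection of words
# and that pickle, np, TfidfVectorizer and decomposition are imported; the
# pickled tweet list below is fabricated.
import pickle

tweets = ['stocks market rally', 'stocks market fall',
          'dog park walk', 'dog park play']
with open('tweets.pkl', 'wb') as fh:
    pickle.dump(tweets, fh)
generate_topics_NMF('tweets.pkl', state='nm', no_of_topics=2)
# writes op4_topics_tweets_nm.txt with the top words of each topic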
def nmf_topic_modeling(self):
    corpus = []
    client = MongoDB()
    tweetsCollection = client.get_collection('newmexico')
    findCorpusQuery = tweetsCollection.find()
    for fileid in findCorpusQuery:
        corpus.append(fileid['text'])
    cachedStopWords = set(STOPWORDS)
    cachedStopWords.update(
        ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make',
         'says', 'new', 'will', 'said', 'take', 'amp', 'one', 'go', 'know',
         'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48',
         'gtwkxla6bn', 'cia', 'elizasoul80', '5febckimwt', 'af', 'ex', 'putin',
         'realdonaldtrump', 'msm', 'dworkinreport', 'knightstrumplar', 'vp',
         'ur', 'us', 'join', 'trumppence', 'thanx', 'pvtjokerus', 're', 'see',
         'mayflowerscandal', 'agree', 'maga', 'wh', 'yeah'))
    vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
    dtm = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names()
    num_topics = 5
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)
    topic_words = []
    num_top_words = 5
    for topic in clf.components_:
        # get indexes with highest weights
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    for t in range(len(topic_words)):
        print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))
def topic_extractions(df):
    vectorizer = CountVectorizer(stop_words='english')
    X_train_dtm = vectorizer.fit_transform(df['article'])
    vocab = np.array(vectorizer.get_feature_names())

    # Generating Decomposition Model to extract topics
    clf = decomposition.NMF(n_components=NUM_TOPICS, random_state=1)
    doctopic = clf.fit_transform(X_train_dtm)

    # Generating dominant topics for each words
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:NUM_TOP_WORDS]
        topic_words.append([vocab[i] for i in word_idx])

    # Making DataFrame that gets the doctopic (values of topics for each text)
    dftopic = pd.DataFrame(doctopic, columns=topic_words)
    dftopicinv = dftopic.T

    # Getting the dominant topic
    topic_series = []
    for i in np.arange(dftopic.shape[0]):
        topic_series.append(dftopicinv[i].idxmax())
    df['toptopic'] = topic_series
    return df
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic,
                 n_topic_words):
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()

    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([feature_names[i]
                           for i in topic.argsort()[:-n_topic_words - 1:-1]]) + '\n')
    fw.close()
    print('Write topics done.')

    fw = codecs.open(fwords, 'wb')
    flen = len(feature_names)
    # for i in range(flen):
    #     fw.write(feature_names[i])
    #     fw.write('\n')
    pickle.dump(feature_names, fw)
    # feature_names = feature_names.encode('gbk')
    fw.close()
    print('Write words done.')

    fw = codecs.open(ftopics_words, 'wb')
    nmf.components_.tolist()
    nmflen = len(nmf.components_)
    # for i in range(nmflen):
    #     print(nmf.components_[i])
    #     print(type(nmf.components_[i]))
    #     fw.write(nmf.components_[i])
    #     fw.write('\n')
    pickle.dump(nmf.components_, fw)
    print(nmf.components_)
    fw.close()
    print('Write topic_words done.')