def applyNMF(self, number_of_clusters, country_specific_tweets): train, feature_names = self.extractFeatures(country_specific_tweets,False) name = "nmf" # Fit the NMF model if self.results: print("Fitting the NMF model", end=" - ") t0 = time() nmf = NMF(n_components=number_of_clusters, random_state=1, alpha=.1, l1_ratio=.5).fit(train) if self.results: print("done in %0.3fs." % (time() - t0)) if self.results: print("\nNMF:") parameters = nmf.get_params() if self.results: print("Parameter: " + str(parameters)) topics = nmf.components_ doc_topic = nmf.transform(train) top10, labels = self.printTopicCluster(topics, doc_topic, feature_names) labels = numpy.asarray(labels) if self.results: print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels))) return name, parameters, top10, labels
def tfidf_nmf(release_texts, n_components=10, max_features=None): ''' Creates and fits tfidf and NMF models. INPUT: - n_components: number of latent features for the NMF model to find - max_features: max number of features (vocabulary size) for the tfidf model to consider OUTPUT: - tfidf_vectorizer: tfidf model object - tfidf_sparse:tfidf sparse matrix - nmf: NMF model object - W: Feature matrix output from NMF factorization into W and H matrices ''' # tfidf model custom_stop_words = make_stop_words() tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words, max_features=max_features) tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts) # normalize row-wise so each row sums to one tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1') # nmf model nmf = NMF(n_components=n_components, random_state=1) nmf.fit(tfidf_sparse) W = nmf.transform(tfidf_sparse) return tfidf_vectorizer, tfidf_sparse, nmf, W
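# A minimal usage sketch for tfidf_nmf above, assuming `release_texts` is a
# reasonably large list of raw document strings and that make_stop_words() is
# importable; the top terms of each topic are read off the H matrix
# (nmf.components_) with the vocabulary from the fitted vectorizer.
tfidf_vectorizer, tfidf_sparse, nmf, W = tfidf_nmf(release_texts, n_components=10)
terms = tfidf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
    top_terms = [terms[i] for i in topic.argsort()[:-11:-1]]
    print("Topic %d: %s" % (topic_idx, ", ".join(top_terms)))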
def hog2hognmf(hog_feature): """Transform HOG feature into HOG-NMF feature. Parameters ---------- hog_feature: np.ndarray HOG feature. """ mat = np.zeros((500, 8), dtype=np.float32) NMFmodel = NMF(n_components=2, init="random", random_state=0) # Transform 3780 into 500 * 8 for i in range(7): mat[:, i] = hog_feature[i * 500 : (i + 1) * 500] mat[:280, 7] = hog_feature[3500:] W = NMFmodel.fit_transform(mat) H = NMFmodel.components_ hognmf_feature = np.array([], dtype=np.float32) for i in range(8): _sum = np.sum(H[:, i]) if _sum == 0: H[:, i] *= 0.0 else: H[:, i] /= _sum hognmf_feature = np.append(hognmf_feature, H[:, i]) for i in range(500): _sum = np.sum(W[i, :]) if _sum == 0: W[i, :] *= 0.0 else: W[i, :] /= _sum hognmf_feature = np.append(hognmf_feature, W[i, :]) return hognmf_feature
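# A quick sanity-check sketch for hog2hognmf, using a random non-negative
# stand-in for a standard 3780-dimensional HOG descriptor (a 64x128 detection
# window); the output concatenates the column-normalised H (2x8) and the
# row-normalised W (500x2) factors, i.e. 16 + 1000 = 1016 values.
import numpy as np

fake_hog = np.abs(np.random.randn(3780)).astype(np.float32)   # stand-in, not real HOG data
feat = hog2hognmf(fake_hog)
print(feat.shape)   # (1016,)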
def nmf(self, **kwargs): """Perform dimensionality reduction using NMF.""" nmf = NMF(**kwargs) reduced_matrix = nmf.fit_transform(self.matrix) # TODO: it is incorrect to pass self.column_labels! There are not column labels. return Space(reduced_matrix, self.row_labels, self.column_labels)
def nmf_df(sym, k, coll): data = [ item for item in coll.find({'text': { '$in' :[re.compile(sym)] }}) ] sents = [ sentence['text'] for sentence in data ] dates = [ str(text['created_at']) for text in data ] d = np.array(dates).T d = d.reshape(len(dates), 1) vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) X = vectorizer.fit_transform(sents) #features = vectorizer.get_feature_names() model = NMF(n_components=k, init='random', random_state=0) latent_features = model.fit_transform(X) # lat0 = list(latent_features[:,0]) # lat1 = list(latent_features[:,1]) # lat2 = list(latent_features[:,2]) # lat3 = list(latent_features[:,3]) df = pd.DataFrame(latent_features) #np.concatenate((d, latent_features), axis=1) df.columns = [ 'lat'+ str(n) for n in xrange(len(df.columns)) ] df['time_stamp'] = d #print df.head() df['date'] = pd.to_datetime(df['time_stamp']).apply(pd.datetools.normalize_date) df.pop('time_stamp') #print df.head() grouped_data = df.groupby(['date']).mean() grouped_data['sym'] = sym return grouped_data
def test_nmf_fit_close(solver): rng = np.random.mtrand.RandomState(42) # Test that the fit is not too far away pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600) X = np.abs(rng.randn(6, 5)) assert_less(pnmf.fit(X).reconstruction_err_, 0.1)
def find_aspects(sentences, city, n_top_words=15): ''' INPUT sentences, city(str, lower case) OUTPUT aspects dictionary ''' vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english') document_term_mat = vectorizer.fit_transform(sentences) feature_words = vectorizer.get_feature_names() nmf = NMF(n_components=n_topics) W_sklearn = nmf.fit_transform(document_term_mat) H_sklearn = nmf.components_ important_words = [] for topic in H_sklearn: for i in topic.argsort()[:-n_top_words - 1:-1]: important_words.append(feature_words[i]) important_words = set(important_words) important_words = list(important_words) nouns = [] for i in sentences: nouns.extend(list(TextBlob(i).noun_phrases)) noun_list = list(set(filter(lambda x: (len(x.split(' '))>1)&('...' not in x.split(' ')), nouns))) aspects_dict = defaultdict(list) for i in important_words: if i not in [city, city.lower(),'okay','ok','thing','things','time','times','greasy','awful'] and TextBlob(i).tags[0][1] in ['NN', 'NNS']: for j in noun_list: if i in j.split(' '): aspects_dict[i].append(j) for i in aspects_dict: aspects_dict[i] = list(set(aspects_dict[i])) return aspects_dict
def extract_reconstruction_errors(comps, music_stft, window_length, hop): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) start = 0 errors = [] while (start + window_length < music_stft.shape[1]): block = music_stft[:, start:start+window_length] H = np.random.rand(K, block.shape[1]) W[:, 0:K] = comps params = {'W': W, 'H': H, 'update_W': False} comps_block = transformer.fit_transform(np.abs(block), **params) acts_block = transformer.components_ #reconstruct the signal block_reconstruction = comps_block.dot(acts_block) errors.append(transformer.reconstruction_err_) start = start + hop return errors
def produceEncoding( trainX, nComponents ): '''Produces an NMF encoding from the training data matrix''' model = NMF( n_components=nComponents, solver='cd', \ tol=1e-4, max_iter=200, alpha=0.0 ) model.fit( trainX ) return model
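# A minimal usage sketch for produceEncoding, with toy non-negative data
# standing in for real training/test matrices; transform() projects new rows
# onto the learned components, and W.dot(components_) reconstructs the input.
import numpy as np

trainX = np.abs(np.random.randn(100, 20))      # toy stand-in data
testX = np.abs(np.random.randn(10, 20))
model = produceEncoding(trainX, nComponents=5)
W_train = model.transform(trainX)
W_test = model.transform(testX)
reconstruction = np.dot(W_train, model.components_)
print("training reconstruction error:", np.linalg.norm(trainX - reconstruction))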
def fit_nmf(tfidf): '''takes in a tfidf sparse vector and finds the top topics''' nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5) nmf.fit(tfidf) tfidf_feature_names = tfidf_vectorizer.get_feature_names() nmf_topic_dict = print_top_words(nmf, tfidf_feature_names, n_top_words) return nmf, nmf_topic_dict
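# fit_nmf above depends on module-level names (tfidf_vectorizer, n_topics,
# n_top_words) and on a print_top_words helper that is not shown here. A
# minimal sketch of such a helper, assuming it should print the top terms per
# topic and return them as a dict (which is what fit_nmf stores):
def print_top_words(model, feature_names, n_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topic_dict[topic_idx] = top_terms
        print("Topic #%d: %s" % (topic_idx, " ".join(top_terms)))
    return topic_dict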
def extract_reconstruction_error_beats(comps, music_stft, beats): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) start = 0 errors = [] lookback = 0 weight = np.array([1 for i in range(2, music_stft.shape[0] + 2)]) weight = weight/np.max(weight) for i in range(lookback+1, len(beats)): block = music_stft[:, beats[i-(lookback+1)]:beats[i]] H = np.random.rand(K, block.shape[1]) W[:, 0:K] = comps params = {'W': W, 'H': H, 'update_W': False} comps_block = transformer.fit_transform(np.abs(block), **params) acts_block = transformer.components_ #reconstruct the signal block_reconstruction = comps_block.dot(acts_block) block_reconstruction = block_reconstruction.T*weight block = block.T*weight distance = norm(block_reconstruction - np.abs(block)) #errors.append(transformer.reconstruction_err_) errors.append(distance) return errors
def extract_template(comps, music_stft): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) H = np.random.rand(K, music_stft.shape[1]) #set W to be the template components you want to extract W[:, 0:K] = comps #don't let W get updated in the non-negative matrix factorization params = {'W': W, 'H': H, 'update_W': False} comps_music = transformer.fit_transform(np.abs(music_stft), **params) acts_music = transformer.components_ #reconstruct the signal music_reconstruction = comps_music.dot(acts_music) #mask the input signal music_stft_max = np.maximum(music_reconstruction, np.abs(music_stft)) mask = np.divide(music_reconstruction, music_stft_max) mask = np.nan_to_num(mask) #binary mask mask = np.round(mask) #template - extracted template, residual - everything that's leftover. template = np.multiply(music_stft, mask) residual = np.multiply(music_stft, 1 - mask) return template, residual
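# A minimal usage sketch for extract_template, assuming librosa is available,
# "mix.wav" is a placeholder file name, and `comps` holds template components
# learned elsewhere (e.g. with find_template further down). Note the
# 'update_W' keyword implies a patched NMF: stock scikit-learn's
# NMF.fit_transform only accepts W and H.
import librosa

y, sr = librosa.load("mix.wav", sr=None)        # placeholder input file
music_stft = librosa.stft(y)
template_stft, residual_stft = extract_template(comps, music_stft)
template_audio = librosa.istft(template_stft)
residual_audio = librosa.istft(residual_stft)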
def doNMF(datan,n_components=4): # from Mitsu #alternatively PCA ... might me faster nmf=NMF(n_components=n_components,init='nndsvd') data_decomp_all=nmf.fit_transform(datan) data_components_all=nmf.components_ return data_decomp_all,data_components_all
def _make_test_matrix(self, matrix, test_decomp='svd'): ''' Input: a matrix Output: a recomposed estimated ratings matrix Decomposes the input matrix according to the decomposition type and then builds an estimated ratings matrix ''' if test_decomp == 'svd': _, s1, V = svd(matrix) how = self.s_option how = self.test_how #print "s1", s1 #print "how", how s = self._get_s(s1, how) #print s #print V #print self.matrix_1.U return np.dot(self.matrix_1.U, np.dot(s, V)) elif test_decomp == 'nmf': model = NMF() # fit_transform returns W; components_ is H (the two names were swapped in the original) W = model.fit_transform(matrix) print W H = model.components_ return np.dot(self.matrix_1.H, H) else: pass
def test_nmf_transform(): # Test that NMF.transform returns close values A = np.abs(random_state.randn(6, 5)) m = NMF(n_components=4, init="nndsvd", random_state=0) ft = m.fit_transform(A) t = m.transform(A) assert_array_almost_equal(ft, t, decimal=2)
def __Factorize_NMF(self, K): # fit_transform both fits the model and returns the user factors, so a separate fit() call is redundant model = NMF(n_components=K, max_iter=self._iteration) user_fmat = model.fit_transform(self._mat) item_fmat = model.components_.T return user_fmat, item_fmat
def do_NMF(sparse_matrix): t0 = time.time() print("* Performing NMF on sparse matrix ... ") nmf = NMF(n_components=3) coordinates = nmf.fit_transform(sparse_matrix) print("done in %0.3fs." % (time.time() - t0)) return(coordinates)
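# A quick usage sketch for do_NMF on a random non-negative sparse matrix
# (scipy's sparse.random draws uniform values in [0, 1) by default); the time
# module is assumed imported as in the function above.
from scipy import sparse

demo_matrix = sparse.random(100, 50, density=0.1, format='csr', random_state=0)
coordinates = do_NMF(demo_matrix)
print(coordinates.shape)   # (100, 3)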
def nnMatrixFactorisation(data, labels, new_dimension): print "non negative matrix factorisation..." start = time.time() mf = NMF(n_components=new_dimension) reduced = mf.fit_transform(data) end = time.time() return (reduced, end-start)
def find_template(music_stft, sr, min_t, n_components, start, end): """ from Prem :param music_stft: :param sr: :param min_t: :param n_components: :param start: :param end: :return: """ template_stft = music_stft[:, start:end] layer = librosa.istft(template_stft) layer_rms = np.sqrt(np.mean(layer * layer)) comps = [] acts = [] errors = [] for T in range(min_t, n_components): transformer = NMF(n_components=T) comps.append(transformer.fit_transform(np.abs(template_stft))) acts.append(transformer.components_) errors.append(transformer.reconstruction_err_) # knee = np.diff(errors, 2) # knee = knee.argmax() + 2 knee = 0 # print 'Using %d components' % (knee + min_t) return comps[knee], acts[knee]
def test_nmf_fit_nn_output(): # Test that the decomposition does not contain negative values A = np.c_[5 * np.ones(5) - np.arange(1, 6), 5 * np.ones(5) + np.arange(1, 6)] for init in (None, "nndsvd", "nndsvda", "nndsvdar"): model = NMF(n_components=2, init=init, random_state=0) transf = model.fit_transform(A) assert_false((model.components_ < 0).any() or (transf < 0).any())
def nmf_model2(n_topics,document_term_mat): # print("\n\n---------\n decomposition") nmf = NMF(n_components=n_topics, l1_ratio=0.0) W_sklearn = nmf.fit_transform(document_term_mat) H_sklearn = nmf.components_ # describe_nmf_results(document_term_mat, W_sklearn, H_sklearn) return W_sklearn, H_sklearn
def infer_topics(self, num_topics=10): self.nb_topics = num_topics nmf = NMF(n_components=num_topics) topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space) self.topic_word_matrix = [] self.document_topic_matrix = [] vocabulary_size = len(self.corpus.vocabulary) row = [] col = [] data = [] for (topic_idx, topic) in enumerate(nmf.components_): for i in range(vocabulary_size): row.append(topic_idx) col.append(i) data.append(topic[i]) self.topic_word_matrix = coo_matrix((data, (row, col)), shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr() row = [] col = [] data = [] doc_count = 0 for doc in topic_document: topic_count = 0 for topic_weight in doc: row.append(doc_count) col.append(topic_count) data.append(topic_weight) topic_count += 1 doc_count += 1 self.document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
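# A hedged companion sketch for reading topics back out of the matrices built
# by infer_topics above. It assumes the wrapper's corpus.vocabulary maps a word
# index to its string, which may differ in the real corpus class.
def top_topic_words(model, topic_id, num_words=10):
    weights = model.topic_word_matrix[topic_id].toarray().ravel()
    best = weights.argsort()[::-1][:num_words]
    return [(model.corpus.vocabulary[i], weights[i]) for i in best]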
def reduceDimensionality(n_components=100): # import the csv into a pandas df df = pd.read_csv('data/gameData.csv') # Normalize the numeric columns to values in [0,1] numericColumns = ['maxPlayers','maxPlaytime','minAge','minPlayers','minPlaytime','playtime'] colsToNormalize = [] for col in numericColumns: if col in df.columns: colsToNormalize.append(col) df[colsToNormalize] = df[colsToNormalize].apply(lambda x: (x - x.min())/(x.max() - x.min())/2) # Drop string columns colsToDrop = ['artists','categories','designers','families','publishers','mechanics','boardGameId','yearPublished'] # Convert df to an array for NMF and stor the board game id column to attach later boardGameIds = df['boardGameId'] arr = df.as_matrix([col for col in df.columns if col not in colsToDrop]) arr = np.nan_to_num(arr) # Perform NMF with n_dimensions model = NMF(n_components=n_components) W = model.fit_transform(arr) W = np.insert(W, 0, boardGameIds, axis=1) np.savetxt("data/reducedGameFeatures.csv", W, delimiter=",")
def extractTemplate(y, w=d_w, h=d_h, n_components=nc): model = NMF(n_components=n_components, max_iter=max_iter, beta=beta) S = librosa.core.stft(y, n_fft=w, hop_length=h) model.fit_transform(np.abs(S).T) components = model.components_.T #components, activation = librosa.decompose.decompose(np.abs(S), n_components=3) return components
def get_topics_nmf(urls, num_topics): '''Input: URL containing links to each document (pdf) in the corpus (i.e. arxiv) Output: the num_topics most important latent topics from the corpus (via NMF) ''' article_info = [] for url in urls: article_info.append(get_text(url)) text = [] for thing in article_info: text.extend(thing[0]) text = clean_pdf_text(text) tfidf_math = TfidfVectorizer(max_features=100, stop_words=math_stop(), ngram_range=(1, 1), decode_error='ignore') M = tfidf_math.fit_transform(text) feature_names = tfidf_math.get_feature_names() feature_names = [WordNetLemmatizer().lemmatize(word) for word in feature_names] nmf = NMF(n_components=num_topics) nmf.fit(M) topics = [] for topic_idx, topic in enumerate(nmf.components_): topics.append((" ".join([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]))) return M, topics, text, title_list, urls
def extract_tfidf_nmf_feats(self, df_data, n_components): """ Extract tfidf features using nmf. """ df_feat = pd.DataFrame(index=range(df_data.shape[0])) tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english') tsvd = TruncatedSVD(n_components=n_components, random_state = 2016) nmf = NMF(solver='cd', n_components=n_components, init='nndsvda', random_state=0, tol=1e-3) df_data['q'].to_csv('q', index=False) df_data['t'].to_csv('t', index=False) df_data['d'].to_csv('d', index=False) print('fitting in tfidf') tfidf.set_params(input='filename') tfidf.fit(['q','t','d']) tfidf.set_params(input='content') for col in ['d', 't', 'q', 'b']: print('process column', col) txt = df_data[col] tfidf_mat = tfidf.transform(txt) nd_feat = nmf.fit_transform(tfidf_mat) tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \ for i in range(n_components)]) df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True) saveit(df_feat, 'df_tfidf_nmf_feats')
def get_LDA(X, num_components=10, show_topics=True): """ Latent Dirichlet Allication by NMF. 21 Nov 2015, Keunwoo Choi LDA for a song-tag matrix. The motivation is same as get_LSI. With NMF, it is easier to explain what each topic represent - by inspecting 'H' matrix, where X ~= X' = W*H as a result of NMF. It is also good to have non-negative elements, straight-forward for both W and H. """ from sklearn.decomposition import NMF if X == None: print 'X is omitted, so just assume it is the mood tag mtx w audio.' X = np.load(PATH_DATA + FILE_DICT["mood_tags_matrix"]) #np matrix, 9320-by-100 nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt. W = nmf.fit_transform(X) H = nmf.components_ print '='*60 print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1])) term_rankings = [] moodnames = cP.load(open(PATH_DATA + FILE_DICT["moodnames"], 'r')) #list, 100 for topic_index in range( H.shape[0] ): top_indices = np.argsort( H[topic_index,:] )[::-1][0:10] term_ranking = [moodnames[i] for i in top_indices] term_rankings.append(term_ranking) if show_topics: print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) ) print '='*60 cP.dump(term_rankings, open(PATH_DATA + (FILE_DICT["mood_topics_strings"] % num_components), 'w')) return W / np.max(W) # return normalised matrix, [0, 1]
def get_LDA(X, num_components=10, show_topics=True): ''' Latent Dirichlet Allication by NMF. 21 Nov 2015, Keunwoo Choi LDA for a song-tag matrix. The motivation is same as get_LSI. With NMF, it is easier to explain what each topic represent - by inspecting 'H' matrix, where X ~= X' = W*H as a result of NMF. It is also good to have non-negative elements, straight-forward for both W and H. ''' from sklearn.decomposition import NMF nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt. W = nmf.fit_transform(X) H = nmf.components_ print '='*60 print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1])) term_rankings = [] moodnames = cP.load(open(PATH_DATA + FILE_DICT['sorted_tags'], 'r')) #list, 100 for topic_index in range( H.shape[0] ): top_indices = np.argsort( H[topic_index,:] )[::-1][0:10] term_ranking = [moodnames[i] for i in top_indices] term_rankings.append(term_ranking) if show_topics: print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) ) print '='*60 cP.dump(nmf, open(PATH_DATA + 'NMF_object.cP', 'w')) cP.dump(term_rankings, open(PATH_DATA + ('topics_strings_%d_components.cP' % num_components), 'w')) for row_idx, row in enumerate(W): if np.max(row) != 0: W[row_idx] = row / np.max(row) return W / np.max(W) # return normalised matrix, [0, 1] ''''''
class NMFReducer(): def __init__(self, dataset, dataset_name, num_components=10): self.dataset = dataset self.dataset_name = dataset_name self.labels = dataset.target self.scaler = MinMaxScaler() self.data = self.scaler.fit_transform(dataset.data) self.n_samples, self.n_features = self.data.shape self.reducer = NMF(n_components=num_components, max_iter=5000) def reduce(self): self.reducer.fit(self.data) self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data)) return self.reduced def benchmark(self, estimator, name, data): t0 = time() sample_size = 300 labels = self.labels estimator.fit(data) print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_), metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=sample_size))) def display_reduced_digits(self): sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w') print("NMF Reduction of %s:\n" % self.dataset_name) print(40 * '-') print(self.reduced) print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0])) print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0])) print(40 * '-') print(self.reducer.reconstruction_err_) def display_reduced_iris(self): sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w') print("NMF Reduction of %s:\n" % self.dataset_name) print(40 * '-') print(self.reduced) print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0])) print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0])) print(40 * '-') print(self.reducer.reconstruction_err_) def reduce_crossvalidation_set(self, X_train, X_test): self.reducer.fit(X_train) reduced_X_train = self.scaler.transform(X_train) reduced_X_test = self.scaler.transform(X_test) return reduced_X_train, reduced_X_test
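# A minimal usage sketch for NMFReducer, assuming the class's own imports
# (NMF, MinMaxScaler, metrics, time, sys) are in scope; load_digits() provides
# a non-negative dataset with .data and .target, which is all __init__ needs.
from sklearn.datasets import load_digits

digits = load_digits()
reducer = NMFReducer(digits, "digits", num_components=10)
reduced = reducer.reduce()
print(reduced.shape)    # (n_samples, 10)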
def test_nmf_transform(): # Test that NMF.transform returns close values A = np.abs(random_state.randn(6, 5)) for solver in ('pg', 'cd'): m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0) ft = m.fit_transform(A) t = m.transform(A) assert_array_almost_equal(ft, t, decimal=2)
# Use tf (raw term count) features for LDA. print("Extracting tf features for LDA") tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tf = tf_vectorizer.fit_transform(data_samples) print("tf feature extraction done in %0.3fs." % (time() - t0)) print() # Fit the NMF model print("Fitting the NMF model (Frobenius norm) with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in NMF model (Frobenius norm):") tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) # Fit the NMF model print("Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000,
start_time = time.time() # vectorize documents by using tfidf vectorizer tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenization, max_features=n_features, max_df=0.9, min_df=2) docs_tfidf = tfidf_vectorizer.fit_transform(doc_set) termid_word_list = tfidf_vectorizer.get_feature_names( ) # word = termid_word_list[indx] print("Fitting the NMF model...") # solver: coordinate descent; learning rate: alpha = 0.1; #l1_ratio 0: L2 regularization, NO L1 regularization nmf_model = NMF(n_components=n_factors, random_state=1, solver='cd', alpha=.1, l1_ratio=.0) # generate latent factors for documents based on NMF model docs_lf = nmf_model.fit_transform(docs_tfidf) for qIndex in range(0, len(queryID_list)): #for qIndex in range(0, 2): print(str(qIndex) + "/" + str(len(queryID_list))) query_str = queries_dict[queryID_list[qIndex]] query = [query_str] # generate tfidf vector for the query query_tfidf = tfidf_vectorizer.transform(query) # generate latent factor for the query based on NMF model query_lf = nmf_model.transform(query_tfidf)
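# A sketch of the ranking step that usually follows: score every document
# against the query by cosine similarity in the NMF latent space. Assumes
# docs_lf and query_lf from the loop above; the top-10 cut-off is arbitrary.
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(query_lf, docs_lf).ravel()
top_doc_indices = similarities.argsort()[::-1][:10]
for rank, doc_idx in enumerate(top_doc_indices, start=1):
    print(rank, doc_idx, similarities[doc_idx])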
# tf-idf for max_fq in df_gradients: tweetImport = codecs.open(importfilename, 'r', 'utf-8') # NMF can use tf-idf # lowercase=False tfidf_vectorizer = TfidfVectorizer(strip_accents='ascii', ngram_range=(ngram_min, ngram_max), max_df=max_fq, min_df=1, max_features=num_features, stop_words=stop_words, analyzer='word', token_pattern='[a-zA-Z]+') tfidf_matrix = tfidf_vectorizer.fit_transform(tweetImport) tfidf_feature_names = tfidf_vectorizer.get_feature_names() stop_words.extend(tfidf_feature_names) tweetImport.close() # save the terms ranked by tfidf scores into a list, to be used for wordcloud plotting version = 3 saveTerms_sortedTFIDFscores(outputPath, max_fq, num_features, version, tfidf_feature_names, tfidf_matrix) # Run NMF (results not as good as LDA) nmf = NMF(n_components=num_topics, random_state=1, alpha=0, init='random').fit(tfidf_matrix) display_topics(nmf, tfidf_feature_names, num_top_words) # plot all wordclouds in one figure # wordcloud_in_one_figure(outputPath, num_features, df_gradients) # plot individual wordclouds: plt.rcParams['figure.figsize'] = (10.0, 7.0) for max_fq in df_gradients: tfidffilename = outputPath + 'tweet_keyword_tradewar_tfidf_features_' + str(max_fq) + '_' + str(num_features) + '_v3.csv' tfidffile = open(tfidffilename, 'r') word_text = tfidffile.read() wordcloud = WordCloud(colormap='hsv', max_words=1000, width=3000, height=2000, margin=3, collocations=False).generate(word_text)
def topics(df, model="lda", stopwords=None): """ Either executes LDA or NMF on a dutch document. This is a simple implementation and only used for "fun" purposes. It is not so much to find the very best topics, but topics that are good enough. Parameters: ----------- df : pandas dataframe Pandas dataframe that contains the raw messages mode : str, default "lda" Which model to use for topic modelling. Either "lda" or "nmf" works for now stopwords : str, default None If you want to remove stopwords, provide a local link to the text file (that includes a list of words) including the extension. """ # Prepare stopwords if stopwords: with open(stopwords) as stopwords_list: stopwords_list = stopwords_list.readlines() stopwords_list = [word[:-1] for word in stopwords_list] else: stopwords_list = [] # Create Topics for user in df.User.unique(): print("#" * len(user) + "########") print("### " + user + " ###") print("#" * len(user) + "########\n") data_samples = df[df.User == user].Message_Prepared data_samples = data_samples.tolist() if model == "lda": # Extracting Features tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords_list) tf = tf_vectorizer.fit_transform(data_samples) # Fitting LDA topic_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', learning_offset=50., random_state=0) topic_model.fit(tf) feature_names = tf_vectorizer.get_feature_names() else: # MNF uses tfidf tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stopwords_list) tfidf = tfidf_vectorizer.fit_transform(data_samples) feature_names = tfidf_vectorizer.get_feature_names() # Run NMF topic_model = NMF(n_components=5, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd') topic_model.fit(tfidf) print("\nTopics in {} model:".format(model)) print_top_words(topic_model, feature_names, 7)
def update_nmf_graph1(no_topics, nmf_components_value, nmf_alpha_value, nmf_l1ratio_value, min_df_value, max_df_value, ngram_range_value, num_clicks): if num_clicks > 0: # Getting the filenames matrix_filename = 'temp_data/' + temporary_key + '_output_matrix.csv' processed_docs_filename = 'temp_data/' + temporary_key + '_processed_docs.csv' features_list_filename = 'temp_data/' + temporary_key + '_features_list.csv' tfidf_fit_filename = 'temp_data/' + temporary_key + '_vectorizer_model.pickle' print('loading nmf input objects') # Read in tfidf dense_tfidf_matrix = pd.read_csv(matrix_filename) print('The shape of the tfidf_matrix is: {}.'.format(dense_tfidf_matrix.shape)) # Reading in the processed documents processed_docs = pd.read_csv(processed_docs_filename, encoding = 'latin1') processed_docs = processed_docs['processed_doc'].tolist() print(processed_docs[0]) features_df = pd.read_csv(features_list_filename) features_list = features_df['feature_list'].tolist() print('The first five token features are: {}.'.format(features_list[:5])) sparse_tfidf_matrix = scipy.sparse.csr_matrix(dense_tfidf_matrix.values) # print(sparse_tfidf_matrix) print('the sparse tfidf matrix is loaded') # Defining the NMF object nmf = NMF(n_components=no_topics, random_state=42, alpha=0.1, l1_ratio=.2, \ max_iter = 500, verbose = False, shuffle = True, init='nndsvd', solver = 'cd') print('Computing the NMF for the sparse tfidf matrix') nmf_model = nmf.fit(sparse_tfidf_matrix) print(nmf_model) #-------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------- def generate_topic_table(model, feature_names, n_top_words): topics = {} for topic_idx, topic in enumerate(model.components_): t = ("topic_%d" % topic_idx) topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)] out_df = pd.DataFrame(topics) out_df = out_df[list(topics.keys())] return out_df #-------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------- print(processed_docs[0])
def log_stdvar_NMF_L2(X): X = log_stdvar(X) k = compute_pcs_needed_to_explain_variance(X,50) nmf = NMF(n_components=k) Xrd = nmf.fit_transform(X) return pairwise_distances(Xrd)
# 1 Define the pipeline ------------------------------------------------------------------------------ # Pipeline definition # --- dimensionality reduction # --- SVM classifier pipe = Pipeline([ ('reduce_dim', PCA()), ('classify', SVC()) ]) # Parameter settings params_grid = [ { 'reduce_dim': [PCA(), NMF(), Isomap(), TruncatedSVD()], 'reduce_dim__n_components': [2, 3], 'classify': [SVC(), LinearSVC()], 'classify__C': [1, 10, 100, 1000] } ] # Check print(params_grid) # 2 Run the hyperparameter tuning ----------------------------------------------------------------------- # <Point> # - Tune the hyperparameters with a grid search
def extract_components(mov_tot, n_components=6, normalize_std=True, max_iter_DL=-30, method_factorization='nmf', **kwargs): """ From optical flow images can extract spatial and temporal components Parameters: ---------- mov_tot: ndarray (can be 3 or 4D) contains the optical flow values, either in cartesian or polar, either one (3D) or both (4D coordinates) the input is generated by the compute_optical_flow function n_components: int number of components to look for normalize_std: bool whether to normalize each oof the optical flow components normalize_output_traces: boolean whether to normalize the behavioral traces so that they match the units in the movie Returns: ------- spatial_filter: ndarray set of spatial inferred filters time_trace:ndarray set of time components norm_fact: ndarray used notmalization factors """ if mov_tot.ndim == 4: if normalize_std: norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3)) mov_tot = old_div(mov_tot, norm_fact[:, np.newaxis, np.newaxis, np.newaxis]) else: norm_fact = np.array([1., 1.]) c, T, d1, d2 = np.shape(mov_tot) else: norm_fact = 1 T, d1, d2 = np.shape(mov_tot) c = 1 tt = time.time() newm = np.reshape(mov_tot, (c * T, d1 * d2)) if method_factorization == 'nmf': nmf = NMF(n_components=n_components, **kwargs) time_trace = nmf.fit_transform(newm) spatial_filter = nmf.components_ spatial_filter = np.concatenate([ np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter ], axis=0) elif method_factorization == 'dict_learn': import spams newm = np.asfortranarray(newm, dtype=np.float32) time_trace = spams.trainDL(newm, K=n_components, mode=0, lambda1=1, posAlpha=True, iter=max_iter_DL) spatial_filter = spams.lasso(newm, D=time_trace, return_reg_path=False, lambda1=0.01, mode=spams.spams_wrap.PENALTY, pos=True) spatial_filter = np.concatenate([ np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter.toarray() ], axis=0) time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T] el_t = time.time() - tt print(el_t) return spatial_filter, time_trace, norm_fact
# Import NMF from sklearn.decomposition import NMF # Create an NMF instance: model model = NMF(n_components=6) # Fit the model to articles model.fit(articles) # Transform the articles: nmf_features nmf_features = model.transform(articles) # Print the NMF features print(nmf_features) #NMF features of the Wikipedia articles # Also available is a list titles giving the title of each Wikipedia article. # Import pandas import pandas as pd # Create a pandas DataFrame: df df = pd.DataFrame(nmf_features, index=titles) # Print the row for 'Anne Hathaway' print(df.loc['Anne Hathaway']) # Print the row for 'Denzel Washington' print(df.loc['Denzel Washington']) #When investigating the features, notice that for both actors, the NMF feature 3 has by far the highest value. This means that both articles are reconstructed using mainly the 3rd NMF component
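# A short follow-up sketch: normalising the NMF features row-wise and taking
# dot products yields cosine similarities between articles, which is the usual
# way these features are used to recommend similar articles. Only nmf_features
# and titles from above are assumed.
from sklearn.preprocessing import normalize

norm_features = normalize(nmf_features)
df_norm = pd.DataFrame(norm_features, index=titles)
article = df_norm.loc['Anne Hathaway']
similarities = df_norm.dot(article)
print(similarities.nlargest())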
def test_nmf_fit_close(solver): rng = np.random.mtrand.RandomState(42) # Test that the fit is not too far away pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600) X = np.abs(rng.randn(6, 5)) assert pnmf.fit(X).reconstruction_err_ < 0.1
def test_n_components_greater_n_features(): # Smoke test for the case of more components than features. rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(30, 10)) NMF(n_components=15, random_state=0, tol=1e-2).fit(A)
# R(m*n): m = item, n = user RATE_MATRIX = np.array( [[5, 5, 3, 0, 5, 5, 4, 3, 2, 1, 4, 1, 3, 4, 5], [5, 0, 4, 0, 4, 4, 3, 2, 1, 2, 4, 4, 3, 4, 0], [0, 3, 0, 5, 4, 5, 0, 4, 4, 5, 3, 0, 0, 0, 0], [5, 4, 3, 3, 5, 5, 0, 1, 1, 3, 4, 5, 0, 2, 4], [5, 4, 3, 3, 5, 5, 3, 3, 3, 4, 5, 0, 5, 2, 4], [5, 4, 2, 2, 0, 5, 3, 3, 3, 4, 4, 4, 5, 2, 5], [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0], [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2], [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]] ) nmf_model = NMF(n_components=2) # assume 2 latent topics item_dis = nmf_model.fit_transform(RATE_MATRIX) user_dis = nmf_model.components_ print('User topic distribution: ' + str(user_dis.shape)) print(user_dis) print('Movie topic distribution: ' + str(item_dis.shape)) print(item_dis) plt1 = plt plt1.plot(item_dis[:, 0], item_dis[:, 1], 'ro') plt1.xlim((-1, 3)) plt1.ylim((-1, 3)) plt1.title(u'Item Distribution') # set the plot title count = 1
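# A minimal follow-up sketch: multiplying the two factors reconstructs an
# approximation of RATE_MATRIX, whose entries in the unrated (zero) cells can
# be read as predicted scores. Uses only the names computed above.
import numpy as np

reconstructed = np.dot(item_dis, user_dis)          # same shape as RATE_MATRIX
observed = RATE_MATRIX > 0
rmse = np.sqrt(np.mean((RATE_MATRIX[observed] - reconstructed[observed]) ** 2))
print("RMSE on observed ratings:", rmse)
print("Predicted rating for item 2, user 11:", reconstructed[2, 11])   # an unrated cell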
pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())]) N_EXPERIMENTS = 5 N_FEATURES_OPTIONS = [4] C_OPTIONS = [1, 10, 100, 1000] reducer_labels = ['PCA', 'NMF', 'KBest(chi2)'] non_nested_scores = np.zeros(N_EXPERIMENTS) nested_scores = np.zeros(N_EXPERIMENTS) ############################################################ param_grid = [ { 'reduce_dim': [PCA(iterated_power=7), NMF()], 'reduce_dim__n_components': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, { 'reduce_dim': [SelectKBest(chi2)], 'reduce_dim__k': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, ] print('Grid Search experiments... ') start = time() for ith_exp in range(N_EXPERIMENTS): # CV technique
# plot the mean cross-validation scores mglearn.tools.heatmap(scores, xlabel='svm__C', xticklabels=param_grid['svm__C'], ylabel='svm__gamma', yticklabels=param_grid['svm__gamma'], cmap="viridis") """-----------------------------------------------------------------------------""" """===========================================================================================""" """-----------------------------------------------------------------------------""" """NMF pre-processing with SVC algorithm""" # Pipelines in Grid Searches pipe = Pipeline([("scaler", NMF()), ("svm", SVC())]) param_grid = {'scaler__n_components': [5], 'svm__C': [0.00001, 0.1], 'svm__gamma': [0.00001, 0.1]} grid = GridSearchCV(pipe, param_grid=param_grid, cv=5) grid.fit(X_train, y_train) pred = grid.predict(X_test) print("NMF pre-processing with SVC algorithm") print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_)) print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test))) print("f1 score: {:.2f}".format(f1_score(y_test, pred))) print("Best parameters: {}".format(grid.best_params_)) print(classification_report(y_test, pred, target_names=["mol", "no_mol"])) scores = grid.cv_results_['mean_test_score'].reshape(2, 2) # plot the mean cross-validation scores mglearn.tools.heatmap(scores, xlabel='svm__C',
print "Extracting tf-idf features for NMF..." tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') tfidf = tfidf_vectorizer.fit_transform(posts) print "Extracting tf features for LDA..." tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') tf = tf_vectorizer.fit_transform(posts) # cell 3 - Using NMF to get top topics print "Fitting the NMF model with tf-idf features," "n_samples=%d and n_features=%d..." % ( n_samples, n_features) nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print "\nTopics in NMF model:" tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) # cell 4 - Using LDA to get top topics print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % ( n_samples, n_features) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(tf)
model.add(e) model.add(Flatten()) model.add(Dense(10173, activation='relu')) model.add(Dropout(0.2)) model.add(Dense(13, activation='softmax')) model.compile(optimizer='Adadelta', loss='categorical_crossentropy', metrics=['acc']) history = model.fit(tfidf, y_label, epochs=20, verbose=1,validation_split=0.3) # Run NMF from sklearn.decomposition import NMF, LatentDirichletAllocation no_topics = 13 nmf = NMF(n_components=no_topics, init='nndsvd').fit(tfidf) W = nmf.fit_transform(tfidf) H = nmf.components_ # Run LDA lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online', learning_offset=50.).fit(tf) W_lda = lda.fit_transform(tf) H_lda = lda.components_ def display_topics(model, feature_names, no_top_words): for topic_idx, topic in enumerate(model.components_): print ("Topic %d:" % (topic_idx)) print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
def handle(self, *args, **options): parent_run_id = options['run_id'] K = options['K'] nWords = 50 #options['nWords'] fileDest = "" #options['fileDest'] parent_stat = RunStats.objects.get(pk=parent_run_id) n_features = parent_stat.max_features if fileDest == '': run_id = init(n_features) stat = RunStats.objects.get(run_id=run_id) stat.query = Query.objects.get(pk=parent_stat.query.id) stat.method = "DT" stat.parent_run_id = parent_run_id stat.save() for tp in parent_stat.periods.all(): stat.periods.add(tp) tops = Topic.objects.filter(run_id=parent_run_id, topicterm__isnull=False).distinct() terms = Term.objects.all() B = np.zeros((tops.count(), terms.count())) wt = 0 for topic in tops: tts = TopicTerm.objects.filter( topic=topic).order_by('-score')[:nWords] if len(tts) == 0: if fileDest != '': print(wt) continue print(topic) for tt in tts: B[wt, tt.term.id] = tt.score * np.log1p(topic.score) wt += 1 col_sum = np.sum(B, axis=0) vocab_ids = np.flatnonzero(col_sum) row_sum = np.sum(B, axis=1) top_ids = np.flatnonzero(row_sum) print(np.where(~B.any(axis=1))) # we only want the columns where there are at least some # topic-term values B = B[:, vocab_ids] print(B.shape) print(np.where(~B.any(axis=1))) if fileDest != '': np.save(fileDest, B) sys.exit() nmf = NMF(n_components=K, random_state=1, alpha=.1, l1_ratio=.5).fit(B) ## Add dynamic topics dtopics = [] for k in range(K): dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id)) dtopic.save() dtopics.append(dtopic) dtopic_ids = list( DynamicTopic.objects.filter(run_id=run_id).values_list('id', flat=True)) print(dtopic_ids) ################## ## Add the dtopic*term matrix to the db print("Adding topicterms to db") t0 = time() ldalambda = find(csr_matrix(nmf.components_)) topics = range(len(ldalambda[0])) tts = [] pool = Pool(processes=8) tts.append( pool.map( partial(f_dlambda, m=ldalambda, v_ids=vocab_ids, t_ids=dtopic_ids, run_id=run_id), topics)) pool.terminate() tts = flatten(tts) gc.collect() sys.stdout.flush() django.db.connections.close_all() DynamicTopicTerm.objects.bulk_create(tts) print("done in %0.3fs." % (time() - t0)) ## Add the wtopic*dtopic matrix to the database gamma = nmf.transform(B) for topic in range(len(gamma)): for dtopic in range(len(gamma[topic])): if gamma[topic][dtopic] > 0: tdt = TopicDTopic(topic=tops[topic], dynamictopic_id=dtopic_ids[dtopic], score=gamma[topic][dtopic]) tdt.save() ## Calculate the primary dtopic for each topic for t in tops: try: t.primary_dtopic = TopicDTopic.objects.filter( topic=t).order_by('-score').first().dynamictopic t.save() except: pass stat.error = parent_stat.error + nmf.reconstruction_err_ stat.errortype = "Frobenius" stat.last_update = timezone.now() stat.save() print("updating and summarising run, {}".format(run_id)) management.call_command('update_run', run_id) management.call_command('update_run', run_id)
stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}') data_vectorized = vectorizer.fit_transform(wines["processed_description"]) NUM_TOPICS = 10 # Latent Dirichlet Allocation Model lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online', verbose=True) data_lda = lda.fit_transform(data_vectorized) # Non-Negative Matrix Factorization Model nmf = NMF(n_components=NUM_TOPICS) data_nmf = nmf.fit_transform(data_vectorized) # Latent Semantic Indexing Model using Truncated SVD lsi = TruncatedSVD(n_components=NUM_TOPICS) data_lsi = lsi.fit_transform(data_vectorized) # Functions for printing keywords for each topic def selected_topics(model, vectorizer, top_n=10): for idx, topic in enumerate(model.components_): print("Topic %d:" % (idx)) print([(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])
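# Usage sketch for the selected_topics helper defined above, printing the
# keywords of each fitted model.
print("NMF topics:")
selected_topics(nmf, vectorizer)
print("LDA topics:")
selected_topics(lda, vectorizer)
print("LSI topics:")
selected_topics(lsi, vectorizer)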
def ldatopicmodeling(sentencetuples, searchobject): """ see: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py CountVectorizer: max_df : float in range [0.0, 1.0] or int, default=1.0 When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. see sample results at end of file :param sentencetuples: :param activepoll: :return: """ maxfeatures = 2000 components = 15 topwords = 15 maxfreq = .60 minfreq = 5 iterations = 12 mustbelongerthan = 2 sentencetuples = [ s for s in sentencetuples if len(s[1].strip().split(' ')) > mustbelongerthan ] sentences = [s[1] for s in sentencetuples] sentences = [s.split(' ') for s in sentences] allwordsinorder = [ item for sublist in sentences for item in sublist if item ] morphdict = getrequiredmorphobjects(set(allwordsinorder)) morphdict = convertmophdicttodict(morphdict) bagsofwords = buildwordbags(searchobject, morphdict, sentences) bagsofsentences = [' '.join(b) for b in bagsofwords] # Use tf (raw term count) features for LDA. ldavectorizer = CountVectorizer(max_df=maxfreq, min_df=minfreq, max_features=maxfeatures) ldavectorized = ldavectorizer.fit_transform(bagsofsentences) lda = LatentDirichletAllocation(n_components=components, max_iter=iterations, learning_method='online', learning_offset=50., random_state=0) lda.fit(ldavectorized) print("\nTopics in LDA model:") tf_feature_names = ldavectorizer.get_feature_names() print_top_words(lda, tf_feature_names, topwords) # Use tf-idf features for NMF. tfidfvectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=maxfeatures) tfidf = tfidfvectorizer.fit_transform(bagsofsentences) # Fit the NMF model nmf = NMF(n_components=components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print("\nTopics in NMF model (Frobenius norm):") tfidffeaturenames = tfidfvectorizer.get_feature_names() print_top_words(nmf, tfidffeaturenames, topwords) # Fit the NMF model print( "Fitting the NMF model (generalized Kullback-Leibler divergence) with " "tf-idf features, n_samples=%d and n_features=%d..." % (len(sentences), maxfeatures)) nmf = NMF(n_components=components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf) print("\nTopics in NMF model (generalized Kullback-Leibler divergence):") tfidffeaturenames = tfidfvectorizer.get_feature_names() print_top_words(nmf, tfidffeaturenames, topwords) return
def gen_decomposition_stats_vector_ftr51(stats_name, size='7d', non_zero=False, decomp_method='lda', n_components=5): """ :param stats_name: str, name of the statistic computed over the drug counts :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d :param non_zero: bool, whether the statistic only counts non-zero entries :param decomp_method: str, decomposition method :param n_components: int, dimensionality after decomposition :return: """ assert decomp_method in ['svd', 'nmf', 'lda'] mask = (stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) & non_zero assert not mask matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero) # 0 load the data ftr51_stats_sparse_matrix = sparse.load_npz( get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray() if decomp_method == 'svd': print(' svd decomposition...') svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42) ftr51_stats_matrix_decomp = svd.fit_transform( ftr51_stats_sparse_matrix) if decomp_method == 'nmf': print(' nmf decomposition...') nmf = NMF(n_components=n_components, init='random', random_state=0, max_iter=200) ftr51_stats_matrix_decomp = nmf.fit_transform( ftr51_stats_sparse_matrix) if decomp_method == 'lda': print(' lda decomposition...') lda = LatentDirichletAllocation(n_components=n_components, max_iter=50, learning_method='online', learning_offset=50., random_state=0, n_jobs=1) ftr51_stats_matrix_decomp = lda.fit_transform( ftr51_stats_sparse_matrix) joblib.dump(lda, "lda_{}_{}.m".format(stats_name, size)) columns = [ '{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size, non_zero, n_components, j) for j in range(ftr51_stats_matrix_decomp.shape[1]) ] stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns) train = stats_df[:15000].reset_index(drop=True) test = stats_df[15000:].reset_index(drop=True) for feature in columns: SaveFeature(train, test, feature) return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format( stats_name, size, non_zero, decomp_method, n_components)
class TweetAnalyzer: def __init__(self, tweets=None): if not tweets: try: with jsonlines.open(TWEETS_FILE) as reader: self.tweets = [tweet for tweet in reader] print('Loaded {} tweets fron {}'.format( len(self.tweets), TWEETS_FILE)) except FileNotFoundError: print("Can't find the tweets file") except Exception as e: print(e) else: self.tweets = tweets # Extract the keys from the first tweet and spread them into a list columns = [*self.tweets[0]] self.tfidf_result = None self.feature_names = None self.df = pd.DataFrame(self.tweets, columns=columns) self.clean_tweets() if DEBUG: print(self.df.head()) def clean_tweets(self): start = timer() self.df.text = self.df.text.apply(TweetPreprocessor.strip_links) self.df.text = self.df.text.apply(TweetPreprocessor.strip_mentions) self.df.text = self.df.text.apply(TweetPreprocessor.strip_hashtags) self.df.text = self.df.text.apply(TweetPreprocessor.strip_rt) self.df.text = self.df.text.apply( TweetPreprocessor.remove_special_characters) end = timer() print('Cleaned tweets in {}'.format(end - start)) def vectorize(self): self.vectorizer = TfidfVectorizer(stop_words='english') self.tfidf_result = self.vectorizer.fit_transform(self.df['text']) self.feature_names = self.vectorizer.get_feature_names() def top_n(self, top=100): if self.feature_names is None or self.tfidf_result is None: print('Must run vectorize() first before calling top_n') return scores = zip(self.feature_names, np.asarray(self.tfidf_result.sum(axis=0)).ravel()) sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) labels, scores = [], [] # Get the scores and labels of the top 100 tweets for item in sorted_scores[:top]: print("{0:50} Score: {1}".format(item[0], item[1])) # sns.distplot(item[1], label=item[0]) labels.append(item[0]) scores.append(item[1]) index = np.arange(len(scores)) plt.bar(index, scores) plt.xlabel('Word', fontsize=12) plt.ylabel('TFIDF Score', fontsize=12) plt.xticks(index, labels, fontsize=8, rotation=90) plt.title('Top {} features'.format(top)) plt.savefig('Top_{}'.format(top)) def topic_model(self, num_topics=10): if DEBUG: print('Performing topic modeling with {} topics'.format(num_topics)) # Build a Latent Dirichlet Allocation Model self.lda_model = LatentDirichletAllocation(n_topics=num_topics, max_iter=10, learning_method='online') lda_Z = self.lda_model.fit_transform(self.tfidf_result) print('LDA shape: ') print(lda_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) # Build a Non-Negative Matrix Factorization Model self.nmf_model = NMF(n_components=num_topics) nmf_Z = self.nmf_model.fit_transform(self.tfidf_result) print('NMF shape: ') print(nmf_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) # Build a Latent Semantic Indexing Model self.lsi_model = TruncatedSVD(n_components=num_topics) lsi_Z = self.lsi_model.fit_transform(self.tfidf_result) print('LSI shape: ') print(lsi_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) if DEBUG: # Let's see how the first document in the corpus looks like in different topic spaces print("LDA Model:") self.print_topics(self.lda_model) print("=" * 20) print("NMF Model:") self.print_topics(self.nmf_model) print("=" * 20) print("LSI Model:") self.print_topics(self.lsi_model) print("=" * 20) # Helper function to print topics def print_topics(self, model, top_n=10): for idx, topic in enumerate(model.components_): print("Topic %d:" % (idx)) print([(self.vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]) def plot_topic_model_SVD(self): from bokeh.io import push_notebook, show, output_notebook from bokeh.plotting import 
figure from bokeh.models import ColumnDataSource, LabelSet output_notebook() self.svd = TruncatedSVD(n_components=2) words_2d = self.svd.fit_transform(self.tfidf_result.T) df = pd.DataFrame(columns=['x', 'y', 'word']) df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], self.feature_names source = ColumnDataSource(ColumnDataSource.from_df(df)) labels = LabelSet(x="x", y="y", text="word", y_offset=8, text_font_size="8pt", text_color="#555555", source=source, text_align='center') plot = figure(plot_width=600, plot_height=600) plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8) plot.add_layout(labels) show(plot, notebook_handle=True)
E_symbol = np.asarray(E_symbol) P_symbol = np.asarray(P_symbol) E = pd.DataFrame(E) PeakO = pd.DataFrame(PeakO) E = quantileNormalize(E) PeakO = quantileNormalize(PeakO) print("Initializing non-negative matrix factorization for E...") E[E > 10000] = 10000 X = np.log(1 + E) err1 = np.zeros(rep) for i in range(0, rep): model = NMF(n_components=K, init='random', random_state=i, solver='cd', max_iter=50) W20 = model.fit_transform(X) H20 = model.components_ err1[i] = LA.norm(X - np.dot(W20, H20), ord='fro') model = NMF(n_components=K, init='random', random_state=np.argmin(err1), solver='cd', max_iter=1000) W20 = model.fit_transform(X) H20 = model.components_ S20 = np.argmax(H20, 0)
y = x_p[:, 1] plt.figure() plt.title('after the PCA method') plt.scatter(x, y, c=label) plt.xlabel('dimension 1') plt.ylabel('dimension 2') # So the yellow colour represents the people who died # We can also use the NMF method # In[31]: from sklearn.decomposition import NMF nmf = NMF(n_components=2) x_n = nmf.fit(data).transform(data) print(x_n) x = x_n[:, 0] y = x_n[:, 1] plt.figure() plt.title('after the NMF method') plt.scatter(x, y, c=label) plt.xlabel('dimension 1') plt.ylabel('dimension 2') # In what follows, we use a machine-learning method to predict the class: the patients either "died" or were "discharged" from the hospital. You can use K-Nearest Neighbours (K-NN) classification, a decision tree, or a Bayes classifier. # In[42]:
def lanchNMF(self): model = NMF(n_components=3, init='random', random_state=0) self.nmf_ = model.fit_transform(self.img)
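# A standalone sketch of the same idea outside the class, assuming `img` stands
# for a non-negative 2-D array such as a grayscale image (a random stand-in is
# used here): multiplying the transformed data by the components gives the
# rank-3 reconstruction.
import numpy as np
from sklearn.decomposition import NMF

img = np.random.rand(64, 64)                    # toy stand-in for a real image
model = NMF(n_components=3, init='random', random_state=0)
W = model.fit_transform(img)
approximation = np.dot(W, model.components_)
print("reconstruction error:", np.linalg.norm(img - approximation))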
import numpy as np from sklearn.decomposition import NMF,TruncatedSVD,ProjectedGradientNMF model = NMF(n_components=2, alpha=0.01) #Store AD ad_ID_dict = {} #ad_list = [] #ad_list = list(ad_list) #Assign ID number ad_ID = 0 user_ID = 0 max_feature = 0 #ad_ID for ad_nmu adID_for_num = {} with open ('ad_ID.dat') as file: for line in file: data = line.strip('\n').split(' ') #print(data) adID_for_num[int(data[1])] = int(data[0]) file.close()
# Challenge 1 #%% import numpy as np np.set_printoptions(threshold=np.inf) from sklearn.decomposition import NMF M = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4], [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2], [1, 5, 3, 5, None, 5, 5]] M1 = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4], [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2]] M2 = [[4, 4, 2, 2, 1, 1], [1, 5, 5, 2, 4, 5], [1, 5, 1, 1, 1, 4], [5, 4, 3, 1, 1, 2], [1, 4, 4, 1, 5, 5], [5, 5, 3, 5, 1, 2], [1, 5, 3, 5, 5, 5]] model1 = NMF(n_components=3) model1.fit(M1) W2 = model1.fit_transform(M2) H2 = model1.components_ W1 = model1.fit_transform(M1) H1 = model1.components_ print(np.matmul(W2, H1)) # Challenge 2 #%% # Lloyd’s algorithm import random import matplotlib.pyplot as plt class lloyds(object):
start_time = time.time() U_50, sigma_50, Vt_50 = svds(demeaned_input, k=50) sigma_50 = np.diag(sigma_50) svd_50_prediction = np.dot(np.dot(U_50, sigma_50), Vt_50) + user_mean end_time = time.time() svd_50_HR10 = test.hit_rate(svd_50_prediction[len(train_data):], last_item, 10) svd_50_HR25 = test.hit_rate(svd_50_prediction[len(train_data):], last_item, 25) svd_50_arhr = test.arhr(svd_50_prediction[len(train_data):], last_item) svd_50_time = end_time - start_time # NMF start_time = time.time() nmf = NMF(2) W = nmf.fit_transform(entire_data) H = nmf.components_ nmf_prediction = np.dot(W, H) end_time = time.time() nmf_HR10 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 10) nmf_HR25 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 25) nmf_arhr = test.arhr(nmf_prediction[len(train_data):], last_item) nmf_time = end_time - start_time # print tabulated result table = tabulate( [[ 'HR10', dhrbm_HR10, itempop_HR10, itempop_cluster_HR10, svd_10_HR10, svd_50_HR10, nmf_HR10
image_shape = people.images[0].shape mask = np.zeros(people.target.shape, dtype=np.bool) for target in np.unique(people.target): mask[np.where(people.target == target)[0][:50]] = 1 X_people = people.data[mask] y_people = people.target[mask] X_people = X_people / 255. X_train, X_test, y_train, y_test = train_test_split( \ X_people, y_people, stratify=y_people, random_state=0) mglearn.plots.plot_nmf_illustration() mglearn.plots.plot_nmf_faces(X_train, X_test, image_shape) from sklearn.decomposition import NMF nmf = NMF(n_components=15, random_state=0) nmf.fit(X_train) X_train_nmf = nmf.transform(X_train) X_test_nmf = nmf.transform(X_test) fix, axes = plt.subplots(3, 5, figsize=(15, 12), \ subplot_kw={'xticks': (), 'yticks': ()}) for i, (component, ax) in enumerate(zip(nmf.components_, axes.ravel())): ax.imshow(component.reshape(image_shape)) ax.set_title("{}. component".format(i)) # display the data that has large weighting for comp compn = 11 inds = np.argsort(X_train_nmf[:, compn])[::-1] fix, axes = plt.subplots(2, 5, figsize=(15, 8), \ subplot_kw={'xticks': (), 'yticks': ()})
def test_custom_nmf(self): mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64) mat[:mat.shape[1], :] += np.identity(mat.shape[1]) mod = NMF(n_components=2) W = mod.fit_transform(mat) H = mod.components_ def predict(W, H, row_index, col_index): return np.dot(W[row_index, :], H[:, col_index]) pred = mod.inverse_transform(W) exp = [] got = [] for i in range(mat.shape[0]): for j in range(mat.shape[1]): exp.append((i, j, pred[i, j])) got.append((i, j, predict(W, H, i, j))) max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got)) assert max_diff <= 1e-5 def nmf_to_onnx(W, H): """ The function converts a NMF described by matrices *W*, *H* (*WH* approximate training data *M*). into a function which takes two indices *(i, j)* and returns the predictions for it. It assumes these indices applies on the training data. """ col = OnnxArrayFeatureExtractor(H, 'col') row = OnnxArrayFeatureExtractor(W.T, 'row') dot = OnnxMul(col, row, op_version=TARGET_OPSET) res = OnnxReduceSum(dot, output_names="rec", op_version=TARGET_OPSET) indices_type = np.array([0], dtype=np.int64) onx = res.to_onnx(inputs={'col': indices_type, 'row': indices_type}, outputs=[('rec', FloatTensorType((None, 1)))]) return onx model_onnx = nmf_to_onnx(W, H) sess = InferenceSession(model_onnx.SerializeToString()) def predict_onnx(sess, row_indices, col_indices): res = sess.run(None, {'col': col_indices, 'row': row_indices}) return res onnx_preds = [] for i in range(mat.shape[0]): for j in range(mat.shape[1]): row_indices = np.array([i], dtype=np.int64) col_indices = np.array([j], dtype=np.int64) pred = predict_onnx(sess, row_indices, col_indices)[0] onnx_preds.append((i, j, pred[0, 0])) max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds)) assert max_diff <= 1e-5
def plot_optimal_k(docs, document_term_mat, vectorizer, kmin=3, kmax=15, num_top_terms=15, alpha=.1, l1_ratio=.5, dim_size=500, min_df=20, max_vocab_size=5000, model_file_path='./data/', model_file_name='w2v-model.bin'): ''' Run NMF for each k between min and max and plot to assess optimal k. Input docs - corpus of docuemnts as a list document_term_mat - TFIDF matrix from the vectorizer vectorizer - scikit-learn TFIDF vectorizer (trained in TopicModeller) Returns: Int - optimal k number ''' topic_models = [] # Run NMF for each value of k for k in range(kmin, kmax+1): t1 = time.time() # Run NMF model = NMF(n_components=k, init='nndsvd', alpha=alpha, l1_ratio=l1_ratio) W = model.fit_transform(document_term_mat) H = model.components_ # Store for iterating over all the models (of each k size) topic_models.append((k, W, H)) print("Processed NMF for k=%d of %d - Time: %0.3fs." % (k, kmax, (time.time() - t1)), end='\r', flush=True) print() # If the model is already built get it from disk, otherwise # build a Skipgram Word2Vec model from all documents # in the input file using Gensim: model_path = model_file_path + model_file_name if not os.path.exists(model_file_path): os.makedirs(model_file_path) w2v_model = None try: w2v_model = gensim.models.Word2Vec.load(model_path) except Exception as e: print('No existing word2vec model found to load. Exception: %s.\n' 'Building it...' % (e)) # w2v_model = None - uncomment to force rebuild every time if w2v_model: print('Existing word2vec Model loaded from \'%s\'' % model_path) else: docgen = nlp_utils.TokenGenerator(docs) # Process w2v with model of n dimensions and min doc-term freq as min_df t1 = time.time() w2v_model = gensim.models.Word2Vec(docgen, sg=1, size=dim_size, max_vocab_size=max_vocab_size, min_count=min_df) print("- Time: %0.3fs." % (time.time() - t1)) # Save for later use, so that we do not need to rebuild it: print('Saving it...') w2v_model.save(model_path) print(('word2vec model has %d terms' % len(w2v_model.wv.vocab))) # Implement TC-W2V coherence score measure def calculate_coherence(w2v_model, term_rankings): overall_coherence = 0.0 for topic_index in range(len(term_rankings)): # check each pair of terms pair_scores = [] # print 'Topic %s: %s top words: %s' % (topic_index, # len(term_rankings[topic_index]), # term_rankings[topic_index]) for pair in combinations(term_rankings[topic_index], 2): pair_scores.append(w2v_model.similarity(pair[0], pair[1])) # get the mean for all pairs in this topic topic_score = sum(pair_scores) / len(pair_scores) overall_coherence += topic_score # get the mean score across all topics return overall_coherence / len(term_rankings) # Function to get the topic descriptor # (i.e. 
list of top terms) for each topic: def get_descriptor(all_terms, H, topic_index, num_top_terms): # reverse sort the values to sort the indices top_indices = np.argsort(H[topic_index, :])[::-1] # now get the terms corresponding to the top-ranked indices top_terms = [] for term_index in top_indices[0:num_top_terms]: top_terms.append(all_terms[term_index]) return top_terms # Process each of the models for different values of k: vocab = vectorizer.get_feature_names() # vocab = w2v_model.wv.vocab # Process each of the models for different values of k: k_values = [] coherences = [] print('Calculating coherence scores...') for (k, W, H) in topic_models: # Get all topic descriptors - the term_rankings, based on top n terms term_rankings = [] for topic_index in range(k): # term_rankings.append(get_descriptor(vocab, H, topic_index, num_top_terms)) top_words = [vocab[i] for i in H[topic_index, :].argsort()[:-num_top_terms - 1:-1]] top_words = [x for x in top_words if x in w2v_model.wv.vocab] term_rankings.append(top_words) # Calculate the coherence based on our Word2vec model k_values.append(k) coherences.append(calculate_coherence(w2v_model, term_rankings)) # print(('K=%02d: Coherence=%.4f' % (k, coherences[-1]))) # Plot a line of coherence scores to identify an appropriate k value. plt.style.use("ggplot") matplotlib.rcParams.update({"font.size": 14}) fig = plt.figure(figsize=(13, 7)) # Create the line plot ax = plt.plot(k_values, coherences) plt.xticks(k_values) plt.xlabel("Number of Topics") plt.ylabel("Mean Coherence") # Add the points plt.scatter(k_values, coherences, s=120) # Find and annotate the maximum point on the plot ymax = max(coherences) xpos = coherences.index(ymax) best_k = k_values[xpos] plt.annotate('k=%d' % best_k, xy=(best_k, ymax), xytext=(best_k, ymax), textcoords="offset points", fontsize=16) print('Optimal number of k topics: %s' % best_k) # Show the plot plt.show() k = best_k # Get the model that we generated earlier. W = topic_models[k-kmin][1] H = topic_models[k-kmin][2] # Display the topics and descriptor words for the best k model for topic_index in range(k): descriptor = get_descriptor(vectorizer.get_feature_names(), H, topic_index, num_top_terms) str_descriptor = ", ".join(descriptor) print(("Topic %02d: %s" % (topic_index, str_descriptor))) return int(k)