def nmf_new(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components, init='nndsvdar', random_state=0):
    # Numerical solver to use: 'pg' is a Projected Gradient solver (deprecated);
    # 'cd' is a Coordinate Descent solver (recommended).
    model = NMF(n_components=n_components, init=init, random_state=random_state)
    # TODO: do this in a loop over the four input matrices instead of repeating the same block
    model.fit(mut_final)
    gene_comp = model.components_.copy()
    patient_strat = np.argmax(model.fit_transform(mut_final), axis=1).copy()  # fit_transform is more efficient than calling fit followed by transform
    model.fit(mut_diff)
    gene_comp_diff = model.components_.copy()
    patient_strat_diff = np.argmax(model.fit_transform(mut_diff), axis=1).copy()
    model.fit(mut_mean_qn)
    gene_comp_mean_qn = model.components_.copy()
    patient_strat_mean_qn = np.argmax(model.fit_transform(mut_mean_qn), axis=1).copy()
    model.fit(mut_median_qn)
    gene_comp_median_qn = model.components_.copy()
    patient_strat_median_qn = np.argmax(model.fit_transform(mut_median_qn), axis=1).copy()
    return (gene_comp, patient_strat,
            gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
def extractTemplate(y, w=d_w, h=d_h, n_components=nc): model = NMF(n_components=n_components, max_iter=max_iter, beta=beta) S = librosa.core.stft(y, n_fft=w, hop_length=h) model.fit_transform(np.abs(S).T) components = model.components_.T #components, activation = librosa.decompose.decompose(np.abs(S), n_components=3) return components
def test_nmf_inverse_transform(): # Test that NMF.inverse_transform returns close values random_state = np.random.RandomState(0) A = np.abs(random_state.randn(6, 4)) m = NMF(n_components=4, init="random", random_state=0) m.fit_transform(A) t = m.transform(A) A_new = m.inverse_transform(t) assert_array_almost_equal(A, A_new, decimal=2)
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    '''
    def __init__(self, folder='model', modeltype='kpca', topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        # compare strings with == (identity checks with `is` only work by accident of interning)
        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf', gamma=1., n_components=topics)
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self, X):
        '''
        fits a topic model
        INPUT
        X   list of strings
        '''
        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].fit_transform(\
        #        self.bow['count_vectorizer'].fit_transform(X))
        # depending on the model, train
        if self.modeltype == 'kpca':
            Xc = self.model.fit_transform(X)
        if self.modeltype == 'nmf':
            Xc = self.model.fit_transform(X)

    def predict(self, X):
        '''
        predicts cluster assignment from list of strings
        INPUT
        X   list of strings
        '''
        if not isinstance(X, list):
            X = [X]
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].transform(\
        #        self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kpca':
            return self.model.transform(X)
        if self.modeltype == 'nmf':
            return self.model.transform(X)
def test_nmf_transform_custom_init(): # Smoke test that checks if NMF.transform works with custom initialization A = np.abs(random_state.randn(6, 5)) n_components = 4 avg = np.sqrt(A.mean() / n_components) H_init = np.abs(avg * random_state.randn(n_components, 5)) W_init = np.abs(avg * random_state.randn(6, n_components)) m = NMF(solver="cd", n_components=n_components, init="custom", random_state=0) m.fit_transform(A, W=W_init, H=H_init) m.transform(A)
def get_features(head_and_body):
    filename = "NMF_topics" + str(n_topics) + "topics"
    if include_holdout:
        filename += "_holdout"
    if include_unlbled_test:
        filename += "unlbled_test"

    if not os.path.exists(features_dir + "/" + filename + ".pkl"):
        X_all, vocab = get_all_data(head_and_body, filename)

        # calculates the n most important topics of the bodies. Each topic contains all words, ordered by
        # importance. The more important topic words a body contains for a certain topic, the higher its
        # value for this topic.
        nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

        print("NMF_topics: fit and transform body")
        t0 = time()
        nfm.fit_transform(X_all)
        print("done in %0.3fs." % (time() - t0))

        with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
            joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        vocab = get_vocab(head_and_body, filename)
        with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
            nfm = joblib.load(handle)

    vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    print("NMF_topics: transform head and body")
    # use the NMF model trained on the body topics on the headlines => if the headlines and bodies
    # share topics, their vectors should be similar
    nfm_head_matrix = nfm.transform(X_train_head)
    nfm_body_matrix = nfm.transform(X_train_body)

    if not cosinus_dist:
        return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
    else:
        # calculate the cosine distance between body and head
        X = []
        for i in range(len(nfm_head_matrix)):
            X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d array input is deprecated
            X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
def test_nmf_transform(): # Test that NMF.transform returns close values A = np.abs(random_state.randn(6, 5)) m = NMF(n_components=4, init="nndsvd", random_state=0) ft = m.fit_transform(A) t = m.transform(A) assert_array_almost_equal(ft, t, decimal=2)
def nmf(self, **kwargs):
    """Perform dimensionality reduction using NMF."""
    nmf = NMF(**kwargs)
    reduced_matrix = nmf.fit_transform(self.matrix)
    # TODO: it is incorrect to pass self.column_labels! There are no column labels after the reduction.
    return Space(reduced_matrix, self.row_labels, self.column_labels)
def test_nmf_fit_nn_output(): # Test that the decomposition does not contain negative values A = np.c_[5 * np.ones(5) - np.arange(1, 6), 5 * np.ones(5) + np.arange(1, 6)] for init in (None, "nndsvd", "nndsvda", "nndsvdar"): model = NMF(n_components=2, init=init, random_state=0) transf = model.fit_transform(A) assert_false((model.components_ < 0).any() or (transf < 0).any())
def find_template(music_stft, sr, min_t, n_components, start, end): """ from Prem :param music_stft: :param sr: :param min_t: :param n_components: :param start: :param end: :return: """ template_stft = music_stft[:, start:end] layer = librosa.istft(template_stft) layer_rms = np.sqrt(np.mean(layer * layer)) comps = [] acts = [] errors = [] for T in range(min_t, n_components): transformer = NMF(n_components=T) comps.append(transformer.fit_transform(np.abs(template_stft))) acts.append(transformer.components_) errors.append(transformer.reconstruction_err_) # knee = np.diff(errors, 2) # knee = knee.argmax() + 2 knee = 0 # print 'Using %d components' % (knee + min_t) return comps[knee], acts[knee]
def hog2hognmf(hog_feature): """Transform HOG feature into HOG-NMF feature. Parameters ---------- hog_feature: np.ndarray HOG feature. """ mat = np.zeros((500, 8), dtype=np.float32) NMFmodel = NMF(n_components=2, init="random", random_state=0) # Transform 3780 into 500 * 8 for i in range(7): mat[:, i] = hog_feature[i * 500 : (i + 1) * 500] mat[:280, 7] = hog_feature[3500:] W = NMFmodel.fit_transform(mat) H = NMFmodel.components_ hognmf_feature = np.array([], dtype=np.float32) for i in range(8): _sum = np.sum(H[:, i]) if _sum == 0: H[:, i] *= 0.0 else: H[:, i] /= _sum hognmf_feature = np.append(hognmf_feature, H[:, i]) for i in range(500): _sum = np.sum(W[i, :]) if _sum == 0: W[i, :] *= 0.0 else: W[i, :] /= _sum hognmf_feature = np.append(hognmf_feature, W[i, :]) return hognmf_feature
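# Hedged usage sketch for hog2hognmf above (not part of the original source); it assumes the function's
# own imports (numpy as np, sklearn NMF) are already in scope. The 500x8 reshape implies a
# 3780-dimensional HOG descriptor (7*500 + 280 values are used). With n_components=2 the output
# concatenates the 8 normalized columns of H (2 values each) and the 500 normalized rows of W
# (2 values each), i.e. 8*2 + 500*2 = 1016 features.
import numpy as np

if __name__ == "__main__":
    dummy_hog = np.random.rand(3780).astype(np.float32)  # stand-in for a real HOG vector
    feat = hog2hognmf(dummy_hog)
    print(feat.shape)  # expected: (1016,)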
def get_LDA(X, num_components=10, show_topics=True): ''' Latent Dirichlet Allication by NMF. 21 Nov 2015, Keunwoo Choi LDA for a song-tag matrix. The motivation is same as get_LSI. With NMF, it is easier to explain what each topic represent - by inspecting 'H' matrix, where X ~= X' = W*H as a result of NMF. It is also good to have non-negative elements, straight-forward for both W and H. ''' from sklearn.decomposition import NMF nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt. W = nmf.fit_transform(X) H = nmf.components_ print '='*60 print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1])) term_rankings = [] moodnames = cP.load(open(PATH_DATA + FILE_DICT['sorted_tags'], 'r')) #list, 100 for topic_index in range( H.shape[0] ): top_indices = np.argsort( H[topic_index,:] )[::-1][0:10] term_ranking = [moodnames[i] for i in top_indices] term_rankings.append(term_ranking) if show_topics: print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) ) print '='*60 cP.dump(nmf, open(PATH_DATA + 'NMF_object.cP', 'w')) cP.dump(term_rankings, open(PATH_DATA + ('topics_strings_%d_components.cP' % num_components), 'w')) for row_idx, row in enumerate(W): if np.max(row) != 0: W[row_idx] = row / np.max(row) return W / np.max(W) # return normalised matrix, [0, 1] ''''''
def infer_topics(self, num_topics=10): self.nb_topics = num_topics nmf = NMF(n_components=num_topics) topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space) self.topic_word_matrix = [] self.document_topic_matrix = [] vocabulary_size = len(self.corpus.vocabulary) row = [] col = [] data = [] for (topic_idx, topic) in enumerate(nmf.components_): for i in range(vocabulary_size): row.append(topic_idx) col.append(i) data.append(topic[i]) self.topic_word_matrix = coo_matrix((data, (row, col)), shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr() row = [] col = [] data = [] doc_count = 0 for doc in topic_document: topic_count = 0 for topic_weight in doc: row.append(doc_count) col.append(topic_count) data.append(topic_weight) topic_count += 1 doc_count += 1 self.document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
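# Side note (a sketch, not from the original source): since nmf.components_ and the document-topic
# matrix returned by fit_transform are already dense 2-D arrays, the element-wise coo_matrix
# construction in infer_topics above can be replaced by converting them directly, which is
# equivalent but shorter. `vector_space` is a placeholder name for the corpus matrix.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

def infer_topics_dense(vector_space, num_topics=10):
    nmf = NMF(n_components=num_topics)
    topic_document = nmf.fit_transform(vector_space)
    topic_word_matrix = csr_matrix(nmf.components_)      # (num_topics, vocabulary_size)
    document_topic_matrix = csr_matrix(topic_document)   # (corpus_size, num_topics)
    return topic_word_matrix, document_topic_matrix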
def reduceDimensionality(n_components=100):
    # import the csv into a pandas df
    df = pd.read_csv('data/gameData.csv')

    # Normalize the numeric columns to values in [0,1]
    numericColumns = ['maxPlayers', 'maxPlaytime', 'minAge', 'minPlayers', 'minPlaytime', 'playtime']
    colsToNormalize = []
    for col in numericColumns:
        if col in df.columns:
            colsToNormalize.append(col)
    df[colsToNormalize] = df[colsToNormalize].apply(lambda x: (x - x.min()) / (x.max() - x.min()) / 2)

    # Drop string columns
    colsToDrop = ['artists', 'categories', 'designers', 'families', 'publishers', 'mechanics', 'boardGameId', 'yearPublished']

    # Convert df to an array for NMF and store the board game id column to attach later
    boardGameIds = df['boardGameId']
    arr = df[[col for col in df.columns if col not in colsToDrop]].to_numpy()  # as_matrix() was removed from pandas
    arr = np.nan_to_num(arr)

    # Perform NMF with n_components dimensions
    model = NMF(n_components=n_components)
    W = model.fit_transform(arr)
    W = np.insert(W, 0, boardGameIds, axis=1)
    np.savetxt("data/reducedGameFeatures.csv", W, delimiter=",")
def extract_tfidf_nmf_feats(self, df_data, n_components): """ Extract tfidf features using nmf. """ df_feat = pd.DataFrame(index=range(df_data.shape[0])) tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english') tsvd = TruncatedSVD(n_components=n_components, random_state = 2016) nmf = NMF(solver='cd', n_components=n_components, init='nndsvda', random_state=0, tol=1e-3) df_data['q'].to_csv('q', index=False) df_data['t'].to_csv('t', index=False) df_data['d'].to_csv('d', index=False) print('fitting in tfidf') tfidf.set_params(input='filename') tfidf.fit(['q','t','d']) tfidf.set_params(input='content') for col in ['d', 't', 'q', 'b']: print('process column', col) txt = df_data[col] tfidf_mat = tfidf.transform(txt) nd_feat = nmf.fit_transform(tfidf_mat) tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \ for i in range(n_components)]) df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True) saveit(df_feat, 'df_tfidf_nmf_feats')
def nnMatrixFactorisation(data, labels, new_dimension):
    print("non negative matrix factorisation...")
    start = time.time()
    mf = NMF(n_components=new_dimension)
    reduced = mf.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def nmf_model2(n_topics,document_term_mat): # print("\n\n---------\n decomposition") nmf = NMF(n_components=n_topics, l1_ratio=0.0) W_sklearn = nmf.fit_transform(document_term_mat) H_sklearn = nmf.components_ # describe_nmf_results(document_term_mat, W_sklearn, H_sklearn) return W_sklearn, H_sklearn
def __Factorize_NMF(self, K):
    model = NMF(n_components=K, max_iter=self._iteration)
    # fit_transform both fits the model and returns the user factors,
    # so a separate fit() call beforehand is redundant
    user_fmat = model.fit_transform(self._mat)
    item_fmat = model.components_.T
    return user_fmat, item_fmat
def do_NMF(sparse_matrix): t0 = time.time() print("* Performing NMF on sparse matrix ... ") nmf = NMF(n_components=3) coordinates = nmf.fit_transform(sparse_matrix) print("done in %0.3fs." % (time.time() - t0)) return(coordinates)
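# Hedged usage sketch for do_NMF above (illustrative only; assumes the snippet's own imports of
# time and NMF are in scope). A small random non-negative sparse matrix stands in for the real input.
import numpy as np
from scipy import sparse

demo = sparse.csr_matrix(np.random.rand(20, 10))  # non-negative by construction
coords = do_NMF(demo)
print(coords.shape)  # (20, 3)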
def nmf_df(sym, k, coll):
    data = [item for item in coll.find({'text': {'$in': [re.compile(sym)]}})]
    sents = [sentence['text'] for sentence in data]
    dates = [str(text['created_at']) for text in data]
    d = np.array(dates).T
    d = d.reshape(len(dates), 1)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(sents)
    #features = vectorizer.get_feature_names()
    model = NMF(n_components=k, init='random', random_state=0)
    latent_features = model.fit_transform(X)
    # lat0 = list(latent_features[:,0])
    # lat1 = list(latent_features[:,1])
    # lat2 = list(latent_features[:,2])
    # lat3 = list(latent_features[:,3])
    df = pd.DataFrame(latent_features)  #np.concatenate((d, latent_features), axis=1)
    df.columns = ['lat' + str(n) for n in range(len(df.columns))]  # range, not the Python 2 xrange
    df['time_stamp'] = d
    #print df.head()
    df['date'] = pd.to_datetime(df['time_stamp']).dt.normalize()  # pd.datetools.normalize_date was removed
    df.pop('time_stamp')
    #print df.head()
    grouped_data = df.groupby(['date']).mean()
    grouped_data['sym'] = sym
    return grouped_data
def find_aspects(sentences, city, n_top_words=15): ''' INPUT sentences, city(str, lower case) OUTPUT aspects dictionary ''' vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english') document_term_mat = vectorizer.fit_transform(sentences) feature_words = vectorizer.get_feature_names() nmf = NMF(n_components=n_topics) W_sklearn = nmf.fit_transform(document_term_mat) H_sklearn = nmf.components_ important_words = [] for topic in H_sklearn: for i in topic.argsort()[:-n_top_words - 1:-1]: important_words.append(feature_words[i]) important_words = set(important_words) important_words = list(important_words) nouns = [] for i in sentences: nouns.extend(list(TextBlob(i).noun_phrases)) noun_list = list(set(filter(lambda x: (len(x.split(' '))>1)&('...' not in x.split(' ')), nouns))) aspects_dict = defaultdict(list) for i in important_words: if i not in [city, city.lower(),'okay','ok','thing','things','time','times','greasy','awful'] and TextBlob(i).tags[0][1] in ['NN', 'NNS']: for j in noun_list: if i in j.split(' '): aspects_dict[i].append(j) for i in aspects_dict: aspects_dict[i] = list(set(aspects_dict[i])) return aspects_dict
def extract_reconstruction_error_beats(comps, music_stft, beats): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) start = 0 errors = [] lookback = 0 weight = np.array([1 for i in range(2, music_stft.shape[0] + 2)]) weight = weight/np.max(weight) for i in range(lookback+1, len(beats)): block = music_stft[:, beats[i-(lookback+1)]:beats[i]] H = np.random.rand(K, block.shape[1]) W[:, 0:K] = comps params = {'W': W, 'H': H, 'update_W': False} comps_block = transformer.fit_transform(np.abs(block), **params) acts_block = transformer.components_ #reconstruct the signal block_reconstruction = comps_block.dot(acts_block) block_reconstruction = block_reconstruction.T*weight block = block.T*weight distance = norm(block_reconstruction - np.abs(block)) #errors.append(transformer.reconstruction_err_) errors.append(distance) return errors
def extract_template(comps, music_stft): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) H = np.random.rand(K, music_stft.shape[1]) #set W to be the template components you want to extract W[:, 0:K] = comps #don't let W get updated in the non-negative matrix factorization params = {'W': W, 'H': H, 'update_W': False} comps_music = transformer.fit_transform(np.abs(music_stft), **params) acts_music = transformer.components_ #reconstruct the signal music_reconstruction = comps_music.dot(acts_music) #mask the input signal music_stft_max = np.maximum(music_reconstruction, np.abs(music_stft)) mask = np.divide(music_reconstruction, music_stft_max) mask = np.nan_to_num(mask) #binary mask mask = np.round(mask) #template - extracted template, residual - everything that's leftover. template = np.multiply(music_stft, mask) residual = np.multiply(music_stft, 1 - mask) return template, residual
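# Hedged aside (not from the original source). Two notes on extract_template above: (1) the
# `update_W` keyword passed to fit_transform in these snippets is not part of stock scikit-learn's
# NMF, so they appear to rely on a patched NMF that can freeze W; (2) the function rounds the ratio
# to a binary mask, whereas keeping the soft (Wiener-like) mask often gives fewer musical-noise
# artifacts. A minimal soft-mask variant, reusing the same reconstruction, is sketched below.
import numpy as np

def soft_mask_separation(music_stft, music_reconstruction):
    music_stft_max = np.maximum(music_reconstruction, np.abs(music_stft))
    mask = np.nan_to_num(np.divide(music_reconstruction, music_stft_max))  # values in [0, 1]
    template = np.multiply(music_stft, mask)
    residual = np.multiply(music_stft, 1 - mask)
    return template, residual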
def extract_reconstruction_errors(comps, music_stft, window_length, hop): K = comps.shape[1] #initialize transformer (non-negative matrix factorization) with K components transformer = NMF(n_components = K, init = 'custom') #W and H are random at first W = np.random.rand(comps.shape[0], K) start = 0 errors = [] while (start + window_length < music_stft.shape[1]): block = music_stft[:, start:start+window_length] H = np.random.rand(K, block.shape[1]) W[:, 0:K] = comps params = {'W': W, 'H': H, 'update_W': False} comps_block = transformer.fit_transform(np.abs(block), **params) acts_block = transformer.components_ #reconstruct the signal block_reconstruction = comps_block.dot(acts_block) errors.append(transformer.reconstruction_err_) start = start + hop return errors
def doNMF(datan, n_components=4):
    # from Mitsu
    # alternatively PCA ... might be faster
    nmf = NMF(n_components=n_components, init='nndsvd')
    data_decomp_all = nmf.fit_transform(datan)
    data_components_all = nmf.components_
    return data_decomp_all, data_components_all
def get_LDA(X, num_components=10, show_topics=True):
    """ Latent Dirichlet Allocation-style topic extraction by NMF.
    21 Nov 2015, Keunwoo Choi

    LDA for a song-tag matrix. The motivation is the same as get_LSI.
    With NMF, it is easier to explain what each topic represents - by inspecting the 'H' matrix,
    where X ~= X' = W*H as a result of NMF.
    It is also good to have non-negative elements, which are straightforward to interpret for both W and H.
    """
    from sklearn.decomposition import NMF
    if X is None:
        print('X is omitted, so just assume it is the mood tag mtx w audio.')
        X = np.load(PATH_DATA + FILE_DICT["mood_tags_matrix"])  # np matrix, 9320-by-100
    nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400)  # 400 is too large, but it doesn't hurt.
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('=' * 60)
    print("NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_ / (X.shape[0] * X.shape[1])))
    term_rankings = []
    moodnames = cP.load(open(PATH_DATA + FILE_DICT["moodnames"], 'r'))  # list, 100
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index, :])[::-1][0:10]
        term_ranking = [moodnames[i] for i in top_indices]
        term_rankings.append(term_ranking)
        if show_topics:
            print("Topic %d: %s" % (topic_index, ", ".join(term_ranking)))
    print('=' * 60)
    cP.dump(term_rankings, open(PATH_DATA + (FILE_DICT["mood_topics_strings"] % num_components), 'w'))
    return W / np.max(W)  # return normalised matrix, [0, 1]
def _make_test_matrix(self, matrix, test_decomp='svd'):
    '''
    Input: a matrix
    Output: a recomposed estimated ratings matrix

    Decomposes the input matrix according to the decomposition type and then
    builds an estimated ratings matrix
    '''
    if test_decomp == 'svd':
        _, s1, V = svd(matrix)
        how = self.s_option
        how = self.test_how
        #print "s1", s1
        #print "how", how
        s = self._get_s(s1, how)
        #print s
        #print V
        #print self.matrix_1.U
        return np.dot(self.matrix_1.U, np.dot(s, V))
    elif test_decomp == 'nmf':
        model = NMF()
        H = model.fit_transform(matrix)
        print(H)
        W = model.components_
        return np.dot(self.matrix_1.H, W)
    else:
        pass
'''
def _fit_local(self, data): from sklearn.decomposition import NMF nmf = NMF(n_components=self.k, tol=self.tol, max_iter=self.max_iter, random_state=self.seed) w = nmf.fit_transform(data) return w, nmf.components_,
def nmf(matriztfxidf):
    nmf = NMF(n_components=50, init='random', random_state=0)
    matrizReduzida = nmf.fit_transform(matriztfxidf)  # w
    #h = nmf.components_  # h
    #resultado = np.dot(matrizReduzida, h)  # w.h -> approximately reconstructs the original matrix
    return matrizReduzida
def test_nmf_transform(): # Test that NMF.transform returns close values A = np.abs(random_state.randn(6, 5)) for solver in ('pg', 'cd'): m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0) ft = m.fit_transform(A) t = m.transform(A) assert_array_almost_equal(ft, t, decimal=2)
class Archetypes: ''' Archetypes: Performs NMF of order n on X and stores the result as attributes. Archetypes are normalized: cosine similarity a(i) @ a(i) = 1. Atributes: my_archetypes.n - order / number of archetypes my_archetypes.X - input matrix my_archetypes.model - NMF model my_archetypes.w - NMF w-matrix my_archetypes.h - NMF h-matrix my_archetypes.o - occupations x archetypes matrix (from w-matrix) my_archetypes.on - occupations x normalized archetypes matrix (from w-matrix) - SOCP number as index. my_archetypes.occ - occupations x normalized archetypes matrix - Occupation names as index my_archetypes.f - features x archetypes matrix (from h-matrix) my_archetypes.fn - features x normalized archetypes matrix ''' def __init__(self,X,n,norm = norm_dot): self.n = n self.X = X self.model = NMF(n_components=n, init='random', random_state=0, max_iter = 1000, tol = 0.0000001) self.w = self.model.fit_transform(self.X) self.o = pd.DataFrame(self.w,index=self.X.index) self.on = self.o.T.apply(norm).T self.occ = self.on.copy() self.occ['Occupations'] = self.occ.index # self.occ['Occupations'] = self.occ['Occupations'].apply(onet_socp_name) self.occ = self.occ.set_index('Occupations') self.h = self.model.components_ self.f = pd.DataFrame(self.h,columns=X.columns) self.fn =self.f.T.apply(norm).T self.plot_occupations_dic ={} self.plot_features_dic ={} def plot_features(self,fig_scale = (1,3.5),metric='cosine', method = 'single',vertical = False): ''' Plot Archetypes as x and features as y. Utilizes Seaborn Clustermap, with hierarchical clustering along both axes. This clusters features and archetypes in a way that visualizes similarities and diffferences between the archetypes. Archetypes are normalized (cosine-similarity): dot product archetype[i] @ archetype[i] = 1. The plot shows intensities (= squared feature coefficients) so that the sum of intensities = 1. fig_scale: default values (x/1, y/3.5) scales the axes so that all feature labels are included in the plot. For other hyperparameters, see seaborn.clustermap ''' param = (fig_scale,metric,method,vertical) if param in self.plot_features_dic.keys(): fig = self.plot_features_dic[param] return fig.fig df = np.square(self.fn) if vertical: fig = sns.clustermap(df.T,robust = True, z_score=1,figsize=( self.n/fig_scale[0],self.X.shape[1]/fig_scale[1]),method = method,metric = metric) else: # horizontal fig = sns.clustermap(df,robust = True, z_score=0,figsize=( self.X.shape[1]/fig_scale[1],self.n/fig_scale[0]),method = method,metric = metric) self.features_plot = fig return fig def plot_occupations(self,fig_scale = (1,3.5),metric='cosine', method = 'single',vertical = False): ''' Plot Archetypes as x and occupations as y. Utilizes Seaborn Clustermap, with hierarchical clustering along both axes. This clusters occupations and archetypes in a way that visualizes similarities and diffferences between the archetypes. Occupations are normalized (cosine-similarity): dot product occupation[i] @ occupation[i] = 1. The plot shows intensities (= squared feature coefficients) so that the sum of intensities = 1. fig_scale: default values (x/1, y/3.5) scales the axes so that all feature labels are included in the plot. 
For other hyperparameters, see seaborn.clustermap ''' param = (fig_scale,metric,method,vertical) if param in self.plot_occupations_dic.keys(): fig = self.plot_occupations_dic[param] #return return fig.fig df = np.square(self.occ) if vertical: fig = sns.clustermap(df, figsize=( self.n/fig_scale[0],self.X.shape[0]/fig_scale[1]),method = method,metric = metric) else: # horizontal fig = sns.clustermap(df.T, figsize=( self.X.shape[0]/fig_scale[1],self.n/fig_scale[0]),method = method,metric = metric) self.plot_occupations_dic[param] = fig #return return fig.fig
for comp in W_zero: comp[pitch-pitch_min_number] = 1.0 p = pitch + 12 while p < W_zero.shape[1] - 2: for epsilon in range(-2, 2): comp[p - pitch_min + epsilon] = 1.0 p += 12 H_zero = np.random.rand(V.shape[0], pitch_max - pitch_min_number) print V.shape from sklearn.decomposition import NMF model = NMF(init='custom', n_components=pitch_max-pitch_min_number) comps = model.fit_transform(V, W=H_zero, H=W_zero) acts = model.components_ #from librosa.decompose import decompose #comps, acts = decompose(V, n_components=n_components, sort=True) # visualisation matters import matplotlib.pyplot as plt from librosa.display import specshow import matplotlib.gridspec as gridspec plt.close('all') plt.subplot2grid((4, 2), (0,0), colspan=2) specshow(midi_mat, sr=sr, x_axis='time', y_axis='cqt_note')
def extract_components(mov_tot, n_components: int = 6, normalize_std: bool = True, max_iter_DL=-30, method_factorization: str = 'nmf', **kwargs) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ From optical flow images can extract spatial and temporal components Args: mov_tot: ndarray (can be 3 or 4D) contains the optical flow values, either in cartesian or polar, either one (3D) or both (4D coordinates) the input is generated by the compute_optical_flow function n_components: int number of components to look for normalize_std: bool whether to normalize each oof the optical flow components normalize_output_traces: boolean whether to normalize the behavioral traces so that they match the units in the movie Returns: spatial_filter: ndarray set of spatial inferred filters time_trace: ndarray set of time components norm_fact: ndarray used notmalization factors """ if mov_tot.ndim == 4: if normalize_std: norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3)) mov_tot = old_div(mov_tot, norm_fact[:, np.newaxis, np.newaxis, np.newaxis]) else: norm_fact = np.array([1., 1.]) c, T, d1, d2 = np.shape(mov_tot) else: norm_fact = 1 T, d1, d2 = np.shape(mov_tot) c = 1 tt = time.time() newm = np.reshape(mov_tot, (c * T, d1 * d2)) if method_factorization == 'nmf': nmf = NMF(n_components=n_components, **kwargs) time_trace = nmf.fit_transform(newm) spatial_filter = nmf.components_ spatial_filter = np.concatenate([ np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter ], axis=0) elif method_factorization == 'dict_learn': import spams newm = np.asfortranarray(newm, dtype=np.float32) time_trace = spams.trainDL(newm, K=n_components, mode=0, lambda1=1, posAlpha=True, iter=max_iter_DL) spatial_filter = spams.lasso(newm, D=time_trace, return_reg_path=False, lambda1=0.01, mode=spams.spams_wrap.PENALTY, pos=True) spatial_filter = np.concatenate([ np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter.toarray() ], axis=0) time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T] el_t = time.time() - tt print(el_t) return spatial_filter, time_trace, norm_fact
def find_model(dataset, train_size, problem, label="", datatype="numerical", dim_reduction=False, components="auto", contains_negative=True, ensembling=True, priority="accuracy"): if datatype != "nominal": # Label encode data to ensure everything is numeric print("Label encoding. . .") dataset = dataset.apply(LabelEncoder().fit_transform) # Reduce dimensionality of dataset if dim_reduction: print("Performing dimensionality reduction. . .") print("Features' shape before reduction is", X.shape) if contains_negative: # If dataset contains negative values, use principal component analysis if components == "auto": print( "Using default number of components for principal component analysis. . ." ) pca = PCA(n_components=2) else: print("Using", components, "components for principal component analysis. . .") pca = PCA(n_components=components) X = pca.fit_transform(X) else: # Otherwise, use non-negative matrix factorization if components == "auto": print( "Using default number of components for non-negative matrix factorization. . ." ) nmf = NMF(n_components=2) else: print("Using", components, "components for principal component analysis. . .") nmf = NMF(n_components=components) X = nmf.fit_transform(X) print("Features' shape after reduction is", X.shape) if problem != "clustering": # Split X and y into training and testing datasets print("Splitting datasets for training and testing. . .") X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=train_size) # Scale variables to standardize values print("Standardizing values. . .") sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) else: # Scale variables to standardize values print("Standardizing values. . .") sc = StandardScaler() X = sc.fit_transform(X) if priority == "accuracy": if problem == "classification": find_classification_model(X_train, X_test, y_train, y_test, priority="accuracy", ensembling=ensembling, datatype=datatype) elif problem == "regression": find_regression_model(X_train, X_test, y_train, y_test, ensembling=ensembling) elif problem == "clustering": find_clustering_model(X) if priority == "time": if problem == "classification": find_classification_model(X_train, X_test, y_train, y_test, priority="time", ensembling=ensembling, datatype=datatype) elif problem == "regression": find_regression_model(X_train, X_test, y_train, y_test, ensembling=ensembling) elif problem == "clustering": find_clustering_model(X)
class MFRecommender(BaseRecommender): """Matrix factorization recommender Uses Matrix Factorization to determine which pipeline to recommend. Args: n_components (int): Corresponds to the number of features to keep in matrix decomposition. Must be greater than the number of rows in matrix. r_minimum (int): The minimum number of past results this recommender needs in order to use Matrix Factorization for prediction. If not enough results are present during a ``predict``, a uniform recommender is used. """ def __init__(self, dpp_matrix, n_components=100, r_minimum=5): super(MFRecommender, self).__init__(dpp_matrix) self.n_components = n_components self.r_minimum = r_minimum # Matrix Factorization model that reduces dimensionality from num pipelines space to # n_components space. self.mf_model = NMF(n_components=n_components, init='nndsvd') dpp_decomposed = self.mf_model.fit_transform(dpp_matrix) # Matrix of rankings for each row of dpp_matrix after matrix facorization has been applied. self.dpp_ranked = np.empty(dpp_decomposed.shape) for i in range(dpp_decomposed.shape[0]): rankings = stats.rankdata( dpp_decomposed[i, :], method='dense' ) self.dpp_ranked[i, :] = rankings random_matching_index = np.random.randint(self.dpp_matrix.shape[0]) # Row from dpp_matrix representing pipeline performances for the dataset that most closely # matches the new dataset D. Identified in fit. self.matching_dataset = self.dpp_matrix[random_matching_index, :] def fit(self, dpp_vector): """ Finds row of self.dpp_matrix most closely corresponds to X by means of Kendall tau distance. https://en.wikipedia.org/wiki/Kendall_tau_distance Args: dpp_vector (np.array): Array with shape (n_components, ) """ # decompose X and generate the rankings of the elements in the # decomposed matrix dpp_vector_decomposed = self.mf_model.transform(dpp_vector) dpp_vector_ranked = stats.rankdata( dpp_vector_decomposed, method='dense', ) max_agreement_index = None max_agreement = -1 # min value of Kendall Tau agremment for i in range(self.dpp_ranked.shape[0]): # calculate agreement between current row and X agreement, _ = stats.kendalltau( dpp_vector_ranked, self.dpp_ranked[i, :], ) if agreement > max_agreement: max_agreement_index = i max_agreement = agreement if max_agreement_index is None: max_agreement_index = np.random.randint(self.dpp_matrix.shape[0]) # store the row with the highest agreement for prediction self.matching_dataset = self.dpp_matrix[max_agreement_index, :] def predict(self, indices): num_tried_candidates = len(np.where(self.dpp_vector != 0)[0]) if num_tried_candidates < self.r_minimum: return UniformRecommender(self.dpp_matrix).predict(indices) matching_scores = np.array( [self.matching_dataset[each] for each in indices] ) return stats.rankdata(matching_scores, method='dense')
test_encoded = encoder.predict(test_signal) train_encoded = encoder.predict(aggregate_signal) ################ # buid nmf model ################ alpha = 0.012 model = NMF(n_components = encoding_dim, init = 'random', max_iter=500, solver='cd') ################# # train nmf model ################# print('*'*5,'evaluate NMF','*'*5) W = model.fit_transform(train_encoded) train_error = disag_error(train_encoded,W,model.components_) print('train error: ', train_error) #################### # evaluate nmf model #################### W_test = model.transform(test_encoded) test_error = disag_error(test_encoded,W_test,model.components_) print('Test error: ', test_error) ################# # decoder outputs ################# decoded_signal = decoder.predict(W.dot(model.components_))
def NMF_TFIDF(): english_stemmer = Stemmer.Stemmer('en') class StemmedTfidfVectorizer(TfidfVectorizer): def build_analyzer(self): analyzer = super(TfidfVectorizer, self).build_analyzer() return lambda doc: english_stemmer.stemWords(analyzer(doc)) cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'] print("Loading 20 newsgroups dataset for categories:") pprint(list(cats)) newsgroups = fetch_20newsgroups(subset='all', categories = cats) print("%d documents" % len(newsgroups.data)) print("%d categories" % len(newsgroups.target_names)) print("Creating stemmed TFxIDF representation...") t0 = time() vect = StemmedTfidfVectorizer(stop_words='english') vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation print("Done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % vectors.shape) workbook = xlsxwriter.Workbook('part3_NMF.xlsx') purityMetricsNames = ['Homogeneity', 'Completeness', 'V-measure', 'Adjust Rand-Index', 'Adjusted Mutual Information Score'] metric_list = {} for i in range(1,21): print("Implementing NMF on data...") nmf_ = NMF(n_components=i) # nmf_data = nmf_.fit_transform(vectors) print("Done.") labels = newsgroups.target labels_2 = [] # Changing the labels from 0-7 to 0-1 for mark in labels: if mark <= 3: labels_2.append(0) else: labels_2.append(1) k = 2 km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1) print("Clustering sparse data with %s" % km) t0 = time() km.fit(nmf_data) print("done in %0.3fs" % (time() - t0)) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_2, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels_2, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_2, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels_2, km.labels_)) print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(labels_2, km.labels_)) print metrics.confusion_matrix(labels_2,km.labels_) purityMetrics = [metrics.homogeneity_score(labels_2, km.labels_), metrics.completeness_score(labels_2, km.labels_),metrics.v_measure_score(labels_2, km.labels_),metrics.adjusted_rand_score(labels_2, km.labels_),metrics.adjusted_mutual_info_score(labels_2, km.labels_)] # Writing to .xlsx file (For Confusion Matrix) worksheet = workbook.add_worksheet() obs = zip(km.labels_,labels_2) row = 0 col = 0 worksheet.write(row,col,'Predictions') worksheet.write(row,col+1,'Actuals') worksheet.write(row,col+6,'Dimension') worksheet.write(row+1,col+6,i) metric_list = dict(zip(purityMetricsNames,purityMetrics)) pprint(dict(metric_list)) for key in metric_list.keys(): row += 1 worksheet.write(row,col+11,key) worksheet.write(row,col+12,metric_list[key]) row = 0 col = 0 for pred, actual in (obs): row += 1 worksheet.write(row,col, pred) worksheet.write(row,col+1,actual) row = 1 for things in labels: worksheet.write(row,col+2,things) row += 1 workbook.close()
def NMF_2(): english_stemmer = Stemmer.Stemmer('en') class StemmedTfidfVectorizer(TfidfVectorizer): def build_analyzer(self): analyzer = super(TfidfVectorizer, self).build_analyzer() return lambda doc: english_stemmer.stemWords(analyzer(doc)) cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'] print("Loading 20 newsgroups dataset for categories:") pprint(list(cats)) newsgroups = fetch_20newsgroups(subset='all', categories = cats) print("%d documents" % len(newsgroups.data)) print("%d categories" % len(newsgroups.target_names)) print("Creating stemmed TFxIDF representation...") t0 = time() vect = StemmedTfidfVectorizer(stop_words='english') vectors = vect.fit_transform(newsgroups.data) # TFxIDF representation print("Done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % vectors.shape) workbook = xlsxwriter.Workbook('partC_NMF.xlsx') print("Implementing NMF of dimension 2 on data...") nmf_ = NMF(n_components=2) # alpha value? l1 value? nmf_data = nmf_.fit_transform(vectors) print("Done.") print("Implementing non-linear transform on data...") offset = 0.001 nmf_data_off=np.add(nmf_data,offset) log_nmfdata=np.log(nmf_data_off) print("Done.") labels = newsgroups.target labels_2 = [] # Changing the labels from 0-7 to 0-1 for mark in labels: if mark <= 3: labels_2.append(0) else: labels_2.append(1) k = 2 km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1) print("Clustering sparse data with %s" % km) t0 = time() km.fit(nmf_data) km.fit(log_nmfdata) print("done in %0.3fs" % (time() - t0)) # Transforming data back data2D = km.transform(nmf_data) data2D_logarithm = km.transform(log_nmfdata) plt.figure(1) plt.subplot(221) print("Plotting labels of Kmeans algorithm using NMF") plt.title('NMF Dim 2 Kmeans Algorithm with NMF') plt.scatter(nmf_data[:,0], nmf_data[:,1], c=km.labels_) plt.subplot(222) print("Plotting ground truth") plt.title('True labels of data') plt.scatter(nmf_data[:,0], nmf_data[:,1], c=labels_2) plt.subplot(223) print("Plotting labels of Kmeans algorithm with nonlinear transform NMF") plt.title('NMF Dim 2 Kmeans Algorithm Nonlinear transform') plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=km.labels_) plt.subplot(224) print("Plotting ground truth with nonlinear transform") plt.title('Ground truth, nonlinear transform') plt.scatter(log_nmfdata[:,0], log_nmfdata[:,1], c=labels_2) plt.show() print ("Done.")
[5,3,0,1], [4,0,0,1], [1,1,0,5], [1,0,0,4], [0,1,5,4], [5,3,0,0] ] R = np.array(R) N = len(R) M = len(R[0]) K = 2 P = np.random.rand(N,K) Q = np.random.rand(M,K) print('Simple matrix factorization') P, Q = matrix_factorization(R, P, Q, K) print(P) print(Q) print(P.dot(Q.T)) print('Non-negative matrix factorization') model = NMF(n_components=2, init='random', random_state=0) W = model.fit_transform(R) H = model.components_ print(W) print(H.T) print(W.dot(H))
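# Hedged follow-up sketch (not in the original source): compare the hand-rolled factorization and
# sklearn's NMF by their reconstruction error on the observed (non-zero) entries of R.
approx_mf = P.dot(Q.T)
approx_nmf = W.dot(H)
mask = R > 0  # only rate the observed entries
print('MF  error on observed entries:', np.sqrt(((R - approx_mf)[mask] ** 2).sum()))
print('NMF error on observed entries:', np.sqrt(((R - approx_nmf)[mask] ** 2).sum()))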
def _greedyROI(scan, num_components=200, neuron_size=(11, 11), num_background_components=1): """ Initialize components by searching for gaussian shaped, highly active squares. #one by one by moving a gaussian window over every pixel and taking the highest activation as the center of the next neuron. :param np.array scan: 3-dimensional scan (image_height, image_width, num_frames). :param int num_components: The desired number of components. :param (float, float) neuron_size: Expected size of the somas in pixels (y, x). :param int num_background_components: Number of components that model the background. """ from scipy import ndimage # Get some params image_height, image_width, num_frames = scan.shape # Get the gaussian kernel gaussian_stddev = np.array(neuron_size) / 4 # entire neuron in four standard deviations gaussian_kernel = _gaussian2d(gaussian_stddev) # Create residual scan (scan minus background) residual_scan = scan - np.mean(scan, axis=(0, 1)) # image-wise brightness background = ndimage.gaussian_filter(np.mean(residual_scan, axis=-1), neuron_size) residual_scan -= np.expand_dims(background, -1) # Create components masks = np.zeros([image_height, image_width, num_components], dtype=np.float32) traces = np.zeros([num_components, num_frames], dtype=np.float32) mean_frame = np.mean(residual_scan, axis=-1) for i in range(num_components): # Get center of next component neuron_locations = ndimage.gaussian_filter(mean_frame, gaussian_stddev) y, x = np.unravel_index(np.argmax(neuron_locations), [image_height, image_width]) # Compute initial trace (bit messy because of edges) half_kernel = np.fix(np.array(gaussian_kernel.shape) / 2).astype(np.int32) big_yslice = slice(max(y - half_kernel[0], 0), y + half_kernel[0] + 1) big_xslice = slice(max(x - half_kernel[1], 0), x + half_kernel[1] + 1) kernel_yslice = slice(max(0, half_kernel[0] - y), None if image_height > y + half_kernel[0] else image_height - y - half_kernel[0] - 1) kernel_xslice = slice(max(0, half_kernel[1] - x), None if image_width > x + half_kernel[1] else image_width - x - half_kernel[1] - 1) cropped_kernel = gaussian_kernel[kernel_yslice, kernel_xslice] trace = np.average(residual_scan[big_yslice, big_xslice].reshape(-1, num_frames), weights=cropped_kernel.ravel(), axis=0) # Get mask and trace using 1-rank NMF half_neuron = np.fix(np.array(neuron_size) / 2).astype(np.int32) yslice = slice(max(y - half_neuron[0], 0), y + half_neuron[0] + 1) xslice = slice(max(x - half_neuron[1], 0), x + half_neuron[1] + 1) mask, trace = _rank1_NMF(residual_scan[yslice, xslice], trace) # Update residual scan neuron_activity = np.expand_dims(mask, -1) * trace residual_scan[yslice, xslice] -= neuron_activity mean_frame[yslice, xslice] = np.mean(residual_scan[yslice, xslice], axis=-1) # Store results masks[yslice, xslice, i] = mask traces[i] = trace # Create background components residual_scan += np.mean(scan, axis=(0, 1)) # add back overall brightness residual_scan += np.expand_dims(background, -1) # and background if num_background_components == 1: background_masks = np.expand_dims(np.mean(residual_scan, axis=-1), axis=-1) background_traces = np.expand_dims(np.mean(residual_scan, axis=(0, 1)), axis=0) else: from sklearn.decomposition import NMF print("Warning: Fitting more than one background component uses scikit-learn's " "NMF and may take some time.""") model = NMF(num_background_components, random_state=123, verbose=True) flat_masks = model.fit_transform(residual_scan.reshape(-1, num_frames)) background_masks = 
flat_masks.reshape([image_height, image_width, -1]) background_traces = model.components_ return masks, traces, background_masks, background_traces
# IPython log file from __future__ import division import numpy as np from sklearn.decomposition import NMF nmf = NMF() from sklearn.datasets import load_iris iris = load_iris() NMF(iris.data) nmf.fit_transform(iris.data) nmf = NMF(n_components=2) fits = nmf.fit_transform(iris.data) fits len(fits) fits[:5] nmf.reconstruction_err_ exit()
def nmf_on_data(vectorizer, uk_transcripts, topics, num_top_words):
    trans_vectorized = vectorizer.fit_transform(uk_transcripts)
    nmf_model = NMF(topics)
    doc_topic = nmf_model.fit_transform(trans_vectorized)
    print('explained variance: ', get_score(nmf_model, trans_vectorized.toarray()))
    return doc_topic, display_topics(nmf_model, vectorizer.get_feature_names(), num_top_words)
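# get_score is not shown in this file; a plausible (hypothetical) implementation, sketched here under
# that assumption, treats "explained variance" as the fraction of squared norm captured by the NMF
# reconstruction: 1 - ||X - WH||^2 / ||X||^2.
import numpy as np

def get_score(model, data):
    W = model.transform(data)
    reconstruction = W.dot(model.components_)
    residual = np.linalg.norm(data - reconstruction) ** 2
    total = np.linalg.norm(data) ** 2
    return 1.0 - residual / total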
def calculate_topics(features, n_topics): # http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html nmf = NMF(n_components=n_topics) return nmf, nmf.fit_transform(features)
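# Hedged usage sketch for calculate_topics above (the documents and vectorizer here are illustrative,
# not from the original source).
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["topic modelling with nmf", "nmf factorizes a document term matrix",
        "unrelated text about cooking pasta", "pasta recipes and cooking"]
features = TfidfVectorizer(stop_words='english').fit_transform(docs)
nmf_model, doc_topics = calculate_topics(features, n_topics=2)
print(doc_topics.shape)  # (4, 2): one topic-weight row per document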
# LSA # http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#example-text-document-clustering-py from sklearn.decomposition import TruncatedSVD from sklearn.preprocessing import Normalizer from sklearn.pipeline import make_pipeline svd = TruncatedSVD(dim) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) sklearn_tfidf_train_svd = lsa.fit_transform(sklearn_tfidf_train) sklearn_tfidf_test_svd = lsa.fit_transform(sklearn_tfidf_test) sklearn_tf_train_svd = lsa.fit_transform(sklearn_tf_train) sklearn_tf_test_svd = lsa.fit_transform(sklearn_tf_test) from sklearn.decomposition import NMF nmfModel = NMF(n_components=dim, init='random', random_state=0) sklearn_tfidf_train_nmf = nmfModel.fit_transform(sklearn_tfidf_train) sklearn_tfidf_test_nmf = nmfModel.fit_transform(sklearn_tfidf_test) sklearn_tf_train_nmf = nmfModel.fit_transform(sklearn_tf_train) sklearn_tf_test_nmf = nmfModel.fit_transform(sklearn_tf_test) #sklearn_tf_train_nmf = nmfModel.fit_transform(sklearn_tfidf_train) #%% 4. Traing LDA Model and Vectoring # Train model #from gensim.models.ldamodel import LdaModel #lda = LdaModel(corpus=train_corpus_tfidf, id2word=dictionary, num_topics=dim)#, alpha=alpha) # ## Vectorizing #trainTopicDistArr = lda.inference(train_corpus_tfidf)[0] #testTopicDistArr = lda.inference(test_corpus_tfidf)[0]
def nmf(X, K):
    # assumes a module-level `normalizer` is available, e.g. sklearn.preprocessing.Normalizer()
    nmf = NMF(n_components=K)
    X_red = nmf.fit_transform(X)
    X_red = normalizer.fit_transform(X_red)
    return X_red
def generate_clustering(loom, layername, clustering_depth=3, starting_clustering_depth=0, max_clusters='sqrt_rule', mode='pca', silhouette_threshold=0.1, clusteringcachedir='clusteringcachedir/'): """ Parameters ---------- loom : clustering_depth : (Default value = 3) starting_clustering_depth : (Default value = 0) max_clusters : (Default value = 200) layername : mode : (Default value = 'pca') silhouette_threshold : (Default value = 0.1) clusteringcachedir : (Default value = 'clusteringcachedir/') Returns ------- """ if type(clustering_depth) != int or clustering_depth < 1 or type( starting_clustering_depth) != int: raise Exception( "clustering_depth and starting_clustering_depth must be natural numbers." ) if (starting_clustering_depth > 0) and ( 'ClusteringIteration{}'.format(starting_clustering_depth - 1) not in loom.ca.keys()): raise Exception( "starting_clustering_depth not yet computed; please run with lower starting_clustering depth, or 0" ) if mode not in ['pca', 'nmf']: raise Exception("Currently only implemented for modes: pca and nmf") from time import time from sklearn.decomposition import IncrementalPCA from tqdm import tqdm from panopticon.analysis import get_subclustering if mode == 'pca': from sklearn.decomposition import PCA elif mode == 'nmf': from sklearn.decomposition import NMF if starting_clustering_depth == 0: if mode == 'nmf': n_nmf_cols = loom.attrs['NumberNMFComponents'] nmf_loadings = [] for col in [ '{} NMF Loading Component {}'.format(layername, x) for x in range(1, n_nmf_cols + 1) ]: nmf_loadings.append(loom.ca[col]) X = np.vstack(nmf_loadings).T elif mode == 'pca': n_pca_cols = loom.attrs['NumberPrincipalComponents_{}'.format( layername)] pca_loadings = [] for col in [ '{} PC {} Loading'.format(layername, x) for x in range(1, n_pca_cols + 1) ]: pca_loadings.append(loom.ca[col]) X = np.vstack(pca_loadings).T if max_clusters == 'sqrt_rule': clustering = get_subclustering( X, silhouette_threshold, max_clusters=int(np.floor(np.sqrt(X.shape[0]))), clusteringcachedir=clusteringcachedir ) # This shouldn't be hard-coded S Markson 9 June 2020 else: clustering = get_subclustering( X, silhouette_threshold, max_clusters=max_clusters, clusteringcachedir=clusteringcachedir ) # This shouldn't be hard-coded S Markson 9 June 2020 loom.ca['ClusteringIteration0'] = clustering starting_clustering_depth = 1 for subi in range(starting_clustering_depth, clustering_depth): loom.ca['ClusteringIteration{}'.format(subi)] = ['U'] * len( loom.ca['ClusteringIteration{}'.format(subi - 1)]) for cluster in set([ x for x in loom.ca['ClusteringIteration{}'.format(subi - 1)] if x != 'U' ]): #will need to fix mask = loom.ca['ClusteringIteration{}'.format( subi - 1)] == cluster #first mask, check for top level clustering #break start = time() data_c = loom[layername][:, mask] print("processing cluster", cluster, "; time to load: ", time() - start, ", mask size: ", np.sum(mask)) if mode == 'nmf': model = NMF(n_components=np.min([50, data_c.shape[1]]), init='random', random_state=0) X = model.fit_transform(data_c.T) elif mode == 'pca': data_c = data_c.T if data_c.shape[0] > 5000: model = IncrementalPCA(n_components=10) for chunk in tqdm( np.array_split(data_c, data_c.shape[0] // 512, axis=0), desc='partial fitting over chunks of masked data'): model.partial_fit(chunk) X = model.transform(data_c) print("EV", model.explained_variance_) print("EVR", model.explained_variance_ratio_) else: model = PCA(n_components=np.min([10, data_c.shape[0]]), random_state=0) X = model.fit_transform(data_c) print("EV", 
model.explained_variance_) print("EVR", model.explained_variance_ratio_) if max_clusters == 'sqrt_rule': print("xshape", X.shape) nopath_clustering = get_subclustering( X, silhouette_threshold, max_clusters=int(np.floor(np.sqrt(X.shape[0]))), clusteringcachedir=clusteringcachedir ) # This shouldn't be hard-coded S Markson 9 June 2020 else: nopath_clustering = get_subclustering( X, silhouette_threshold, max_clusters=max_clusters, clusteringcachedir=clusteringcachedir ) # This shouldn't be hard-coded S Markson 9 June 2020 # nopath_clustering = get_subclustering(X, score_threshold=silhouette_threshold) #Really shouldn't be hard-coded S Markson 9 June 2020 fullpath_clustering = [ '{}-{}'.format(cluster, x) for x in nopath_clustering ] loom.ca['ClusteringIteration{}'.format( subi)][mask] = fullpath_clustering loom.ca['ClusteringIteration{}'.format(subi)] = loom.ca[ 'ClusteringIteration{}'.format( subi)] #This is to force the changes to save to disk
# get sparse representation X_sparse = sparse.csr_matrix((y_train, (X_train[:, 0], X_train[:, 1])), shape=(len(portfolio_list), len(investment_list))) model = NMF(n_components=3, init='random', solver='cd', beta_loss='frobenius', max_iter=200, tol=0.0001, alpha=0, l1_ratio=0, random_state=0, verbose=0, shuffle=False) W = model.fit_transform(X_sparse) H = model.components_ # # Test model import random len(X_test) # Since our test set also contains only positive examples, we want to add some zero values: we randomly generate a pair (user, item) and, if there isn't a position for it, we add it with rating zero # EXTRA TODO: this function is super slow! find a faster way so the following line can be decommented # l = int(len(X_test) * 1.5) l = len(X_test) + 500 t1 = time.time() while len(X_test) < l:
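# Hedged sketch addressing the "EXTRA TODO" above (not in the original source; the loop body after the
# truncated `while` header is not shown here). A faster way to draw negative samples is to keep the
# already-rated pairs in a set so membership checks are O(1), then draw random (user, item) pairs until
# enough unseen ones are found. Variable names mirror the surrounding code but the exact structure of
# X_test is an assumption.
seen_pairs = set(zip(X_train[:, 0], X_train[:, 1]))  # in practice, also add the pairs already in X_test
needed = 500
negatives = []
while len(negatives) < needed:
    u = random.randrange(len(portfolio_list))
    i = random.randrange(len(investment_list))
    if (u, i) not in seen_pairs:
        seen_pairs.add((u, i))
        negatives.append((u, i, 0))  # rating zero for an unseen (user, item) pair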
def generate_nmf_and_loadings(loom, layername, nvargenes=2000, n_components=100, verbose=False): """ Parameters ---------- loom : layername : nvargenes : (Default value = 2000) n_components : (Default value = 100) verbose : (Default value = False) Returns ------- """ from sklearn.decomposition import NMF if 'GeneVar' not in loom.ra.keys(): raise Exception( "Necessary to have already generated gene expression variances") vargenemask = loom.ra['GeneVar'] > np.sort( loom.ra['GeneVar'])[::-1][nvargenes] X = loom[layername][vargenemask, :] model = NMF(n_components=n_components, init='random', random_state=0, verbose=verbose) W = model.fit_transform(X) H = model.components_ # record NMF basis nmf_factors = [] counter = 0 for isvargene in vargenemask: if isvargene: nmf_factors.append(W[counter, :]) counter += 1 else: nmf_factors.append(np.zeros(W.shape[1])) nmf_factors = np.vstack(nmf_factors) factor_sums = [] for i in range(nmf_factors.shape[1]): loom.ra['{} NMF Component {}'.format( layername, i + 1)] = nmf_factors[:, i] / np.sum(nmf_factors[:, i]) factor_sums.append(np.sum(nmf_factors[:, i])) factor_sums = np.array(factor_sums) # record NMF loadings for i in range(H.shape[0]): loom.ca['{} NMF Loading Component {}'.format( layername, i + 1)] = H[i, :] * factor_sums[i] loom.attrs['NumberNMFComponents'] = n_components
def display_topics(model, feature_names, no_top_words, topic_names=None):
    topic_list = []
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            topics = "Topic: " + str(ix)
        else:
            topics = "Topic: " + str(topic_names[ix])
        terms = ", ".join(
            [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        topic_list.append((topics, terms))
    return topic_list

nmf_model = NMF(n_components=n_components, random_state=42)
start = time.time()
doc_topic = nmf_model.fit_transform(tlj_tfidf)
end = time.time()
nmf_topics = display_topics(nmf_model, tfidf.get_feature_names(), 20)
pprint(nmf_topics)
print("Model Execution Time:", end - start)
pickle.dump(
    nmf_topics,
    open('../../data/pickles/nmf/nmf_topics_' + str(n_components) + '.pkl', 'wb'))
pickle.dump(
    nmf_model,
    open('../../data/pickles/nmf/nmf_model_' + str(n_components) + '.pkl', 'wb'))
class MyIndividual(_Individual):
    element_class = _Chromosome

    def get_neighbour(self):
        # select a neighbour randomly
        cpy = self.clone(fitness=None)
        cpy.chromosomes = [chromosome.random_neighbour() for chromosome in self.chromosomes]
        return cpy

from sklearn.decomposition import NMF
nmf = NMF(n_components=c)
W = nmf.fit_transform(evaluate.M)
H = nmf.components_
err = -evaluate(W, H.T)

i = MyIndividual.random(sizes=(c,) * N + (p,) * c)
j = i.clone()
data = i.get_history(stat={'Error': lambda i: -i.fitness}, n_iter=200)
yourdata = j.get_history(stat={'Error': lambda i: -i.fitness}, n_iter=200)

import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(200), yourdata['Error'], 'bo',
        np.arange(200), data['Error'], 'r+',
        [0, 200], [err, err], 'k--')
ax.legend(('My Error', 'Your Error', 'EM Error'))
plt.show()
S = mglearn.datasets.make_signals() plt.figure(figsize=(11, 2)) plt.plot(S, '-') plt.xlabel("Time") plt.ylabel("Signal") plt.tight_layout() plt.show() A = np.random.RandomState(0).uniform(size=(100, 3)) X = np.dot(S, A.T) print("Shape of measurements: {}".format(X.shape)) nmf = NMF(n_components=3, random_state=42) S_ = nmf.fit_transform(X) print("Recovered signal shape: {}".format(S_.shape)) pca = PCA(n_components=3) H = pca.fit_transform(X) models = [X, S, S_, H] names = ['Observations (first three measurements)', 'True sources', 'NMF recovered signals', 'PCA recovered signals'] fig, axes = plt.subplots(4, figsize=(12, 6), gridspec_kw={'hspace': .5}, subplot_kw={'xticks': (), 'yticks': ()}) for model, name, ax in zip(models, names, axes):
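    # (sketch of the truncated loop body, an assumption based on the standard mglearn signals
    # example: plot the first three columns of each matrix under its title)
    ax.set_title(name)
    ax.plot(model[:, :3], '-')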
print() def get_top_words_by_loadings(H, features): return features[np.argsort(np.sum(H, axis=0))[::-1][:30]] if __name__ == "__main__": # Load data df_ml = pd.read_csv(ML_ONLY_FILEPATH, encoding='utf-8') tfidf_vectorizer = TfidfVectorizer(stop_words='english') tfidf_ml = tfidf_vectorizer.fit_transform(df_ml['description']) features = np.array(tfidf_vectorizer.get_feature_names()) nmf_model = NMF(n_components=10, random_state=42) W = nmf_model.fit_transform(tfidf_ml) H = nmf_model.components_ # This model specifically returns roughly these topics: hand_labeled_features = [ 'machine learning / time series', 'optimization', 'neural networks / deep learning', 'reinforcement learning', 'bayesian', 'graphs / graph ML', 'Generative adversarial networks', 'image classification', 'clustering', 'optimal solutions' ]
int(len(R) / 5)].fillna(0) R[3 * int(len(R) / 5):4 * int(len(R) / 5)] = R[3 * int(len(R) / 5):4 * int(len(R) / 5)].fillna(0) R[4 * int(len(R) / 5):] = R[4 * int(len(R) / 5):].fillna(0) # Delete dataframes to free memory del ratings, movies, tags, titles, flipped, duplicates, ratings_final # Progress indicator print('Calculating collaborative filter model. Time passed in seconds: ', time.perf_counter() - start_time) # NMF for collaborative filtering # train model collab_model = NMF(n_components=60, init="nndsvd") transformed_R = collab_model.fit_transform(R) collab_matrix = pd.DataFrame(transformed_R, index=R.index) # Export collab_matrix pickle.dump(collab_matrix, open('binaries/collab_matrix', 'wb')) del R, collab_model, transformed_R, collab_matrix # Progress indicator print('Calculating content-based filter model. Time passed in seconds: ', time.perf_counter() - start_time) # Tfidf vectorization of the tag strings for content-based filtering tfidf = TfidfVectorizer(stop_words='english') tfidf_matrix = tfidf.fit_transform(tags_final['tags']) # Reduce dimensionality with SVD
'''

path = "data/markets_new.csv"
weights_paths = ["results_1/all_weights_5.npy"]

df = pd.read_csv(path)
# Drop rows with at least one missing value
df_cut_nan = df.dropna()
df_cut_nan = df_cut_nan.drop("Date", axis=1)

scaler = MinMaxScaler()
df_cut_nan_min_max = scaler.fit_transform(df_cut_nan)

nmf = NMF(n_components=3, l1_ratio=1)
transformed = nmf.fit_transform(df_cut_nan_min_max)
components = nmf.components_
assign_trend = np.argmax(components, axis=0)

for weights_path in weights_paths:
    all_weights = np.load(weights_path)
    all_trends = []
    for weights in all_weights:
        trends = [0, 0, 0]
        for i, weight in enumerate(weights):
            trend = assign_trend[i]
            trends[trend] += weight
        all_trends.append(trends)

    df = pd.DataFrame(all_trends)
    df.columns = ["Trend 1", "Trend 2", "Trend 3"]
def main(test):
    if test == 'True':
        nrows = test_rows
        sys.stdout.flush()
    else:
        nrows = None

    file_name = PROJECT_DIR + '/data/processed/train.csv'
    extract_q1 = ColumnExtractor(file_name, Q_WORD_TOKENIZED[0], nrows=nrows)
    extract_q2 = ColumnExtractor(file_name, Q_WORD_TOKENIZED[1], nrows=nrows)
    q_stacker = ColumnStacker()
    pipeline = FeatureUnion([('extract_q1', extract_q1),
                             ('extract_q2', extract_q2)], n_jobs=2)
    pipeline = Pipeline([('question_extractor', pipeline),
                         ('q_stacker', q_stacker)])

    D_q1 = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                       index_col='id',
                       usecols=['id', Q_WORD_TOKENIZED[0]],
                       nrows=nrows)
    nrows = len(D_q1)
    q1 = D_q1.loc[:, Q_WORD_TOKENIZED[0]].apply(
        lambda l: ' '.join(literal_eval(l)))
    del D_q1

    D_q2 = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                       index_col='id',
                       usecols=['id', Q_WORD_TOKENIZED[1]],
                       nrows=nrows)
    q2 = D_q2.loc[:, Q_WORD_TOKENIZED[1]].apply(
        lambda l: ' '.join(literal_eval(l)))
    del D_q2

    all_questions = q1.append(q2)
    all_questions.index = range(len(all_questions))

    t = TfidfVectorizer(max_df=.95,
                        min_df=2,
                        stop_words='english',
                        max_features=max_tfidf_features,
                        ngram_range=(1, 2))
    t0 = time.clock()
    print('VECTORIZING...')
    tfidf = t.fit_transform(all_questions.values)
    print('Time: ', time.clock() - t0)
    joblib.dump(t, PROJECT_DIR + '/models/' + 'tfidf.pkl')

    nmf_tfidf = NMF(n_components=n_components, init='nndsvda')
    print('NMF tfidf...')
    t0 = time.clock()
    W = nmf_tfidf.fit_transform(tfidf)
    print('Time: ', time.clock() - t0)
    joblib.dump(nmf_tfidf, PROJECT_DIR + '/models/' + 'nmf_tfidf.pkl')
    W = np.abs(W[:nrows, :] - W[nrows:, :])

    D = pd.read_csv(
        PROJECT_DIR + '/data/processed/train.csv',
        index_col='id',
        usecols=['id', MASI_DISTANCE, JACCARD_DISTANCE, EDIT_DISTANCE],
        dtype=np.float64,
        nrows=nrows)
    Dist = D.values
    del D

    D = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                    index_col='id',
                    usecols=['id'] + Q_TYPE1,
                    dtype='object',
                    nrows=nrows)
    D = D.loc[:, Q_TYPE1].applymap(literal_eval)
    T = np.hstack(
        (np.vstack(D.loc[:, Q_TYPE1[0]]), np.vstack(D.loc[:, Q_TYPE1[1]])))
    # nmf_T = NMF(n_components = 25, init = 'nndsvda')
    # print('NMF T...')
    # t0 = time.clock()
    # T = nmf_T.fit_transform(T)
    # print('Time: ', time.clock() - t0)
    # joblib.dump(nmf_T, PROJECT_DIR + '/models/' + 'nmf_T.pkl')
    del D

    X = np.hstack((W, Dist, T))

    D = pd.read_csv(PROJECT_DIR + '/data/processed/train.csv',
                    index_col='id',
                    usecols=['id', 'is_duplicate'],
                    dtype=np.float64,
                    nrows=nrows)
    y = D.values.ravel()

    param_dist = {
        'max_depth': range(15),
        'learning_rate': np.logspace(-3, 1, 50),
        'subsample': [0.3, 0.5, 0.7, 1.0],
        'n_estimators': [250, 400, 700, 1000, 1500, 2000],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': np.logspace(-2, 2, 50),
    }
    cv = RandomizedSearchCV(XGBClassifier(nthread=2),
                            param_dist,
                            scoring='neg_log_loss',
                            n_jobs=4)
    print('Fitting xgb')
    cv.fit(X, y)
    clf = cv.best_estimator_
    joblib.dump(clf, PROJECT_DIR + '/models/' + 'cv_xgb.pkl')
    return
Yres = y_train - corrected_zero[0]
qx_init = np.random.normal(0, 1, size=[N, Q])

if PCA_INIT:
    ols = LinearRegression(fit_intercept=False)
    ols.fit(z_init, y_train)
    Yres = y_train - ols.predict(z_init)
    if SPARSE:
        pca = NMF(n_components=Q)
        #pca = PCA(n_components = Q)
    else:
        pca = PCA(n_components=Q)
    qx_init = pca.fit_transform(Yres)  #.tocsr())
    qx_init = 15. * qx_init / np.max(qx_init)
    #print(np.max(qx_init))

if args.gpy_init:
    m2 = GPy.models.GPLVM(Yres, Q)
    m2.optimize()
    qx_init = m2['latent_mean'][:]

# Initialize lengths with max distance across dimensions
len_var_init = np.abs(np.max(qx_init, axis=0) - np.mean(qx_init, axis=0))
len_con_init = np.ones(z_init.shape[1])
sig_init = 0.5
if args.gpy_init:
    len_var_init = m2['sum.rbf.lengthscale'] * np.ones(Q)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=5000,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(joke_df.tokenized_joke)

nmf = NMF(n_components=100, random_state=1, alpha=.1, l1_ratio=.5)
nmf_matrix = nmf.fit_transform(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 20)

topic_descriptions = [[
    tfidf_feature_names[i] for i in topic.argsort()[:-25 - 1:-1]
] for topic in nmf.components_]

feature_nn = NearestNeighbors(n_neighbors=5).fit(nmf_matrix)

with open('fit_models/nmf.p', 'wb') as fp:
    pickle.dump((nmf_matrix, topic_descriptions, feature_nn), fp)
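# Hypothetical usage sketch (not in the original file): embed a new joke with
# the fitted vectorizer and NMF model, then look up its nearest neighbours in
# topic space. `new_joke` is an assumed raw string.
new_vec = nmf.transform(tfidf_vectorizer.transform([new_joke]))
distances, indices = feature_nn.kneighbors(new_vec)
for dist, idx in zip(distances[0], indices[0]):
    print(dist, joke_df.tokenized_joke.iloc[idx])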
class TopicModel:
    def __init__(self, topicCollection, string):
        if string.lower() == "nmf":
            self.model = "NMF"
            print("Topic Extraction Model: sklearn.NMF")
        else:
            self.model = "LDA"
            print("Topic Extraction Model: gensim.LDAModel")
        self.stemmer = PorterStemmer()

    # Train the topic model (NMF or LDA) on the current discussion
    def train(self, sentences):
        if self.model == "NMF":
            self.sentenceData = []
            for sentence in sentences:
                self.sentenceData.append(preprocess(sentence, self.stemmer))
            self.tfidf_vectorizer = TfidfVectorizer(max_features=1500,
                                                    ngram_range=(1, 2),
                                                    preprocessor=' '.join,
                                                    stop_words='english')
            tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData)
            self.nmf = NMF(n_components=2, solver="mu")
            self.W = self.nmf.fit_transform(tfidf)
            self.H = self.nmf.components_
        else:
            sentenceData = []
            for sentence in sentences:
                sentenceData.append(preprocess(sentence, self.stemmer))
            self.dictionary = Dictionary(sentenceData)
            bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData]
            self.lda_model = LdaModel(bow_corpus,
                                      num_topics=2,
                                      id2word=self.dictionary,
                                      passes=10)

    # Classify a given sentence to one of the topics found in training
    def classify(self, sentence):
        if self.model == "NMF":
            index = self.sentenceData.index(preprocess(sentence, self.stemmer))
            topic = self.W.argmax(axis=1)[index]
            return "Topic " + str(topic)
        else:
            bow_vector = self.dictionary.doc2bow(
                preprocess(sentence, self.stemmer))
            return "Topic " + str(
                sorted(self.lda_model[bow_vector],
                       key=lambda tup: -1 * tup[1])[0][0])

    # Shows the terms of a given topic
    def showTerms(self, topic):
        if self.model == "NMF":
            terms = ""
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [
                        tfidf_feature_names[i] for i in top_features_ind
                    ]
                    weights = topicID[top_features_ind]
            for term in top_features:
                terms += term + ", "
            print(topic.split(' ')[-1] + " " + terms)
            return terms
        else:
            terms = ""
            topic = int(topic.split(" ")[-1])
            for term in self.lda_model.show_topic(topic):
                terms += term[0] + ", "
            print(str(topic) + " " + terms)
            return terms

    # Gets the probability or the coefficient of the given term in the topic
    def getCoeff(self, topic, term):
        if self.model == "NMF":
            weights = []
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [
                        tfidf_feature_names[i] for i in top_features_ind
                    ]
                    weights = topicID[top_features_ind]
            for coeff, terms in zip(weights, top_features):
                if terms == term:
                    return coeff
        else:
            topic = int(topic.split(" ")[-1])
            for terms in self.lda_model.show_topic(topic):
                if terms[0] == term:
                    return terms[1]

    # Shows all the topics found in training
    def showTopics(self):
        if self.model == "NMF":
            ret = []
            for topic_idx, topicID in enumerate(self.H):
                ret.append("Topic " + str(topic_idx))
            return ret
        else:
            topics = self.lda_model.print_topics()
            ret = []
            for topic in topics:
                ret.append("Topic " + str(topic[0]))
            return ret

    # Returns a flag to check what model is deployed at the moment
    def getModel(self):
        return self.model
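# Hypothetical usage sketch (not part of the original class): train on a small
# discussion and query it. It assumes `preprocess` and the imports used above
# (PorterStemmer, TfidfVectorizer, NMF, gensim's Dictionary/LdaModel) are
# available in scope; the sentences below are made up for illustration.
sentences = ["We tuned the learning rate of the network.",
             "The optimizer converged after a few epochs.",
             "The weather was nice during the meeting."]
tm = TopicModel(None, "nmf")
tm.train(sentences)
print(tm.showTopics())
print(tm.classify(sentences[0]))
print(tm.showTerms("Topic 0"))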
def test_custom_nmf(self):
    mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                    [1, 0, 0, 0], [0, 0, 1, 0]], dtype=np.float64)
    mat[:mat.shape[1], :] += np.identity(mat.shape[1])

    mod = NMF(n_components=2, max_iter=2)
    W = mod.fit_transform(mat)
    H = mod.components_

    def predict(W, H, row_index, col_index):
        return np.dot(W[row_index, :], H[:, col_index])

    pred = mod.inverse_transform(W)
    exp = []
    got = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            exp.append((i, j, pred[i, j]))
            got.append((i, j, predict(W, H, i, j)))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
    assert max_diff <= 1e-5

    def nmf_to_onnx(W, H):
        """
        Converts an NMF described by matrices *W*, *H* (*W H* approximates
        the training data *M*) into a function which takes two indices
        *(i, j)* and returns the prediction for them. It assumes these
        indices apply to the training data.
        """
        col = OnnxArrayFeatureExtractor(H, 'col')
        row = OnnxArrayFeatureExtractor(W.T, 'row')
        dot = OnnxMul(col, row, op_version=TARGET_OPSET)
        res = OnnxReduceSum(dot, output_names="rec",
                            op_version=TARGET_OPSET)
        indices_type = np.array([0], dtype=np.int64)
        onx = res.to_onnx(inputs={'col': indices_type,
                                  'row': indices_type},
                          outputs=[('rec', FloatTensorType((None, 1)))])
        return onx

    model_onnx = nmf_to_onnx(W.astype(np.float32), H.astype(np.float32))
    sess = InferenceSession(model_onnx.SerializeToString())

    def predict_onnx(sess, row_indices, col_indices):
        res = sess.run(None, {'col': col_indices, 'row': row_indices})
        return res

    onnx_preds = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            row_indices = np.array([i], dtype=np.int64)
            col_indices = np.array([j], dtype=np.int64)
            pred = predict_onnx(sess, row_indices, col_indices)[0]
            onnx_preds.append((i, j, pred[0, 0]))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
    assert max_diff <= 1e-5
"""
Now use what you've learned about NMF to decompose the digits dataset. You are
again given the digit images as a 2D array samples. This time, you are also
provided with a function show_as_image() that displays the image encoded by
any 1D array:

    def show_as_image(sample):
        bitmap = sample.reshape((13, 8))
        plt.figure()
        plt.imshow(bitmap, cmap='gray', interpolation='nearest')
        plt.colorbar()
        plt.show()

After you are done, take a moment to look through the plots and notice how NMF
has expressed the digit as a sum of the components!
"""

# Import NMF
from sklearn.decomposition import NMF

# Create an NMF model: model
model = NMF(n_components=7)

# Apply fit_transform to samples: features
features = model.fit_transform(samples)

# Call show_as_image on each component
for component in model.components_:
    show_as_image(component)

# Assign the 0th row of features: digit_features
digit_features = features[0, :]

# Print digit_features
print(digit_features)
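# Optional check (not part of the original exercise): a digit is reconstructed
# as the weighted sum of the NMF components, with digit_features as weights,
# so the result should resemble show_as_image(samples[0]).
reconstruction = digit_features @ model.components_
show_as_image(reconstruction)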