def apply(self, X, k=2):
    """Apply NMF to the specified document-term matrix X.

    Builds a ``nimfa.Lsnmf`` model from this object's settings
    (``max_iters``, ``init_strategy``, ``update``, ``test_conv``), runs it,
    and stores the results on the instance.

    :param X: Document-term matrix to factorize.
    :param k: Factorization rank passed to nimfa as ``rank``.

    Side effects: sets ``self.W`` and ``self.H`` (reset to ``None`` before
    the run), and ``self.n_iter`` to the iteration count reported by the fit.
    """
    import nimfa

    self.W = None
    self.H = None
    # The "euclidean" update is paired with the Frobenius objective; any
    # other update rule uses the divergence objective.
    objective = "fro" if self.update == "euclidean" else "div"
    lsnmf = nimfa.Lsnmf(X, max_iter=self.max_iters, rank=k,
                        seed=self.init_strategy, update=self.update,
                        objective=objective, test_conv=self.test_conv)
    res = lsnmf()
    basis = res.basis()
    coef = res.coef()
    try:
        # Sparse factors are converted to dense matrices.
        self.W = basis.todense()
        self.H = coef.todense()
    except AttributeError:
        # Factors without a ``todense`` method are stored as returned.
        # Only this case is caught, so unrelated failures still surface.
        self.W = basis
        self.H = coef
    # last number of iterations
    self.n_iter = res.n_iter
def read_covar(filename, k=10):
    """Load a whitespace-separated integer matrix and factorize it with LSNMF.

    Each line of ``filename`` becomes one row; every token is parsed with
    ``int``. The matrix is factorized at rank ``k`` and a few quality
    measures of the fit are printed.

    :param filename: Path of the text file holding the matrix.
    :param k: Factorization rank.
    :returns: ``(array, lsnmf_fit)`` - the parsed matrix and the nimfa fit.
    """
    with open(filename) as handle:
        rows = [[int(token) for token in raw.strip().split()]
                for raw in handle]
    array = np.array(rows)

    model = nimfa.Lsnmf(array, seed='random_vcol', rank=k, max_iter=10,
                        update='divergence', objective='div')
    # Alternatives left disabled by the original author: nimfa.SepNmf with
    # n_run=10, and model.estimate_rank(rank_range=range(8, 15)).
    lsnmf_fit = model()

    quality = lsnmf_fit.fit
    print('Rss: %5.4f' % quality.rss())
    print('Evar: %5.4f' % quality.evar())
    print('K-L divergence: %5.4f' % lsnmf_fit.distance(metric='kl'))
    print('Sparseness, W: %5.4f, H: %5.4f' % quality.sparseness())
    return array, lsnmf_fit
def factorize(V):
    """Run LSNMF on the ORL faces data matrix and return both factors.

    Prints the model configuration before the run and a short statistics
    report after it.

    :param V: The ORL faces data matrix.
    :type V: `numpy.matrix`
    :returns: ``(basis, mixture)`` matrices of the fitted model.
    """
    model = nimfa.Lsnmf(
        V,
        seed="random_vcol",
        rank=25,
        max_iter=50,
        sub_iter=10,
        inner_sub_iter=10,
        beta=0.1,
        min_residuals=1e-8,
    )
    header = (model, model.seed, model.rank)
    print("Algorithm: %s\nInitialization: %s\nRank: %d" % header)

    result = model()
    # Values are gathered in the order the report's format string uses them.
    stats = (
        result.fit.n_iter,
        result.distance(),
        result.distance(metric='euclidean'),
    )
    print("""Stats: - iterations: %d - final projected gradients norm: %5.3f - Euclidean distance: %5.3f""" % stats)
    return result.basis(), result.coef()
def get_rank_est(self, k):
    """Fit LSNMF at rank ``k`` and summarize its rank-estimation metrics.

    Factorizes ``self.extant_collation_df.values`` with 500 runs of at most
    50 iterations each (``track_factor=1``), reporting elapsed time on stdout.

    :param k: Rank to evaluate.
    :returns: One-row DataFrame indexed by ``k`` with columns ``max_iter``,
        ``n_run``, ``coph_cor`` and ``dispersion``; callers add it to a
        larger DataFrame later.
    """
    print('Running NMF rank estimation for rank %d...' % k)
    started = time.time()
    data = self.extant_collation_df.values
    model = nf.Lsnmf(data, rank=k, seed='random', max_iter=50, n_run=500,
                     track_factor=1)
    nmf_fit = model()
    print('Done in %0.4fs' % (time.time() - started))

    fitted = nmf_fit.fit
    row = {}
    row['max_iter'] = [fitted.max_iter]
    row['n_run'] = [fitted.n_run]
    row['coph_cor'] = [fitted.coph_cor()]
    row['dispersion'] = [fitted.dispersion()]
    return pd.DataFrame(row, index=[k])
def nmf_library(V, W_init, correct_H):
    """Score several nimfa factorizers against a known mixture vector.

    Five models (Lsnmf, Nmf, Icm, Bd, Pmf) are configured identically:
    rank 3, at most 100 iterations, no seeding, ``W`` fixed to ``W_init``
    and ``H`` starting from a zero column vector. Each is run, and the
    ``H`` stored on the model is flattened, passed through ``normalized``
    and compared to ``correct_H`` with ``mean_absolute_error``.

    :param V: Target matrix to factorize.
    :param W_init: Initial basis matrix handed to every model.
    :param correct_H: Reference mixture vector.
    :returns: ``[lsnmf, nmf, icm, bd, pmf]`` mean absolute errors.
    """
    # nimfa.Lfnmf was disabled in the original comparison and stays out.
    factorizers = (nimfa.Lsnmf, nimfa.Nmf, nimfa.Icm, nimfa.Bd, nimfa.Pmf)

    # Build every model first, each with its own fresh zero-initialised H.
    models = [
        build(V, seed=None, rank=3, max_iter=100,
              H=np.array([0., 0., 0.]).reshape(-1, 1), W=W_init)
        for build in factorizers
    ]

    # Then run them, in the same order they were built.
    for model in models:
        model()

    return [
        mean_absolute_error(correct_H,
                            normalized(np.array(model.H).reshape(-1, )))
        for model in models
    ]
def nmf(nodeFeatureMatrix):
    """Select an LSNMF factorization by minimum description length.

    Ranks ``1 .. min(n, f)`` are tried in order. For each rank the
    description length is ``model_cost - loglikelihood``, where the model
    cost weights the Huffman code length of each factor by the sum of its
    two dimensions. The search stops early once four consecutive ranks
    fail to improve on the best description length seen so far.

    :param nodeFeatureMatrix: Matrix of shape ``(n, f)`` to factorize.
    :returns: ``(best_W, best_H)`` for the rank with the smallest
        description length.
    """
    features = nodeFeatureMatrix
    n_rows, n_cols = features.shape
    coder = mdl.MDL(int(np.log2(n_rows)))
    rank_limit = min([n_rows, n_cols])

    best_W, best_H = None, None
    best_length = 1e20
    stale_rounds = 0

    for rank in range(1, rank_limit + 1):
        fit = nimfa.Lsnmf(features, rank=rank, max_iter=100)()
        W = np.asarray(fit.basis())
        H = np.asarray(fit.coef())
        approximation = np.asarray(np.dot(W, H))

        bits_W = coder.get_huffman_code_length(W)
        bits_H = coder.get_huffman_code_length(H)
        cost_W = bits_W * (W.shape[0] + W.shape[1])
        cost_H = bits_H * (H.shape[0] + H.shape[1])
        model_cost = cost_W + cost_H

        loglikelihood = coder.get_log_likelihood(features, approximation)
        description_length = model_cost - loglikelihood

        if description_length < best_length:
            # New best: remember the factors and reset the patience counter.
            best_length = description_length
            best_W = np.copy(W)
            best_H = np.copy(H)
            stale_rounds = 0
            continue

        stale_rounds += 1
        if stale_rounds == 4:
            break

    print('MDL: %.2f, Roles: %s' % (best_length, best_W.shape[1]))
    return (best_W, best_H)
def run_lsnmf(V, rank=12, max_iter=5000):
    """Factorize ``V`` with least squares NMF and report on the fit.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    :param rank: Factorization rank.
    :param max_iter: Value passed to nimfa as ``max_iter``.
    :returns: Whatever ``print_info`` returns for the fitted model.
    """
    options = dict(
        seed="random_vcol",
        rank=rank,
        max_iter=max_iter,
        sub_iter=10,
        inner_sub_iter=10,
        beta=0.1,
        min_residuals=1e-5,
    )
    fitted = nimfa.Lsnmf(V, **options)()
    return print_info(fitted)
def get_nmf_results(self, k):
    """Factorize the collation matrix at rank ``k`` and store the results.

    Runs LSNMF (``nndsvd`` seeding, ``max_iter=32000``) on
    ``self.extant_collation_df.values`` and reports elapsed time on stdout.

    :param k: Factorization rank.

    Side effects (nothing is returned):
      * ``self.W_df`` - basis factor, rows labelled like the collation
        DataFrame's index, columns labelled by ``get_cluster_labels(k)``.
      * ``self.H_df`` - coefficient factor, rows labelled by
        ``get_cluster_labels(k)``, columns like the collation DataFrame's.
      * ``self.nmf_summary_df`` - one row indexed by ``k`` holding timing,
        iteration count, distance, explained variance and both sparseness
        values.
    """
    print('Running NMF with rank k = %d...' % k)
    started = time.time()
    collation = self.extant_collation_df
    model = nf.Lsnmf(collation.values, rank=k, seed='nndsvd', max_iter=32000)
    nmf_fit = model()
    finished = time.time()
    print('Done in %0.4fs' % (finished - started))

    # Wrap both factor matrices in labelled DataFrames.
    W_df = pd.DataFrame(nmf_fit.basis())
    H_df = pd.DataFrame(nmf_fit.coef())
    W_df.index = collation.index
    W_df.columns = self.get_cluster_labels(k)
    H_df.index = self.get_cluster_labels(k)
    H_df.columns = collation.columns

    fitted = nmf_fit.fit
    summary = {'time (s)': [finished - started]}
    summary['n_iter'] = [fitted.n_iter]
    summary['dist'] = [fitted.distance()]
    summary['evar'] = [fitted.evar()]
    # sparseness() yields the (W, H) pair, as the column names indicate.
    w_sparseness, h_sparseness = fitted.sparseness()
    summary['W_sparseness'] = [w_sparseness]
    summary['H_sparseness'] = [h_sparseness]

    self.W_df = W_df
    self.H_df = H_df
    self.nmf_summary_df = pd.DataFrame(summary, index=[k])
# Normalize: V = (V - V.min()) / (V.max() - V.min()) # V = nimfa.examples.medulloblastoma.read(normalize = True) # actually is nimfa.methods.factorization.lsnmf # print "Number of zero elements: ", V.size - np.count_nonzero(V) ''' Example-1: Simple Usage -- Projected Gradient Update Covers: 1) Input/Output 2) Metrics ''' V = np.array([[1, 2, 3], [4, 5, 6], [6, 7, 8]]) print('Target:\n%s' % V) lsnmf = nimfa.Lsnmf(V, distance="euclidean", seed='random_vcol', max_iter=10, rank=3) lsnmf_fit = lsnmf() W = lsnmf_fit.basis() print('Basis matrix:\n%s' % W) H = lsnmf_fit.coef() print('Mixture matrix:\n%s' % H) print('Target estimate:\n%s' % np.dot(W, H)) print('K-L divergence: %5.3f' % lsnmf_fit.distance(metric='kl')) print('Euclidean distance: %5.3f' % lsnmf_fit.distance(metric='euclidean')) print('Iterations: %d' % lsnmf_fit.n_iter)
import numpy as np
import nimfa

# Random dense 40x100 target matrix drawn with np.random.rand.
V = np.random.rand(40, 100)
# Configure LSNMF at rank 10 with random_vcol seeding; max_iter, sub_iter,
# inner_sub_iter and beta are passed through to nimfa unchanged.
lsnmf = nimfa.Lsnmf(V, seed="random_vcol", rank=10, max_iter=12, sub_iter=10, inner_sub_iter=10, beta=0.1)
# Calling the configured model produces the fit object.
lsnmf_fit = lsnmf()
# Script section: sweep LSNMF ranks 20..29 five times and refine each
# factorization with glrd_sparse.
# Written so it behaves the same on Python 2 and Python 3: prints go through
# a single pre-formatted string and range() replaces xrange().
node_feature = args.node_feature
out_prefix = args.output_prefix
out_dir = args.output_dir

refex_features = np.loadtxt(node_feature, delimiter=',')
# The first column is dropped; presumably it holds node ids - TODO confirm.
actual_fx_matrix = refex_features[:, 1:]
n, f = actual_fx_matrix.shape
# Two spaces after the colon reproduce the output of the original
# `print 'label: ', value` statements.
print('Number of Features:  %s' % f)
print('Number of Nodes:  %s' % n)

sparsity_threshold = 1.0
for i in range(1, 6):
    for rank in range(20, 29 + 1):
        lsnmf = nimfa.Lsnmf(actual_fx_matrix, rank=rank, max_iter=200)
        lsnmf_fit = lsnmf()
        G = np.asarray(lsnmf_fit.basis())
        F = np.asarray(lsnmf_fit.coef())
        # Refine the LSNMF factors; both error bounds use the same threshold.
        G, F = glrd_sparse(V=actual_fx_matrix, G=G, F=F, r=rank,
                           err_V=sparsity_threshold,
                           err_F=sparsity_threshold)
        # Clamp any non-positive entries to exactly zero.
        G[G <= 0.0] = 0.0
        F[F <= 0.0] = 0.0
        # Output file names encode the rank, the repetition and the prefix.
        w_out = '%s-%s-%s-nodeRoles.txt' % (rank, i, out_prefix)
        h_out = '%s-%s-%s-roleFeatures.txt' % (rank, i, out_prefix)
# Script section: build a partially zeroed product matrix Q = X.Y, factorize
# it with LSNMF and compare the recovered factors with the true ones.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
Y = (Y * 5).astype(int) + 1
# Alternative input: sample matrices by Ganesh.
# X = np.loadtxt("X_gen.txt")
# Y = np.loadtxt("Y_gen.txt")

# Boolean keep-mask: an entry is kept where the random draw exceeds 0.3.
T = np.random.rand(10, 20)
T = T > 0.3
Q = (np.dot(X, Y)).astype(float)
# Zero the entries outside the mask, so roughly 30% of Q is zero.
Q[~T] = 0

sparseQ = csr_matrix(Q)
L = nf.Lsnmf(sparseQ, seed="random_vcol", rank=5, max_iter=10, beta=0.1)
Lmod = L.factorize()
# The factors and the fitted estimate are read back from the model object.
Xnew = L.basis().todense()
Ynew = L.coef().todense()
Qnew = L.fitted().todense()

# Residuals of the estimate and of each recovered factor.
Qx = Q - Qnew
Xx = X - Xnew
Yx = Y - Ynew
print(rmse(Xx))
print(X)
print(Xnew)
print(Qnew)
# Further diagnostics left disabled by the original author:
# print(Qnew[T] - Q[T])
# print(rmse(Xnew - X))
# print(rmse(Ynew - Y))
model_cost_b = code_length_b * (n_b + f_b) model_cost_bu = code_length_bu * (n_bu + f_bu) model_costs.append((bins, model_cost_b)) model_costs_uniform.append((bins, model_cost_bu)) p_ari = [] p_ari_u = [] s_ari = [] s_ari_u = [] c = 0 for rank in xrange(33, 43): for i in xrange(10): c += 1 fctr = nimfa.Lsnmf(full_fx_matrix, rank=rank, max_iter=200) fctr_b = nimfa.Lsnmf(binned_fx_matrix, rank=rank, max_iter=200) fctr_bu = nimfa.Lsnmf(binned_fx_matrix_u, rank=rank, max_iter=200) fctr_res = fctr() fctr_res_b = fctr_b() fctr_res_bu = fctr_bu() W = np.asarray(fctr_res.basis()) W_b = np.asarray(fctr_res_b.basis()) W_bu = np.asarray(fctr_res_bu.basis()) actual_primary, actual_secondary = get_role_assignment(W) estimated_primary, estimated_secondary = get_role_assignment(
# Script section: estimate the expert/category matrix R twice - once with
# the project's matrix_factorization helper and once with nimfa's LSNMF.
# All output uses print(...) with a single argument, which behaves the same
# on Python 2 and Python 3.
R = np.array(expert_category_matrix)
N = len(R)
M = len(R[0])
K = 10
# Random initial factors for the helper: P is N x K, Q is M x K.
P = np.random.rand(N, K)
Q = np.random.rand(M, K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

print('#################computing the R hat matrix')
print('approach nimfa , given library for computing R hat')
V = R  # nimfa.examples.medulloblastoma.read(normalize=True)
lsnmf = nimfa.Lsnmf(V, seed='random_vcol', rank=50, max_iter=100)
lsnmf_fit = lsnmf()
print('Rss: %5.4f' % lsnmf_fit.fit.rss())
print('Evar: %5.4f' % lsnmf_fit.fit.evar())
print('K-L divergence: %5.4f' % lsnmf_fit.distance(metric='kl'))
print('Sparseness, W: %5.4f, H: %5.4f' % lsnmf_fit.fit.sparseness())
print('################# Finished computing the R hat matrix')

# for each expert in train set, output the score in the table for that
# [expert][category]
# compare it to the output in the train file
# this gives train accuracy
print(nR)
# convert test data into expert_tag table
# for each expert, output the score in the table for that [expert][category]
def alsPred(matrix=None, sparseness=0.3, method="lsnmf", n_features=5,
            n_iterations=30, low=1, high=5):
    """Decompose a matrix using the specified ALS algorithm.

    Instead of the matrix, dimensions can be given to create a random
    matrix of specified sparseness.

    INPUTS
    matrix       : Either a 2-D numpy array (or array-like), or a tuple of
                   size 2 specifying the dimensions of a random matrix
    sparseness   : Probability that an entry of the generated random matrix
                   is zeroed out
    method       : Specifies the ALS algorithm variation
                   "lsnmf" for the Lsnmf model implemented in Nimfa
                   "wr" for ALS-WR
    n_features   : Number of features of the decomposed matrices
    n_iterations : Number of iterations for the ALS algorithm
    low, high    : Minimum and maximum values (inclusive) used to fill the
                   random matrix

    RETURNS
    ``(mat, predicted, U, V, RMSE)`` for a known method, or ``0`` when
    ``method`` is not recognised.

    RAISES
    ValueError if ``matrix`` is None.
    """
    if matrix is None:
        raise ValueError(
            "matrix must be a 2-D array or a (rows, cols) tuple")

    if isinstance(matrix, tuple) and len(matrix) == 2:
        # Generate random integers in [low, high], then zero out entries
        # with probability `sparseness`.
        mat = np.random.randint(low=low, high=(high + 1), size=matrix)
        keep = np.random.choice(
            [0.0, 1.0], size=mat.size,
            p=[sparseness, (1.0 - sparseness)]).reshape(mat.shape)
        mat = mat * keep
    else:
        # A concrete matrix was supplied; previously `mat` was never bound
        # on this path and the function failed on first use.
        mat = np.asarray(matrix)
    print(mat)

    if method == "lsnmf":
        sparseMat = csr_matrix(mat)
        # The model is NOT named `als`: a local of that name would shadow
        # the `als` callable used by the "wr" branch and make that branch
        # raise UnboundLocalError.
        model = nf.Lsnmf(sparseMat, seed="random_vcol", rank=n_features,
                         max_iter=n_iterations, beta=0.5)
        als_fit = model.factorize()
        U = np.array(als_fit.basis().todense())
        V = np.array(als_fit.coef().todense())
        predictedMat = np.round(np.array(np.dot(U, V)), 2)
        RMSE = RMSE_matrix(mat, predictedMat)
        return mat, predictedMat, np.round(U, 2), np.round(V, 2), RMSE

    if method == "wr":
        # `als` is expected to be the module-level ALS-WR implementation -
        # TODO confirm it is defined/imported in this file.
        output = als(mat, n_factors=n_features, n_iterations=n_iterations)
        RMSE = RMSE_matrix(mat, output[0])
        return mat, output[0], output[1], output[2], RMSE

    return 0
import numpy as np
import nimfa

# Small dense 3x3 target matrix for the demonstration.
V = np.array([[1, 2, 3], [4, 5, 6], [6, 7, 8]])
print('Target:\n%s' % V)

# Configure LSNMF at rank 3 with max_iter=10 (no explicit seeding method),
# then call the model to obtain the fit object.
lsnmf = nimfa.Lsnmf(V, max_iter=10, rank=3)
lsnmf_fit = lsnmf()

# Basis (W) and mixture (H) factors of the fitted model.
W = lsnmf_fit.basis()
print('Basis matrix:\n%s' % W)
H = lsnmf_fit.coef()
print('Mixture matrix:\n%s' % H)

# Quality measures reported by the fit.
print('K-L divergence: %5.3f' % lsnmf_fit.distance(metric='kl'))
print('Rss: %5.3f' % lsnmf_fit.fit.rss())
print('Evar: %5.3f' % lsnmf_fit.fit.evar())
print('Iterations: %d' % lsnmf_fit.n_iter)
# The product of the two factors is the estimate of the target matrix.
print('Target estimate:\n%s' % np.dot(W, H))
# W : Keep check of which cells have ratings
R = np.zeros((num_users, num_movies))
W = np.zeros((num_users, num_movies))
# Walk the three columns in lockstep instead of materialising a row Series
# with train.iloc[i] for every lookup. Row access upcasts the ids to float
# whenever any column is float, and numpy rejects float indices, so the ids
# are cast to int explicitly. Rows are still applied in file order, so a
# repeated (user, movie) pair keeps its last rating as before.
for user_id, movie_id, rating_value in zip(train['userID'],
                                           train['movieID'],
                                           train['rating']):
    # userID/movieID are 1-based while the R and W matrices are 0-based.
    row, col = int(user_id) - 1, int(movie_id) - 1
    R[row, col] = rating_value
    W[row, col] = 1

sparseR = csr_matrix(R)

#%%################################################################
## ALS implementation using NIMFA package
# Try using different rank (#of features) and see the reduction in
# training error.
als = nf.Lsnmf(sparseR, seed="random_vcol", rank=100, max_iter=15, beta=0.1)
als_fit = als.factorize()

#%%
user_features = als_fit.basis()
movie_features = als_fit.coef()
# Dense product of the two factors gives the predicted rating matrix.
predictedR = np.dot(user_features.todense(), movie_features.todense())
get_train_error(R, predictedR, W, rmse=True)

#%%
get_test_error(test, predictedR, rmse=True)

#%%
fica = FastICA(n_components=k).fit(datas) H = fica.components_ if seed == "skmeans": skm = SphericalKMeans(n_clusters=k).fit(datas) H = skm.cluster_centers_ if seed == "kmeans": skm = KMeans(n_clusters=k).fit(datas) H = skm.cluster_centers_ W = np.random.random((_data_dimension, k)) options['seed'] = None options['W'] = W options['H'] = H if methode == "lsnmf": init_nmf = nimfa.Lsnmf(datas, **options) if methode == "nmf": init_nmf = nimfa.Nmf(datas, **options) if methode == "sepnmf": init_nmf = nimfa.SepNmf(datas, **options) if methode == "nsnmf": init_nmf = nimfa.Nsnmf(datas, **options) res_nmf = init_nmf() result.append(res_nmf.summary()) name = "nmf_result/" + str(methode) + "-norm-" + str(
import numpy as np
import nimfa

# Small dense 3x3 target matrix for the demonstration.
V = np.array([[1, 2, 3], [4, 5, 6], [6, 7, 8]])
print('Target:\n%s' % V)

# Configure LSNMF at rank 3 with max_iter=10 and random_vcol seeding.
# track_error=True is what makes the per-iteration error available from the
# tracker further down.
lsnmf = nimfa.Lsnmf(V, seed='random_vcol', max_iter=10, rank=3, track_error=True)
lsnmf_fit = lsnmf()

# Basis (W) and mixture (H) factors of the fitted model.
W = lsnmf_fit.basis()
print('Basis matrix:\n%s' % W)
H = lsnmf_fit.coef()
print('Mixture matrix:\n%s' % H)

# Objective function value for each iteration
print('Error tracking:\n%s' % lsnmf_fit.fit.tracker.get_error())

# summary() is indexed by measure name below ('rss', 'evar', 'n_iter').
sm = lsnmf_fit.summary()
print('Rss: %5.3f' % sm['rss'])
print('Evar: %5.3f' % sm['evar'])
print('Iterations: %d' % sm['n_iter'])
bbox_inches='tight') print("Finished Company BM25") ############################################################################### ##ANALYST BM25 ############################################################################### analysts_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, analysts) analyst_doc_term_matrix = analysts_vec.transform(trigram_docs, analysts) mod = nimfa.Lsnmf(V=analyst_doc_term_matrix, max_iter=200, rank=5) nmf_grid = nimfa.Lsnmf.estimate_rank(mod, rank_range=np.arange(2, 20), what=['cophenetic', 'rss'], n_run=10) fig = plt.figure(figsize=(8, 6)) fig_plt = sns.barplot(x=np.arange(2, 20), y=[i['cophenetic'] for i in nmf_grid.values()]) fig_plt.set_xlabel("N-Components") fig_plt.set_ylabel("Cophenetic") fig_plt.set_title('Cophenetic Score vs. N-Components') print(fig_plt) fig_plt.get_figure().savefig(figure_directory + "NMF/BM25/AnalystCophenticScoresLSNMF.png", bbox_inches='tight')