def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False,
             base_sel=2, num_bases=3, niterH=1, compute_h=True, compute_w=True,
             sstrategy='rand'):
    NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h,
                 show_progress=show_progress, compute_w=compute_w)

    self._niterH = niterH
    self._nsub = nsub
    self.data = data
    self._mfmethod = mfmethod
    self._mapW = mapW
    self._sstrategy = sstrategy
    self._base_sel = base_sel

    # assign the correct distance function
    if self._sstrategy == 'cur':
        self._subfunc = self.curselect
    elif self._sstrategy == 'kmeans':
        self._subfunc = self.kmeansselect
    elif self._sstrategy == 'hull':
        self._subfunc = self.hullselect
    elif self._sstrategy == 'laesa':
        self._subfunc = self.laesaselect
    elif self._sstrategy == 'sivm':
        self._subfunc = self.sivmselect
    else:
        self._subfunc = self.randselect
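# A hedged aside: the elif chain above is a hand-rolled dispatch table. An
# equivalent, more compact idiom (a sketch only, assuming the same
# selection-method names exist on the instance) maps strategy strings to
# method names:
def pick_subfunc(obj, sstrategy):
    """Return obj's selection method for `sstrategy`, defaulting to randselect."""
    names = {'cur': 'curselect', 'kmeans': 'kmeansselect', 'hull': 'hullselect',
             'laesa': 'laesaselect', 'sivm': 'sivmselect'}
    return getattr(obj, names.get(sstrategy, 'randselect'))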
class TestNMF(unittest.TestCase):
    """Test the NMF class."""

    def setUp(self):
        self.description_csv = pd.read_csv("docs/description.csv")
        self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
        self.dp = DocsPreprocessor()
        self.description_1000 = self.dp.process(self.description_1000_csv)
        self.nmf = NMF(self.description_1000)

    def test_type(self):
        self.assertEqual(type(self.nmf.docs), list)

    # Disabled tests, kept as in the original:
    # def test_vectorize(self):
    #     vect, terms = self.nmf.vectorize()
    #     self.assertTrue(len(terms) == 2381)
    #     self.assertEqual((vect.shape[0], vect.shape[1]), (1000, 2381))

    # def test_create_model(self):
    #     self.nmf.create_model(10)

    # def test_run_topic_models(self):
    #     self.nmf.run_topic_models(10, 30, 10)

    # def test_create_word_embedding_model(self):
    #     w_model = self.nmf.create_word_embedding_model()

    def test_process_models(self):
        self.nmf.process_models(10, 30, 10, 20)
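# Standard unittest entry point, so the test case above can be run directly
# with e.g. `python test_nmf.py` (the file name is an assumption):
if __name__ == "__main__":
    unittest.main()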
def run(self):
    """
    :returns: Tuple, such as (<dict of nmis, with algorithm names as keys>)
    """
    nsc = NSpecSparse(self.X, self.k, maxiter=2000)
    nmf = NMF(self.X, self.k)
    km = KMeans(n_clusters=self.k)
    nsckm = NSpecSparseKM(self.X, self.k, maxiter=2000)

    nsc_result = nsc.predict()
    nmf_result = nmf.predict()
    km_result = km.fit_predict(self.X)
    nsckm_result = nsckm.predict()

    w_nsc = nsc_result.matrices[0].todense()
    w_nmf = nmf_result.matrices[0]
    w_nsckm = nsckm_result.matrices

    # gets only the labels
    arrays = {
        'nsc': np.array(np.argmax(w_nsc, axis=1))[:, 0],
        'nmf': np.array(np.argmax(w_nmf, axis=1)),
        'km': km_result,
        'nsckm': w_nsckm,
    }

    nmi = {k: nmiscore(arrays[k], self.y) for k in arrays.keys()}
    return (nmi, arrays)
def __init__(self, data, num_bases=0, niter=1, show_progress=False,
             compW=True, center_mean=True):
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)

    # center the data around the mean first
    self._center_mean = center_mean

    if self._center_mean:
        # copy the data before centering it -> arrays
        # are passed by reference ...
        self._data_orig = data
        self._meanv = self._data_orig[:, :].mean(axis=1).reshape(data.shape[0], -1)
        self.data = self._data_orig - self._meanv
    else:
        self.data = data
def __init__(self, data, num_bases=4, niter=100, show_progress=False, compW=True):
    """
    Inits Nmf class:

    sampleNmf = Nmf(data, num_bases=4, niter=100, show_progress=True, compW=True)

    Args:
        data (required) : d x n data matrix [d - dimension, n - number of samples]
        num_bases       : number of basis vectors for W (default: 4)
        niter           : number of iterations (default: 100)
        show_progress   : (default: True)
        compW           : set to True if W and H should be optimized, set to
                          False if only H should be optimized. This is useful
                          if W is computed elsewhere or if new data should be
                          mapped onto a given set of basis vectors W.
    """
    # data can be supplied either as a conventional numpy array or as a numpy
    # array within a pytables table (preferred for large data sets)
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)
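# Usage sketch, taken from the example call in the docstring above; the random
# d x n matrix is only for illustration, and it is assumed Nmf is importable
# from this module.
import numpy as np
data = np.random.rand(10, 50)  # d = 10 dimensions, n = 50 samples
sampleNmf = Nmf(data, num_bases=4, niter=100, show_progress=True, compW=True)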
def factorize(self, show_progress=False, compute_w=True, compute_h=True,
              compute_err=True, niter=1):
    """ Factorize s.t. WH = data

    Parameters
    ----------
    show_progress : bool
        print some extra information to stdout.
    compute_h : bool
        iteratively update values for H.
    compute_w : bool
        iteratively update values for W.
    compute_err : bool
        compute Frobenius norm |data-WH| after each update and store
        it to .ferr[k].

    Updated Values
    --------------
    .W : updated values for W.
    .H : updated values for H.
    .ferr : Frobenius norm |data-WH|.
    """
    NMF.factorize(self, niter=1, show_progress=show_progress,
                  compute_w=compute_w, compute_h=compute_h,
                  compute_err=compute_err)
def factorize(self, niter=10, compute_w=True, compute_h=True,
              show_progress=False, compute_err=True):
    """ Factorize s.t. WH = data

    Parameters
    ----------
    niter : int
        number of iterations.
    show_progress : bool
        print some extra information to stdout.
    compute_h : bool
        iteratively update values for H.
    compute_w : bool
        iteratively update values for W.
    compute_err : bool
        compute Frobenius norm |data-WH| after each update and store
        it to .ferr[k].

    Updated Values
    --------------
    .W : updated values for W.
    .H : updated values for H.
    .ferr : Frobenius norm |data-WH| for each iteration.
    """
    # init some learning parameters
    self._lamb_W = 1.0 / niter
    self._lamb_H = 1.0 / niter

    NMF.factorize(self, niter=niter, compute_w=compute_w,
                  compute_h=compute_h, show_progress=show_progress,
                  compute_err=compute_err)
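# A minimal usage sketch (assuming this factorize() lives on an NMF subclass
# constructed as elsewhere in this collection, i.e. NMF(data, num_bases=...)).
# Per the docstring, .ferr collects the Frobenius error per iteration, so the
# last entry tracks convergence.
import numpy as np
mdl = NMF(np.random.rand(10, 50), num_bases=4)
mdl.factorize(niter=10, compute_err=True)
print(mdl.ferr[-1])  # Frobenius norm |data - WH| after the final iteration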
def __init__(self, data_1, data_2, lambd=0.5, num_bases=4, niter=100,
             show_progress=False, compH=True, compW=True):
    # generate a new data set `data` using a weighted
    # combination of data_1 and data_2
    self._data_1 = data_1
    self._data_2 = data_2
    self._lambd = lambd

    data = np.concatenate((lambd * self._data_1,
                           (1.0 - lambd) * self._data_2), axis=0)

    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)
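# Shape check for the weighted stack built above, as a standalone sketch:
# two data sets sharing the sample axis n are concatenated along the
# feature axis.
import numpy as np
d1, d2, n, lambd = 3, 5, 7, 0.5
data_1, data_2 = np.random.rand(d1, n), np.random.rand(d2, n)
data = np.concatenate((lambd * data_1, (1.0 - lambd) * data_2), axis=0)
assert data.shape == (d1 + d2, n)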
def run_model(vector, features, k, max_iter):
    model = NMF(k, max_iter)
    W, H = model.fit_transform(vector)
    # print('Cost: ', model.cost(vector))
    cw = common_words(H, features, num_words=10)
    print('Topics for k={} with {} iterations'.format(k, max_iter))
    print_topics(cw)
    return vector, features
def main():
    origin, mask = load_data(4, 5, 0.2)
    dm = origin * mask
    mask1 = dm / origin
    print(mask1)
    print(mask)

    nmf = NMF()
    rec = nmf.predict(dm, mask)
    print(rec - dm)
def __init__(self, data, num_bases=4, niter=10, show_progress=False, compW=True):
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)
def __init__(self, data, num_bases=4, niter=100, show_progress=False, compW=True):
    # data can be supplied either as a conventional numpy array or as a numpy
    # array within a pytables table (preferred for large data sets)
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)
def __init__(self, data, num_bases=4, niter=100, show_progress=False, compW=True):
    # call inherited method
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)
def factorize(self, niter=1, show_progress=False, compute_w=True,
              compute_h=True, compute_err=True):
    # enforce certain default values, otherwise it won't work
    NMF.factorize(self, niter=1, show_progress=show_progress,
                  compute_w=True, compute_h=True, compute_err=compute_err)
def process_one_category(data_path):
    bird_category = int(data_path.split('/')[-1].split('.')[0])
    filenames = os.listdir(data_path)
    out_dir = 'output/bird_{0:03d}'.format(bird_category)
    os.mkdir(out_dir)

    # load images
    raw_images = [plt.imread(os.path.join(data_path, filename)) for filename in filenames]
    for i in range(len(raw_images)):
        img = raw_images[i]
        if np.array(img).shape[-1] > 3:
            # drop the alpha channel of RGBA images
            raw_images[i] = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
        cv2.imwrite(os.path.join(out_dir, 'raw_{0:03d}_{1}.png'.format(bird_category, i)), img)
    raw_images = [imresize(img, 224, 224) for img in raw_images]  # resize
    raw_images = np.stack(raw_images)

    # preprocess
    images = raw_images.transpose((0, 3, 1, 2)).astype('float32')  # to numpy, NxCxHxW, float32
    images -= np.array([0.485, 0.456, 0.406]).reshape((1, 3, 1, 1))  # zero mean
    images /= np.array([0.229, 0.224, 0.225]).reshape((1, 3, 1, 1))  # unit variance
    images = torch.from_numpy(images)  # convert to pytorch tensor
    if cuda:
        images = images.cuda()

    net = models.vgg19(pretrained=True)  # load pre-trained VGG-19
    if cuda:
        net = net.cuda()
    del net.features._modules['36']  # remove max-pooling after final conv layer

    with torch.no_grad():
        features = net.features(images)
        flat_features = features.permute(0, 2, 3, 1).contiguous().view((-1, features.size(1)))  # NxCxHxW -> (N*H*W)xC
    print('Reshaped features from {0}x{1}x{2}x{3} to ({0}*{2}*{3})x{1} = {4}x{1}'.format(*features.shape, flat_features.size(0)))

    for K in [15]:
        with torch.no_grad():
            W, _ = NMF(flat_features, K, random_seed=0, cuda=cuda, max_iter=50)

        heatmaps = W.cpu().view(features.size(0), features.size(2), features.size(3), K).permute(0, 3, 1, 2)  # (N*H*W)xK -> NxKxHxW
        heatmaps = torch.nn.functional.interpolate(heatmaps, size=(224, 224), mode='bilinear', align_corners=False)  # 14x14 -> 224x224
        heatmaps /= heatmaps.max(dim=3, keepdim=True)[0].max(dim=2, keepdim=True)[0]  # normalize per factor (i.e., each of the K)
        heatmaps = heatmaps.cpu().numpy()
        # print(heatmaps.shape)  # (60, K, 224, 224)
        save_mask2d(heatmaps, K, out_dir)
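# Sanity check for the NxCxHxW -> (N*H*W)xC flattening used above and the
# inverse reshape applied to W: a small random tensor round-trips exactly
# (standalone sketch, toy sizes).
import torch
N, C, H, W_dim = 2, 4, 3, 3
feats = torch.rand(N, C, H, W_dim)
flat = feats.permute(0, 2, 3, 1).contiguous().view(-1, C)
back = flat.view(N, H, W_dim, C).permute(0, 3, 1, 2)
assert torch.equal(feats, back)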
def __init__(self, data, num_bases=0, center_mean=True, **kwargs):
    NMF.__init__(self, data, num_bases=num_bases)

    # center the data around the mean first
    self._center_mean = center_mean

    if self._center_mean:
        # copy the data before centering it
        self._data_orig = data
        self._meanv = self._data_orig[:, :].mean(axis=1).reshape(data.shape[0], -1)
        self.data = self._data_orig - self._meanv
    else:
        self.data = data
def __init__(self, data, num_bases=0, center_mean=True):
    NMF.__init__(self, data, num_bases=num_bases)

    # center the data around the mean first
    self._center_mean = center_mean

    if self._center_mean:
        # copy the data before centering it
        self._data_orig = data
        self._meanv = self._data_orig[:, :].mean(axis=1).reshape(data.shape[0], -1)
        self.data = self._data_orig - self._meanv
    else:
        self.data = data
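# What the centering above does, as a standalone check: subtracting the
# per-row (per-dimension) mean leaves every row of self.data with zero mean
# across samples.
import numpy as np
data = np.random.rand(4, 10)
meanv = data.mean(axis=1).reshape(data.shape[0], -1)
centered = data - meanv
assert np.allclose(centered.mean(axis=1), 0.0)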
def update_h(self):
    print(self._method)
    if self._method == 'pca':
        self.H = np.dot(pinv(self.W), self.data)

    if self._method == 'nmf':
        mdl = NMF(self.data, num_bases=self._num_bases)
        mdl.W = self.W
        mdl.factorize(compute_w=False, niter=50)
        self.H = mdl.H.copy()

    if self._method == 'aa':
        mdl = AA(self.data, num_bases=self._num_bases)
        mdl.W = self.W
        mdl.factorize(compute_w=False)
        self.H = mdl.H.copy()
def __init__(self, data, num_bases=4, niter=100, show_progress=False, compW=True):
    # data can be supplied either as a conventional numpy array or as a numpy
    # array within a pytables table (preferred for large data sets)
    NMF.__init__(self, data, num_bases=num_bases, niter=niter,
                 show_progress=show_progress, compW=compW)

    # controls how fast lambda should increase: this influences convergence to
    # binary values during the updates. A value < 1 results in non-binary
    # decompositions, as the update rule is then effectively a conventional
    # NMF update rule. Values > 1 give more weight to making the factorization
    # binary with increasing iterations. Setting either value to 0 makes the
    # corresponding matrix non-binary.
    self._lamb_increase_W = 1.1
    self._lamb_increase_H = 1.1
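# How quickly the binarization pressure grows under the 1.1 factor above,
# assuming (per the comment) that lambda is multiplied by it once per
# iteration: the weight stays mild early on and dominates after a few dozen
# iterations.
for it in (1, 10, 50):
    print(it, 1.1 ** it)  # 1.1, ~2.59, ~117.4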
def generate_model():
    return NMF(F, K, b=args.b_div, m=args.sparsity_weight,
               robust_normalization=True, tol=args.tol, dtype=dtype,
               device=args.device, keep_history=True)
df = pd.read_csv('./temp/ml-100k/u.data', sep='\t', header=None,
                 usecols=[0, 1, 2], names=['userid', 'itemid', 'rating'])
R = pd.pivot_table(df, values='rating', index=['userid'], columns=['itemid'])
R.fillna(0, inplace=True)

# hold out three known ratings for inspection
ans1 = R[2][1]
R[2][1] = 0
ans2 = R[200][940]
R[200][940] = 0
ans3 = R[900][931]
R[900][931] = 0

nmf = NMF(R.shape[0], R.shape[1])
nmf.sess.run(tf.global_variables_initializer())
for step in range(50000):
    _, loss, R_pred = nmf.sess.run(
        [nmf.train_op, nmf.loss, nmf.R_pred],
        {nmf.R: R.values, nmf.lr: 0.001})
    if step % 100 == 0:
        print("[%d] loss: %.4f | " % (step, loss), end='')
        print(ans1, ':', R_pred[2][1], '|',
              ans2, ':', R_pred[200][940], '|',
              ans3, ':', R_pred[900][931])
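# A quick follow-up check (reusing R and R_pred from the loop above): RMSE of
# the model over the observed, non-zero entries of the rating matrix.
import numpy as np
observed = R.values > 0
rmse = np.sqrt(np.mean((R.values[observed] - R_pred[observed]) ** 2))
print('train RMSE: %.4f' % rmse)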
k = 5                      # number of recommendations
rank = 5                   # model dimension (rank)
eta = 1e-2                 # learning rate
max_iter = 500             # number of iterations
compute_error = True       # compute the error
symmetric = True           # symmetric data: yes
min_prior_connections = 1  # minimum number of links for a node to be recommended
alpha = 0.01               # penalty against overfitting

naslovi = csv.DictReader(open(datapath, encoding="utf-8"), delimiter=",").fieldnames

print("Fitting the model to the data ...")
model = NMF(compute_error=compute_error, rank=rank, eta=eta,
            max_iter=max_iter, symmetric=symmetric, alpha=alpha)
model.fit(X)
Xp = model.predict_all()
print("Learning finished!")

Y = X.copy()
f, axes = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))
axes[0].set_xlabel("Node")
axes[0].set_ylabel("Node")
axes[1].set_xlabel("Node")
axes[0].pcolor(Y, cmap='Oranges', vmin=0, vmax=X.max())
axes[0].set_title("Data")
axes[1].pcolor(Xp, cmap='Oranges', vmin=0, vmax=X.max())
axes[1].set_title("Model")
axes[1].set_xlim(0, X.shape[1])
def train():
    # ----------- Load data -----------
    dict = cPickle.load(open('pre_load.p', 'rb'))
    tr_X, tr_y, tr_na_list = dict['tr_X'], dict['tr_y'], dict['tr_na_list']
    te_X, te_y, te_na_list = dict['te_X'], dict['te_y'], dict['te_na_list']

    tr_positive = np.take(tr_X, np.where(tr_y == 1)[0])
    tr_positive = [t / np.max(t) for t in tr_positive]
    tr_positive = [librosa.feature.stack_memory(t.transpose(), n_steps=sh_order)
                   for t in tr_positive]

    tr_negative = np.take(tr_X, np.where(tr_y == 0)[0])
    tr_negative = [t / np.max(t) for t in tr_negative]
    tr_negative = [librosa.feature.stack_memory(t.transpose(), n_steps=sh_order)
                   for t in tr_negative]

    # ----------- Do training: separate bases for each file (disabled) -----------
    # nmf_model = NMF(rank_p, norm_W=1, iterations=500, update_func="kl", verbose=True)
    # W_positive = []
    # for f in tr_positive:
    #     [W, H, error] = nmf_model.process(f.transpose())
    #     W_positive.append(W)
    #
    # nmf_model = NMF(rank_p, norm_W=1, iterations=500, update_func="kl", verbose=True)
    # W_negative = []
    # for f in tr_negative:
    #     [W, H, error] = nmf_model.process(f.transpose())
    #     W_negative.append(W)

    tr_positive = np.hstack(tr_positive)
    tr_negative = np.hstack(tr_negative)
    train_data = np.hstack((tr_positive, tr_negative))
    print >> sys.stderr, train_data.shape

    # ----------- Do training: overcomplete dictionary (disabled) -----------
    # p = decomposition.PCA(whiten=True, n_components=0.99)
    # pca_data = p.fit_transform(train_data)
    #
    # num = 500
    # num_dim = pca_data.shape[1]
    # num_training_samples = pca_data.shape[0]
    # km = spherical_kmeans.OSKmeans(num, num_dim)
    # print "Learning k-means: " + str(num)
    # for _ in range(1000):
    #     print _
    #     for index in range(num_training_samples):
    #         km.update(pca_data[index, :])
    # codebook = km.centroids
    # cPickle.dump([codebook, p], open(W_name, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

    # ----------- Do training -----------
    if type == '0_1':
        print >> sys.stderr, "NMF on positive examples"
        nmf_model = NMF(rank_p, norm_W=1, iterations=200, update_func="kls",
                        verbose=False)
        [W_positive, H, error] = nmf_model.process(tr_positive, lam=lam)
        # a_H = np.ones(rank_n + rank_p)
        # b_H = np.ones(rank_n + rank_p)
        # [error, W_positive, H_gap] = gap_vbem(tr_positive, rank_n + rank_p, a_H, b_H, iterations=100, verbose=True)

        print >> sys.stderr, "NMF on negative examples"
        nmf_model = NMF(rank_n, norm_W=1, iterations=200, update_func="kls",
                        verbose=False)
        [W_negative, H, error] = nmf_model.process(tr_negative, lam=lam)
        # [error, W_negative, H_gap] = gap_vbem(tr_negative, rank_n + rank_p, a_H, b_H, iterations=100, verbose=True)

        cPickle.dump([W_positive, W_negative], open(W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    elif type == '01':
        # -------- Train with masking ----------
        print >> sys.stderr, "masked NMF on training files"
        mask = np.zeros((rank_p, tr_negative.shape[1]))
        V = np.hstack((tr_negative, tr_positive))
        H0 = np.random.rand(rank_n + rank_p, V.shape[1]) + eps
        H0[-mask.shape[0]:, :mask.shape[1]] = mask
        nmf_model = NMF(rank_n + rank_p, norm_W=1, iterations=200,
                        update_func="kls", verbose=False)
        [W, H, error] = nmf_model.process(V, H0=H0, lam=lam)
        print >> sys.stderr, error
        # a_H = np.ones(rank_n + rank_p)
        # b_H = np.ones(rank_n + rank_p)
        # [error, W_gap, H_gap] = gap_vbem(V, rank_n + rank_p, a_H, b_H, H0, iterations=100, verbose=False)
        cPickle.dump(W, open(W_name, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    else:
        raise ValueError('Dictionary type not recognized')

    print >> sys.stderr, "Dictionary " + W_name + " finished!"
def fit(self, X, y=None, features=None):
    """
    Constructs DAG according to `self.dag_method` and learns coexpression
    modules across multiple resolutions.

    Parameters
    ----------
    X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
        Matrix with rows corresponding to all of the samples that define
        the DAG and columns corresponding to features that define the
        correlation matrices.
    y
        Ignored
    features: `numpy.ndarray` of `str`
        A list of strings with feature labels.
    """
    super(DecomposeDAG, self).fit(X, y, features)

    n_samples, n_features = X.shape

    if self.verbose:
        print('Stacking...')
        sys.stdout.flush()

    X_multi = self.multiresolution_stack(X)

    if self.verbose:
        print('Decomposing...')
        sys.stdout.flush()

    if self.decomp_method == 'nmf':
        #from sklearn.decomposition import NMF
        from nmf import NMF
        decomp = NMF(
            n_components=self.n_components,
            init=None,
            solver='cd',
            beta_loss='frobenius',
            alpha=1e-3,
            l1_ratio=1,
            random_state=69,
            tol=1e-2,
            verbose=self.verbose,
        ).fit(X_multi)
        components = decomp.components_
    elif self.decomp_method == 'lda':
        from sklearn.decomposition import (LatentDirichletAllocation as LDA)
        decomp = LDA(
            n_components=self.n_components,
            learning_method='online',
            max_iter=20,
            mean_change_tol=1e-2,
            n_jobs=self.n_jobs,
            random_state=69,
            verbose=self.verbose,
        ).fit(X_multi)
        components = decomp.components_
    elif self.decomp_method == 'hdp':
        from bnp.online_hdp import (HierarchicalDirichletProcess as HDP)
        hdp = HDP(
            n_topic_truncate=self.n_components,
            n_doc_truncate=10000,
            learning_method='online',
            n_jobs=self.n_jobs,
            random_state=69,
            verbose=self.verbose,
        ).fit(X_multi)
        components = hdp.lambda_
    else:
        raise ValueError('Invalid decomposition method {}'.format(
            self.decomp_method))

    n_components = components.shape[0]

    self.cluster_components = np.reshape(
        components, (n_components, n_features, len(self.nodes)))

    cc = np.sum(self.cluster_components, axis=1)
    cc /= cc.max()
    assert (cc.shape == (n_components, len(self.nodes)))

    for node_idx, node in enumerate(self.nodes):
        node.viz_value = list(cc[:, node_idx])

    return self
def __init__(self, data, k=-1, num_bases=4):
    # call inherited method
    NMF.__init__(self, data, num_bases=num_bases)

    self._k = k
    if self._k == -1:
        self._k = num_bases
def run_nmf(self, tr_positive, tr_negative):
    '''Extract a dictionary via NMF given a method chosen in the config file

    Args:
        tr_positive: a numpy array containing all the positive examples
        tr_negative: a numpy array containing all the negative examples
    Returns:
        the learned dictionary W; it is also pickled to self.W_name
    '''
    print(tr_positive.shape)
    if self.type == '0_1':
        print("NMF on positive examples")
        nmf_model = NMF(self.rank_1, norm_W=1, iterations=self.iterations,
                        update_func=self.update_func, verbose=True)
        [W_positive, H, error] = nmf_model.process(tr_positive)

        print("NMF on negative examples")
        nmf_model = NMF(self.rank_0, norm_W=1, iterations=self.iterations,
                        update_func=self.update_func, verbose=True)
        [W_negative, H, error] = nmf_model.process(tr_negative)

        print("Saved dictionary to " + self.W_name)
        W = np.hstack((W_positive, W_negative))
        cPickle.dump([W_positive, W_negative], open(self.W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    elif self.type == 'unsupervised':
        print("Unsupervised NMF")
        V = np.hstack((tr_negative, tr_positive))
        nmf_model = NMF(self.rank_0 + self.rank_1, norm_W=1,
                        iterations=self.iterations,
                        update_func=self.update_func, verbose=True)
        [W, H, error] = nmf_model.process(V)
        print("Saved dictionary to " + self.W_name)
        cPickle.dump(W, open(self.W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    elif self.type == '01':
        # -------- Train with masking ----------
        print("Masked NMF on training files")
        V = np.hstack((tr_negative, tr_positive))
        mask = np.zeros((self.rank_1, tr_negative.shape[1]))
        H0 = np.random.rand(self.rank_0 + self.rank_1, V.shape[1]) + eps
        H0[-mask.shape[0]:, :mask.shape[1]] = mask
        nmf_model = NMF(self.rank_0 + self.rank_1, norm_W=1,
                        iterations=self.iterations,
                        update_func=self.update_func, verbose=True)
        [W, H, error] = nmf_model.process(V, H0=H0)
        print("Saved dictionary to " + self.W_name)
        cPickle.dump(W, open(self.W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    elif self.type == '01_orth':
        # -------- Train with masking and orthogonality ----------
        print("Masked NMF on training files")
        V = np.hstack((tr_negative, tr_positive))
        mask = np.zeros((self.rank_1, tr_negative.shape[1]))
        H0 = np.random.rand(self.rank_0 + self.rank_1, V.shape[1]) + eps
        H0[-mask.shape[0]:, :mask.shape[1]] = mask
        nmf_model = NMF(self.rank_0 + self.rank_1, norm_W=1,
                        rankW0=self.rank_0, rankW1=self.rank_1,
                        len_V0=tr_negative.shape[1],
                        iterations=self.iterations,
                        update_func=self.update_func, verbose=False)
        print(self.lam_orth)
        [W, H, error] = nmf_model.process(V, H0=H0, lam_orth=self.lam_orth)
        print("Saved dictionary to " + self.W_name)
        cPickle.dump(W, open(self.W_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    else:
        raise ValueError('Dictionary type not recognized')
    return W
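# What the H0 masking above encodes, on a toy scale: the activations of the
# rank_1 "positive" bases are pinned to zero on the negative block of V, so
# those bases can only explain the positive examples (standalone sketch).
import numpy as np
rank_0, rank_1, n_neg, n_pos = 2, 3, 4, 5
H0 = np.random.rand(rank_0 + rank_1, n_neg + n_pos) + 1e-9
H0[-rank_1:, :n_neg] = 0.0
assert not H0[-rank_1:, :n_neg].any()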
def solve(self):
    '''
    The main entry point for clustering. Several methods can be used:
        1. kmeans or kmeans++
        2. NMF - nonnegative matrix factorization
        3. ONMF - orthogonality-constrained nonnegative matrix factorization
    '''
    if self.method_name in {'kmeans', 'kmeans++', 'kmod', 'msd-km', 'nmf',
                            'dtpp', 'hals', 'onmf-stf', 'onpmf',
                            'sncp1c', 'sncp2c', 'sncp4c'}:
        cls_assign = None
        time_used = 0
        if self.method_name == 'kmeans':
            W, H = self.data_manager.gen_inits_WH(init='random',
                                                  seed=self.seed_num,
                                                  H_ortho=True)
            initial_centroids = np.asarray(W.transpose())
            start_time = time.time()
            kmeans = KMeans(self.data_manager.get_data_mat(), self.cls_num,
                            self.seed_num)
            print('initial shape')
            print(initial_centroids.shape)
            cls_assign, _ = kmeans.solve(initial_centroids,
                                         self.data_manager, self.res_dir)
            end_time = time.time()
            time_used = end_time - start_time
        elif self.method_name == 'kmeans++':
            start_time = time.time()
            kmeans = KMeans(self.data_manager.get_data_mat(), self.cls_num,
                            self.seed_num)
            initial_centroids = kmeans.create_centroids_by_kpp()
            cls_assign, _ = kmeans.solve(initial_centroids)
            end_time = time.time()
            time_used = end_time - start_time
        elif self.method_name == 'nmf':
            # Before NMF, check the validity of the input data
            if self.data_manager.contain_zero_rows():
                raise ValueError('Error: the data matrix has negative values!')
            nmf = NMF(self.data_manager, self.res_dir, self.cls_num,
                      self.seed_num)
            cls_assign, time_used = nmf.solve()
        elif self.method_name in {'dtpp', 'hals', 'onmf-stf', 'onpmf',
                                  'sncp1c', 'sncp2c', 'sncp4c'}:
            # if self.data_manager.contain_zero_rows():
            #     raise ValueError('Error: the data matrix has negative values')
            nu = 1e-10
            mul = 0
            onmf = ONMF(self.data_manager, self.res_dir, self.cls_num,
                        self.seed_num, mul, nu)
            cls_assign, time_used, (W, H) = onmf.solve(self.method_name)

        # if the dataset is '2d#X', we need to draw a figure to show the
        # clustering result
        if self.data_name.startswith('2d'):
            # get the result directory where the result is stored
            dat_path = os.path.join(root_dir, 'results', self.method_name,
                                    self.data_name,
                                    'res' + str(self.seed_num) + '.pdf')
            self.data_manager.visualize_data(
                partition_idx=cls_assign, dat_path=dat_path,
                data_points=np.asarray(W.transpose()))
            # self.data_manager.visualize_data(partition_idx=cls_assign, dat_path=dat_path)

        # save clustering performance
        true_labels = self.data_manager.get_labels()
        print(true_labels.shape)
        temp_dict = collections.OrderedDict()
        temp_dict['seed'] = self.seed_num
        temp_dict['time'] = time_used
        temp_dict['Purity'] = calculate_purity(cls_assign, true_labels)
        temp_dict['ARI'] = calculate_rand_index(cls_assign, true_labels)
        temp_dict['ACC'] = calculate_accuracy(cls_assign, true_labels)
        temp_dict['NMI'] = calculate_NMI(cls_assign, true_labels)
        return temp_dict
    elif self.method_name in {'sncp', 'sncp1', 'sncp2', 'sncp3'}:
        for nu in {1e-10}:
            for mul in {0}:
                onmf = ONMF(self.data_manager, self.res_dir, self.cls_num,
                            self.seed_num, mul, nu)
                # onmf = ONMF(self.data_manager.get_data_mat(), self.res_dir, 20, self.SNR, self.seed_num)
                cls_assign, time_used, (W, H) = onmf.solve(self.method_name)
                if self.data_name.startswith('2d'):
                    dat_path = os.path.join(root_dir, 'results',
                                            self.method_name, self.data_name,
                                            'res' + str(self.seed_num) + '.pdf')
                    self.data_manager.visualize_data(
                        partition_idx=cls_assign, dat_path=dat_path,
                        data_points=np.asarray(W.transpose()))
    elif self.method_name == 'visualize_data':
        self.data_manager.visualize_data()
    else:
        raise ValueError('Error: no other methods are supported now!')
# encoding: utf-8
'''
Created on 2016-11-15

@author: alibaba
'''
import numpy as np
#from basenmf import BaseNMF, NMFResult
#from projective import ProjectiveNMF
from nmf import NMF

X = np.array([(1, 2, 3, 4, 5, 6),
              (4, 5, 6, 7, 8, 9),
              (1, 3, 5, 4, 2, 6)]) / 10.0

nmf = NMF(X, 2, maxiter=100)
nmf_result = nmf.predict()
w_nmf = nmf_result.matrices[0]
print(w_nmf)
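# A small follow-up (assuming, as in the image example below, that
# nmf_result.matrices holds (W, H)): the Frobenius reconstruction error of
# the rank-2 fit above.
h_nmf = nmf_result.matrices[1]
print(np.linalg.norm(X - np.dot(w_nmf, h_nmf)))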
from projective import ProjectiveNMF
from nmf import NMF
from scipy import misc
import pylab as pl
import numpy as np

#%% --
# 1. Lena
# train
lena = misc.lena()

result_pnmf = ProjectiveNMF(lena, 75).predict()
w = result_pnmf.matrices[0]
lena_hat_pnmf = w * w.T * lena

result_nmf = NMF(lena, 75, objective="kl").predict()
lena_hat_nmf = np.dot(result_nmf.matrices[0], result_nmf.matrices[1])

#%% show results
pl.figure(1)
pl.subplot(131)
pl.title("Original")
pl.imshow(lena, cmap="gray")
pl.subplot(132)
pl.title("NMF")
pl.imshow(lena_hat_nmf, cmap="gray")
pl.subplot(133)
pl.title("PNMF")
pl.imshow(lena_hat_pnmf, cmap="gray")
pl.show()
def __init__(self, data, num_bases=4, lamb=2.0):
    # call inherited method
    NMF.__init__(self, data, num_bases=num_bases)
    self._lamb = lamb