예제 #1
0
    def apply(self, X, k=2):
        """
        Apply NMF (nimfa's LSNMF) to the specified document-term matrix X.

        The basis matrix is stored in ``self.W``, the coefficient matrix in
        ``self.H`` and the iteration count reported by the fit in
        ``self.n_iter``.

        :param X: Document-term matrix, handed to ``nimfa.Lsnmf`` unchanged.
        :param k: Factorization rank. Defaults to 2.
        """
        import nimfa
        self.W = None
        self.H = None
        # "fro" accompanies the euclidean update rule, "div" any other rule.
        objective = "fro" if self.update == "euclidean" else "div"
        lsnmf = nimfa.Lsnmf(X,
                            max_iter=self.max_iters,
                            rank=k,
                            seed=self.init_strategy,
                            update=self.update,
                            objective=objective,
                            test_conv=self.test_conv)
        res = lsnmf()
        basis = res.basis()
        coef = res.coef()
        # Factors that expose todense() are densified; factors without that
        # method are stored exactly as returned. Only the missing-method case
        # is handled here so that genuine failures are no longer hidden.
        try:
            self.W = basis.todense()
            self.H = coef.todense()
        except AttributeError:
            self.W = basis
            self.H = coef
        # last number of iterations
        self.n_iter = res.n_iter
예제 #2
0
파일: bss.py 프로젝트: rhysnewell/pystrain
def read_covar(filename, k=10):
    """
    Load a whitespace-separated integer matrix from a file and fit LSNMF to it.

    Each line of the file becomes one row of the matrix. Fit statistics
    (RSS, explained variance, K-L divergence, sparseness) are printed.

    :param filename: Path of the text file holding the matrix.
    :param k: Factorization rank passed to ``nimfa.Lsnmf``.
    :return: Tuple ``(array, lsnmf_fit)``: the parsed matrix and the fit.
    """
    with open(filename) as handle:
        rows = [list(map(int, raw.strip().split())) for raw in handle]
    array = np.array(rows)

    settings = dict(seed='random_vcol',
                    rank=k,
                    max_iter=10,
                    update='divergence',
                    objective='div')
    lsnmf = nimfa.Lsnmf(array, **settings)

    # Alternatives that were tried and left disabled:
    # lsnmf = nimfa.SepNmf(array, seed='random_vcol', rank=k, n_run=10)
    # best_rank = lsnmf.estimate_rank(rank_range=range(8,15))
    lsnmf_fit = lsnmf()

    model = lsnmf_fit.fit
    print('Rss: %5.4f' % model.rss())
    print('Evar: %5.4f' % model.evar())
    print('K-L divergence: %5.4f' % lsnmf_fit.distance(metric='kl'))
    print('Sparseness, W: %5.4f, H: %5.4f' % model.sparseness())
    return array, lsnmf_fit
def factorize(V):
    """
    Run LSNMF on the ORL faces data matrix and print a short report.

    :param V: The ORL faces data matrix.
    :type V: `numpy.matrix`
    :return: Tuple of the basis matrix and the mixture (coefficient) matrix
        of the fitted factorization model.
    """
    settings = {
        "seed": "random_vcol",
        "rank": 25,
        "max_iter": 50,
        "sub_iter": 10,
        "inner_sub_iter": 10,
        "beta": 0.1,
        "min_residuals": 1e-8,
    }
    model = nimfa.Lsnmf(V, **settings)
    print("Algorithm: %s\nInitialization: %s\nRank: %d" %
          (model, model.seed, model.rank))

    result = model()
    # distance() without a metric vs. the explicit euclidean metric.
    stats = (result.fit.n_iter,
             result.distance(),
             result.distance(metric='euclidean'))
    print("""Stats:
            - iterations: %d
            - final projected gradients norm: %5.3f
            - Euclidean distance: %5.3f""" % stats)
    return result.basis(), result.coef()
예제 #4
0
	def get_rank_est(self, k):
		"""
		Run a multi-run LSNMF rank estimation for rank k.

		Returns a one-row DataFrame indexed by k with the columns
		'max_iter', 'n_run', 'coph_cor' and 'dispersion'; it is meant to be
		merged into a larger DataFrame by the caller.
		"""
		print('Running NMF rank estimation for rank %d...' % k)
		started = time.time()
		data = self.extant_collation_df.values
		factorizer = nf.Lsnmf(data, rank=k, seed='random', max_iter=50, n_run=500, track_factor=1)
		outcome = factorizer()
		print('Done in %0.4fs' % (time.time() - started))
		model = outcome.fit
		row = {
			'max_iter': [model.max_iter],
			'n_run': [model.n_run],
			'coph_cor': [model.coph_cor()],
			'dispersion': [model.dispersion()],
		}
		return pd.DataFrame(row, index=[k])
def nmf_library(V, W_init, correct_H):
    """
    Compare several nimfa factorization methods against a known mixture.

    Every method is run at rank 3 for at most 100 iterations with the basis
    fixed to `W_init` and a zero-filled 3x1 coefficient matrix as the start.

    :param V: Target matrix to factorize.
    :param W_init: Initial basis matrix shared by all methods.
    :param correct_H: Reference values the normalized coefficients are
        compared with.
    :return: Mean absolute errors in the order
        ``[lsnmf, nmf, icm, bd, pmf]``.
    """
    # Lfnmf was tried as well and left disabled:
    # lfnmf = nimfa.Lfnmf(V, seed=None, rank=3, max_iter=100, H = np.array([0.,0.,0.]).reshape(-1,1), W = W_init)
    methods = (nimfa.Lsnmf, nimfa.Nmf, nimfa.Icm, nimfa.Bd, nimfa.Pmf)

    # Build every model first; each one receives its own fresh H array.
    models = [
        method(V,
               seed=None,
               rank=3,
               max_iter=100,
               H=np.array([0., 0., 0.]).reshape(-1, 1),
               W=W_init)
        for method in methods
    ]

    # Run the factorizations in the same order they were constructed.
    for model in models:
        model()

    return [
        mean_absolute_error(correct_H,
                            normalized(np.array(model.H).reshape(-1, )))
        for model in models
    ]
예제 #6
0
def nmf(nodeFeatureMatrix):
    """
    Select LSNMF factors by minimum description length.

    Ranks 1..min(rows, columns) are tried in increasing order. For each rank
    the description length is the model cost (Huffman code lengths of W and H
    weighted by their dimensions) minus the log-likelihood of the estimate.
    The search stops once four ranks in a row fail to improve the best
    description length.

    :param nodeFeatureMatrix: 2-D matrix to factorize.
    :return: Tuple ``(best_W, best_H)`` for the best rank found.
    """
    n, f = nodeFeatureMatrix.shape
    bin_count = int(np.log2(n))
    rank_limit = min([n, f])
    coder = mdl.MDL(bin_count)

    best_W = best_H = None
    shortest = 1e20
    stalled = 0  # consecutive ranks without improvement

    for rank in range(1, rank_limit + 1):
        fit = nimfa.Lsnmf(nodeFeatureMatrix, rank=rank, max_iter=100)()
        W = np.asarray(fit.basis())
        H = np.asarray(fit.coef())
        estimate = np.asarray(np.dot(W, H))

        w_bits = coder.get_huffman_code_length(W)
        h_bits = coder.get_huffman_code_length(H)
        model_cost = (w_bits * (W.shape[0] + W.shape[1]) +
                      h_bits * (H.shape[0] + H.shape[1]))
        loglikelihood = coder.get_log_likelihood(nodeFeatureMatrix, estimate)
        candidate = model_cost - loglikelihood

        if not candidate < shortest:
            stalled += 1
            if stalled == 4:
                break
            continue

        # New best: remember copies of the factors and reset the counter.
        shortest = candidate
        best_W = np.copy(W)
        best_H = np.copy(H)
        stalled = 0

    print('MDL: %.2f, Roles: %s' % (shortest, best_W.shape[1]))

    return (best_W, best_H)
예제 #7
0
def run_lsnmf(V, rank=12, max_iter=5000):
    """
    Fit least squares nonnegative matrix factorization and report on it.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    :param rank: Factorization rank.
    :param max_iter: Maximum number of iterations.
    :return: Whatever ``print_info`` returns for the fitted model.
    """
    settings = {
        "seed": "random_vcol",
        "rank": rank,
        "max_iter": max_iter,
        "sub_iter": 10,
        "inner_sub_iter": 10,
        "beta": 0.1,
        "min_residuals": 1e-5,
    }
    model = nimfa.Lsnmf(V, **settings)
    return print_info(model())
예제 #8
0
	def get_nmf_results(self, k):
		"""
		Factorize the extant collation matrix at rank k and keep the results.

		Stores the labelled basis matrix in ``self.W_df``, the labelled
		coefficient matrix in ``self.H_df`` and a one-row summary (indexed
		by k) in ``self.nmf_summary_df``. Returns nothing.
		"""
		print('Running NMF with rank k = %d...' % k)
		started = time.time()
		data = self.extant_collation_df.values
		factorizer = nf.Lsnmf(data, rank=k, seed='nndsvd', max_iter=32000)
		outcome = factorizer()
		finished = time.time()
		print('Done in %0.4fs' % (finished - started))

		# Factor matrices as DataFrames: rows of W and columns of H reuse the
		# labels of the input frame, the shared axis gets the cluster labels.
		basis_df = pd.DataFrame(outcome.basis())
		coef_df = pd.DataFrame(outcome.coef())
		basis_df.index = self.extant_collation_df.index
		basis_df.columns = self.get_cluster_labels(k)
		coef_df.index = self.get_cluster_labels(k)
		coef_df.columns = self.extant_collation_df.columns

		model = outcome.fit
		summary = {
			'time (s)': [finished - started],
			'n_iter': [model.n_iter],
			'dist': [model.distance()],
			'evar': [model.evar()],
			'W_sparseness': [model.sparseness()[0]],
			'H_sparseness': [model.sparseness()[1]],
		}

		self.W_df = basis_df
		self.H_df = coef_df
		self.nmf_summary_df = pd.DataFrame(summary, index=[k])
예제 #9
0
# Normalize: V = (V - V.min()) / (V.max() - V.min())
# V = nimfa.examples.medulloblastoma.read(normalize = True)

# actually is nimfa.methods.factorization.lsnmf
# print "Number of zero elements: ", V.size - np.count_nonzero(V)
'''
Example-1: Simple Usage -- Projected Gradient Update
Covers: 1) Input/Output 2) Metrics
'''

# Small dense target matrix used by this example.
V = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [6, 7, 8],
])
print('Target:\n%s' % V)

# NOTE(review): `distance` is handed to Lsnmf as an extra keyword option;
# confirm that the algorithm actually consumes it.
lsnmf = nimfa.Lsnmf(
    V,
    distance="euclidean",
    seed='random_vcol',
    max_iter=10,
    rank=3,
)
lsnmf_fit = lsnmf()

# Fetch both factors, then report them and their product.
W = lsnmf_fit.basis()
H = lsnmf_fit.coef()
print('Basis matrix:\n%s' % W)
print('Mixture matrix:\n%s' % H)
print('Target estimate:\n%s' % np.dot(W, H))

# Fit quality under two metrics, plus the iteration count.
print('K-L divergence: %5.3f' % lsnmf_fit.distance(metric='kl'))
print('Euclidean distance: %5.3f' % lsnmf_fit.distance(metric='euclidean'))
print('Iterations: %d' % lsnmf_fit.n_iter)
예제 #10
0
import numpy as np

import nimfa

# Random 40 x 100 target matrix.
V = np.random.rand(40, 100)

# Rank-10 LSNMF, limited to 12 iterations.
lsnmf = nimfa.Lsnmf(
    V,
    seed="random_vcol",
    rank=10,
    max_iter=12,
    sub_iter=10,
    inner_sub_iter=10,
    beta=0.1,
)
lsnmf_fit = lsnmf()
예제 #11
0
    node_feature = args.node_feature
    out_prefix = args.output_prefix
    out_dir = args.output_dir

    refex_features = np.loadtxt(node_feature, delimiter=',')
    actual_fx_matrix = refex_features[:, 1:]

    n, f = actual_fx_matrix.shape
    print 'Number of Features: ', f
    print 'Number of Nodes: ', n

    sparsity_threshold = 1.0
    for i in xrange(1, 6):
        for rank in xrange(20, 29 + 1):
            lsnmf = nimfa.Lsnmf(actual_fx_matrix, rank=rank, max_iter=200)
            lsnmf_fit = lsnmf()
            G = np.asarray(lsnmf_fit.basis())
            F = np.asarray(lsnmf_fit.coef())

            G, F = glrd_sparse(V=actual_fx_matrix,
                               G=G,
                               F=F,
                               r=rank,
                               err_V=sparsity_threshold,
                               err_F=sparsity_threshold)
            G[G <= 0.0] = 0.0
            F[F <= 0.0] = 0.0

            w_out = '%s-%s-%s-nodeRoles.txt' % (rank, i, out_prefix)
            h_out = '%s-%s-%s-roleFeatures.txt' % (rank, i, out_prefix)
예제 #12
0
# Scale Y by 5, truncate to integers and shift by one.
Y = (Y * 5).astype(int) + 1

# Alternative input (sample matrices by Ganesh):
#X = np.loadtxt("X_gen.txt")
#Y = np.loadtxt("Y_gen.txt")

# Boolean mask of the entries that are kept: True where the draw exceeds 0.3.
T = np.random.rand(10, 20)
T = T > 0.3
Q = (np.dot(X, Y)).astype(float)
# Zero every entry that is not selected by the mask.
Q[~T] = 0
sparseQ = csr_matrix(Q)

# Factorize the sparse matrix and densify the factors and the reconstruction.
L = nf.Lsnmf(sparseQ, seed="random_vcol", rank=5, max_iter=10, beta=0.1)
Lmod = L.factorize()
Xnew = L.basis().todense()
Ynew = L.coef().todense()
Qnew = L.fitted().todense()

# Residuals between the original and the recovered matrices.
Qx = Q - Qnew
Xx = X - Xnew
Yx = Y - Ynew

# Single-argument print() calls behave identically on Python 2 and Python 3.
print(rmse(Xx))
print(X)
print(Xnew)
print(Qnew)
예제 #13
0
        model_cost_b = code_length_b * (n_b + f_b)
        model_cost_bu = code_length_bu * (n_bu + f_bu)

        model_costs.append((bins, model_cost_b))
        model_costs_uniform.append((bins, model_cost_bu))

        p_ari = []
        p_ari_u = []
        s_ari = []
        s_ari_u = []

        c = 0
        for rank in xrange(33, 43):
            for i in xrange(10):
                c += 1
                fctr = nimfa.Lsnmf(full_fx_matrix, rank=rank, max_iter=200)
                fctr_b = nimfa.Lsnmf(binned_fx_matrix, rank=rank, max_iter=200)
                fctr_bu = nimfa.Lsnmf(binned_fx_matrix_u,
                                      rank=rank,
                                      max_iter=200)

                fctr_res = fctr()
                fctr_res_b = fctr_b()
                fctr_res_bu = fctr_bu()

                W = np.asarray(fctr_res.basis())
                W_b = np.asarray(fctr_res_b.basis())
                W_bu = np.asarray(fctr_res_bu.basis())

                actual_primary, actual_secondary = get_role_assignment(W)
                estimated_primary, estimated_secondary = get_role_assignment(
예제 #14
0
# Input matrix and its dimensions (presumably experts x categories, going by
# the variable name; verify against the code that builds it).
R = np.array(expert_category_matrix)
N = len(R)
M = len(R[0])
K = 10

# Hand-written factorization starting from random N x K and M x K factors.
P = np.random.rand(N, K)
Q = np.random.rand(M, K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

# Single-argument print() calls behave identically on Python 2 and Python 3.
print('#################computing the R hat matrix')
print('approach nimfa , given library for computing R hat')

V = R  #nimfa.examples.medulloblastoma.read(normalize=True)

# The same matrix factorized with nimfa's LSNMF, followed by fit statistics.
lsnmf = nimfa.Lsnmf(V, seed='random_vcol', rank=50, max_iter=100)
lsnmf_fit = lsnmf()

print('Rss: %5.4f' % lsnmf_fit.fit.rss())
print('Evar: %5.4f' % lsnmf_fit.fit.evar())
print('K-L divergence: %5.4f' % lsnmf_fit.distance(metric='kl'))
print('Sparseness, W: %5.4f, H: %5.4f' % lsnmf_fit.fit.sparseness())

print('#################   Finished computing the R hat matrix')

# for each expert in train set, output the score in the table for that [expert][category]
#compare it to the output in the train file
#this gives train accuracy
print(nR)
#convert test data into expert_tag table
# for each expert, output the score in the table for that [expert][category]
예제 #15
0
def alsPred(matrix=None,
            sparseness=0.3,
            method="lsnmf",
            n_features=5,
            n_iterations=30,
            low=1,
            high=5):
    """
    Decompose a matrix using the specified ALS algorithm.

    Instead of the matrix, dimensions can be given to create a random
    matrix of the specified sparseness; that generated matrix is printed.

    INPUTS
    matrix : Either a 2-D numpy array, or a tuple of size 2 specifying the
    dimensions of the random matrix to generate

    sparseness : Probability that a cell of the random matrix is zeroed

    method : Specifies the ALS algorithm variation
    "lsnmf" for the Lsnmf model implemented in the Nimfa package
    "wr" for ALS-WR

    n_features : Number of features of the decomposed matrices

    n_iterations : Number of iterations for the ALS algorithm

    low, high : Minimum and maximum values for filling the random matrix

    RETURNS
    For "lsnmf": (matrix, predicted matrix, U, V, RMSE), with the predicted
    matrix and both factors rounded to 2 decimals.
    For "wr": (matrix, output[0], output[1], output[2], RMSE), where output
    is the result of the module-level ``als`` function.
    For any other method: 0.

    RAISES
    ValueError : if a known method is requested but no matrix was given.
    """

    if isinstance(matrix, tuple) and len(matrix) == 2:
        # Generate a random integer matrix and zero out cells at random.
        mat = np.random.randint(low=low, high=(high + 1), size=matrix)
        mask = np.random.choice([0.0, 1.0],
                                size=mat.size,
                                p=[sparseness,
                                   (1.0 - sparseness)]).reshape(mat.shape)
        mat = mat * mask
        print(mat)
    else:
        # A ready-made matrix is used as given. Previously `mat` was never
        # bound on this path, so every call with a real matrix failed.
        mat = matrix

    if method not in ("lsnmf", "wr"):
        return 0

    if mat is None:
        raise ValueError(
            "matrix must be a 2-D array or a (rows, columns) tuple")

    if method == "lsnmf":
        sparseMat = csr_matrix(mat)

        # Deliberately not named `als`: a local of that name would shadow the
        # module-level als() function needed by the "wr" branch.
        model = nf.Lsnmf(sparseMat,
                         seed="random_vcol",
                         rank=n_features,
                         max_iter=n_iterations,
                         beta=0.5)
        model_fit = model.factorize()

        U = np.array(model_fit.basis().todense())
        V = np.array(model_fit.coef().todense())
        predictedMat = np.round(np.array(np.dot(U, V)), 2)

        RMSE = RMSE_matrix(mat, predictedMat)
        return mat, predictedMat, np.round(U, 2), np.round(V, 2), RMSE

    # method == "wr"
    output = als(mat, n_factors=n_features, n_iterations=n_iterations)
    RMSE = RMSE_matrix(mat, output[0])
    return mat, output[0], output[1], output[2], RMSE
예제 #16
0
import numpy as np

import nimfa

# Small dense target matrix used by this example.
V = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [6, 7, 8],
])
print('Target:\n%s' % V)

# Rank-3 LSNMF limited to 10 iterations.
lsnmf = nimfa.Lsnmf(V, rank=3, max_iter=10)
lsnmf_fit = lsnmf()

# Fetch both factors before reporting them.
W = lsnmf_fit.basis()
H = lsnmf_fit.coef()
print('Basis matrix:\n%s' % W)
print('Mixture matrix:\n%s' % H)

# Quality measures of the fit, then the reconstructed target.
print('K-L divergence: %5.3f' % lsnmf_fit.distance(metric='kl'))
print('Rss: %5.3f' % lsnmf_fit.fit.rss())
print('Evar: %5.3f' % lsnmf_fit.fit.evar())
print('Iterations: %d' % lsnmf_fit.n_iter)
print('Target estimate:\n%s' % np.dot(W, H))
예제 #17
0
# W : Keep check of which cells have ratings


# R holds the ratings; W marks which cells carry a rating.
R = np.zeros((num_users, num_movies))
W = np.zeros((num_users, num_movies))

# userID / movieID are 1-based while the matrices are 0-based. The whole
# table is written in one vectorized assignment instead of a per-row loop;
# for a repeated (user, movie) pair the last row wins, as it did in the loop.
user_idx = train['userID'].values.astype(int) - 1
movie_idx = train['movieID'].values.astype(int) - 1
R[user_idx, movie_idx] = train['rating'].values
W[user_idx, movie_idx] = 1

sparseR = csr_matrix(R)

#%%################################################################

## ALS implementation using NIMFA package
# Try using different rank (#of features) and see the reduction in training error
als = nf.Lsnmf(sparseR, seed="random_vcol", rank=100, max_iter=15, beta=0.1)
als_fit = als.factorize()


#%%
user_features = als_fit.basis()
movie_features = als_fit.coef()
predictedR = np.dot(user_features.todense(), movie_features.todense())

get_train_error(R, predictedR, W, rmse=True)
#%%
get_test_error(test, predictedR, rmse=True)
#%%


예제 #18
0
            fica = FastICA(n_components=k).fit(datas)
            H = fica.components_
        if seed == "skmeans":
            skm = SphericalKMeans(n_clusters=k).fit(datas)
            H = skm.cluster_centers_
        if seed == "kmeans":
            skm = KMeans(n_clusters=k).fit(datas)
            H = skm.cluster_centers_

        W = np.random.random((_data_dimension, k))
        options['seed'] = None
        options['W'] = W
        options['H'] = H

    if methode == "lsnmf":
        init_nmf = nimfa.Lsnmf(datas, **options)

    if methode == "nmf":
        init_nmf = nimfa.Nmf(datas, **options)

    if methode == "sepnmf":
        init_nmf = nimfa.SepNmf(datas, **options)

    if methode == "nsnmf":
        init_nmf = nimfa.Nsnmf(datas, **options)

    res_nmf = init_nmf()

    result.append(res_nmf.summary())

    name = "nmf_result/" + str(methode) + "-norm-" + str(
예제 #19
0
import numpy as np

import nimfa

# Small dense target matrix used by this example.
V = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [6, 7, 8],
])
print('Target:\n%s' % V)

# Rank-3 LSNMF with error tracking switched on.
lsnmf = nimfa.Lsnmf(
    V,
    seed='random_vcol',
    max_iter=10,
    rank=3,
    track_error=True,
)
lsnmf_fit = lsnmf()

# Fetch both factors before reporting them.
W = lsnmf_fit.basis()
H = lsnmf_fit.coef()
print('Basis matrix:\n%s' % W)
print('Mixture matrix:\n%s' % H)

# Objective function value for each iteration
print('Error tracking:\n%s' % lsnmf_fit.fit.tracker.get_error())

# The summary is read by key for the final report.
sm = lsnmf_fit.summary()
print('Rss: %5.3f' % sm['rss'])
print('Evar: %5.3f' % sm['evar'])
print('Iterations: %d' % sm['n_iter'])
예제 #20
0
                             bbox_inches='tight')

print("Finished Company BM25")

###############################################################################
##ANALYST BM25
###############################################################################

# Fit the grouped vectorizer, then build the analyst document-term matrix.
analysts_vec = GroupVectorizer(
    tf_type='bm25',
    apply_idf=True,
    idf_type='smooth',
    apply_dl=True,
    dl_type='linear',
).fit(trigram_docs, analysts)
analyst_doc_term_matrix = analysts_vec.transform(trigram_docs, analysts)

# Rank estimation over the candidate ranks 2..19, ten runs per rank.
candidate_ranks = np.arange(2, 20)
mod = nimfa.Lsnmf(V=analyst_doc_term_matrix, max_iter=200, rank=5)
nmf_grid = mod.estimate_rank(rank_range=candidate_ranks,
                             what=['cophenetic', 'rss'],
                             n_run=10)

# One bar per candidate rank showing its cophenetic score.
cophenetic_scores = [entry['cophenetic'] for entry in nmf_grid.values()]
fig = plt.figure(figsize=(8, 6))
fig_plt = sns.barplot(x=np.arange(2, 20), y=cophenetic_scores)
fig_plt.set_xlabel("N-Components")
fig_plt.set_ylabel("Cophenetic")
fig_plt.set_title('Cophenetic Score vs. N-Components')
print(fig_plt)

output_path = figure_directory + "NMF/BM25/AnalystCophenticScoresLSNMF.png"
fig_plt.get_figure().savefig(output_path, bbox_inches='tight')