Exemplo n.º 1
0
def matdecomp(imregion, method):
    """Compute a matrix decomposition of an image region.

    Parameters
    ----------
    imregion : 2D array
        The image region data
    method : str
        Decomposition to apply: 'eigen' or 'NMF'

    Returns
    -------
    For 'eigen', the second value returned by ``LA.eig(imregion)``
    (its columns are the eigenvectors). For 'NMF', the ``components_``
    of a 2-component ``ProjectedGradientNMF`` fitted to ``imregion``.

    Raises
    ------
    ValueError
        If ``method`` is neither 'eigen' nor 'NMF'.
    """
    if method == 'eigen':
        # Columns are eigenvectors; the eigenvalues are not needed here.
        _, e_vecs = LA.eig(imregion)
        return e_vecs

    if method == 'NMF':
        model = ProjectedGradientNMF(n_components=2, init='random',
                                     random_state=0)
        model.fit(imregion)
        return model.components_

    # Previously an unsupported method fell through and returned None,
    # which only surfaced later as an unrelated error in the caller.
    raise ValueError(
        "Unknown method %r; expected 'eigen' or 'NMF'" % (method,))
Exemplo n.º 2
0
 def extract_codes(self,
                   X,
                   n_components=16,
                   log_amplitude=True,
                   **nmf_args):
     """Fit an NMF dictionary to patches extracted from a spectrogram.

     inputs:
         X - spectrogram data (frequency x time)
         n_components - how many components to extract [16]
         log_amplitude - whether to apply log amplitude scaling log(1+X)
         **nmf_args - keyword arguments for ProjectedGradientNMF(...);
             'sparseness', 'init' and 'beta' fall back to 'components',
             'nndsvd' and 0.5 when the caller does not supply them
     outputs:
         self.n_components - the requested number of components
         self.model / self.D - the fitted ProjectedGradientNMF (one object)
     The model is fitted on self.data, which is presumably filled in by
     _extract_data_patches - confirm against that helper.
     """
     # z-scoring is always disabled for the non-negative factorisation.
     self._extract_data_patches(X, False, log_amplitude)
     self.n_components = n_components
     for option, fallback in (('sparseness', 'components'),
                              ('init', 'nndsvd'),
                              ('beta', 0.5)):
         nmf_args.setdefault(option, fallback)
     print("NMF...")
     learner = ProjectedGradientNMF(n_components=self.n_components,
                                    **nmf_args)
     self.model = learner
     learner.fit(self.data)
     self.D = learner
Exemplo n.º 3
0
 def fit(self, trainSamples, trainTargets):
     """Factorise the training data into two latent-factor matrices.

     Wraps the samples and targets in a ``MemeryDataModel``, takes the
     matrix returned by its ``getData()`` and fits a single
     ``ProjectedGradientNMF`` with ``self.factors`` components to it.

     Stores the data model in ``self.dataModel``, the transformed data in
     ``self.pu`` and the transposed ``components_`` in ``self.qi``.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000, nls_max_iter=1000)
     # Fit exactly once. The previous code called fit() again after
     # fit_transform(), which repeated the whole optimisation and took
     # pu and qi from two separate runs.
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()
Exemplo n.º 4
0
Arquivo: nmf.py Projeto: kuntzer/sclas
class NMF(method.Method):
	"""Train/encode/decode wrapper around a ``ProjectedGradientNMF``."""

	def __init__(self, params):
		# ``params`` is kept and forwarded verbatim as keyword arguments.
		self.params = params
		self.dec = ProjectedGradientNMF(**params)

	def __str__(self):
		return "Non-Negative matrix factorization by Projected Gradient (NMF)"

	def train(self, data):
		"""
		Fit the NMF on the whitened data.

		:param data: whitened data, ready to use
		"""
		decomposer = self.dec
		decomposer.fit(data)

	def encode(self, data):
		"""
		Encode ready-to-use data with the fitted decomposer.

		:returns: result of the decomposer's ``transform`` for ``data``
		"""
		encoded = self.dec.transform(data)
		return encoded

	def decode(self, components):
		"""
		Map encoded components back to whitened reconstructed data.

		:returns: result of the decomposer's ``inverse_transform``
		"""
		reconstructed = self.dec.inverse_transform(components)
		return reconstructed
    def init_rois(self, n_components=100, show=False):
        """Compute initial ROI estimates from ``self.Y`` and store them.

        Runs ``greedyROI2d`` for ``n_components`` regions, optionally plots
        the mean image with the returned centres, fits an AR model with
        ``arpfit`` on the pixels where ``Ain`` has a non-zero row sum, and
        fits a 1-component NMF to the clipped residual.

        Sets ``self.Yr``, ``self.Cin``, ``self.fin``, ``self.Ain``,
        ``self.P`` and ``self.Cn``.
        """
        Ain, Cin, center = greedyROI2d(
            self.Y, nr=n_components, gSig=[2, 2], gSiz=[7, 7],
            use_median=False)
        Cn = np.mean(self.Y, axis=-1)

        if show:
            height, width = self.Y.shape[0], self.Y.shape[1]
            pl.imshow(Cn, interpolation='none')
            pl.colorbar()
            pl.scatter(x=center[:, 1], y=center[:, 0], c='m', s=40)
            pl.axis((-0.5, width - 0.5, -0.5, height - 0.5))
            pl.gca().invert_yaxis()

        active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
        # Flatten the first two axes (column-major) so each row is one pixel.
        flat_shape = (self.Y.shape[0] * self.Y.shape[1], self.Y.shape[2])
        Yr = np.reshape(self.Y, flat_shape, order='F')
        P = arpfit(Yr, p=2, pixels=active_pixels)

        # Whatever the ROIs do not explain, clipped at zero for the NMF.
        residual = np.maximum(Yr - np.dot(Ain, Cin), 0)
        background = ProjectedGradientNMF(n_components=1,
                                          init='random',
                                          random_state=0)
        background.fit(residual)
        fin = background.components_.squeeze()

        self.Yr = Yr
        self.Cin = Cin
        self.fin = fin
        self.Ain = Ain
        self.P = P
        self.Cn = Cn
    def get_cluster_membership(self):
        """Determine the cluster number that each column of H belongs to.

        Factorises ``self._matrix`` with ``self._num_clusters`` components.
        For every column of the components matrix H, the row holding the
        largest value is taken as the cluster (first row wins on ties).

        Returns a tuple ``(clusters, w, h)``.
        """
        model = ProjectedGradientNMF(n_components=self._num_clusters,
                                     init='random',
                                     beta=.3,
                                     eta=.5,
                                     max_iter=5000)

        w = model.fit_transform(self._matrix)
        h = model.components_

        candidate_rows = range(self._num_clusters)
        clusters = [
            max(candidate_rows, key=lambda row_idx: h[row_idx][col_idx])
            for col_idx in range(len(h[0]))
        ]

        return (clusters, w, h)
Exemplo n.º 7
0
def _nmf(X, K):
    """Return ``(A, B)`` where B holds K NMF components of X.

    B is the ``components_`` of a ``ProjectedGradientNMF`` fitted to X.
    A is obtained by multiplying X with the pseudo-inverse of B, not
    taken from the model's own transform.
    """
    model = ProjectedGradientNMF(n_components=K, max_iter=1000)
    model.fit(X)

    basis = model.components_
    coefficients = np.dot(X, np.linalg.pinv(basis))
    return (coefficients, basis)
Exemplo n.º 8
0
def nmfModel(matrix, nTopics):
    """Factorise ``matrix`` into ``nTopics`` NMF components and time it.

    Returns ``(W, H, nmf)``: the transformed data, the model's
    ``components_`` and the fitted ``ProjectedGradientNMF`` itself.
    Progress and elapsed minutes are printed to stdout.
    """
    t = time()
    # Single-argument print(...) gives the same output on Python 2 and
    # also parses on Python 3, unlike the bare print statement.
    print("Starting Factorization")
    nmf = ProjectedGradientNMF(nTopics, max_iter=220, sparseness='data', init='nndsvd')
    W = nmf.fit_transform(matrix)
    H = nmf.components_
    print("Factorization took %s minutes" % (round((time() - t) / 60., 2),))
    return W, H, nmf
Exemplo n.º 9
0
 def fit(self, trainSamples, trainTargets):
     """Factorise the training data into two latent-factor matrices.

     Wraps the samples and targets in a ``MemeryDataModel``, takes the
     matrix returned by its ``getData()`` and fits a single
     ``ProjectedGradientNMF`` with ``self.factors`` components to it.

     Stores the data model in ``self.dataModel``, the transformed data in
     ``self.pu`` and the transposed ``components_`` in ``self.qi``.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors,
                                  max_iter=1000,
                                  nls_max_iter=1000)
     # Fit exactly once. Calling fit() a second time repeated the whole
     # optimisation and took pu and qi from two separate runs.
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()
 def nmf(self, k):
     """Factorise ``self.tdm`` into ``k`` components.

     Stores the transformed data in ``self.P``, the transposed components
     in ``self.Q`` and the reconstruction error in ``self.er``.
     Returns ``(P, Q)``.
     """
     model = ProjectedGradientNMF(n_components=k, max_iter=200)
     left = model.fit_transform(self.tdm)
     right = model.components_.T
     self.P, self.Q = left, right
     self.er = model.reconstruction_err_
     return left, right
Exemplo n.º 11
0
 def extract_codes(self, X, **kwargs):
     """Fit an NMF model to data patches extracted from ``X``.

     ``kwargs`` are passed to ``ProjectedGradientNMF``; 'sparseness',
     'init' and 'beta' fall back to 'components', 'nndsvd' and 0.5.
     The fitted model is stored as both ``self.model`` and ``self.D``.
     Returns ``self``.
     """
     self.standardize = False
     self._extract_data_patches(X)
     for option, fallback in (('sparseness', 'components'),
                              ('init', 'nndsvd'),
                              ('beta', 0.5)):
         kwargs.setdefault(option, fallback)
     print("NMF...")
     learner = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
     self.model = learner
     learner.fit(self.data)
     self.D = learner
     return self
Exemplo n.º 12
0
    def __init__(self,model,beta=1, eta=0.1, init='nndsvd', max_iter=500,
        n_components=100, nls_max_iter=2000, random_state=0, sparseness=None,tol=0.0001):
        """Validate ``model``, build the NMF estimator and factorise.

        ``model`` is checked with ``check_non_negtive`` and stored; every
        other argument is forwarded unchanged to ``ProjectedGradientNMF``.
        The latent matrices from ``construct_latent_matrics`` are stored in
        ``self.user_latent_M`` and ``self.item_latent_M``.
        """
        self.check_non_negtive(model)
        self.model = model
        super(NMFpredictor, self).__init__()

        nmf_options = dict(
            beta=beta,
            eta=eta,
            init=init,
            max_iter=max_iter,
            n_components=n_components,
            nls_max_iter=nls_max_iter,
            random_state=random_state,
            sparseness=sparseness,
            tol=tol,
        )
        self.nmf = ProjectedGradientNMF(**nmf_options)
        latent_matrices = self.construct_latent_matrics()
        self.user_latent_M, self.item_latent_M = latent_matrices
def calcNMF(delta_data, components):
    """Run NMF on the preprocessed data and re-expand it to all rows.

    ``preprocess(delta_data)`` must provide 'cleanMatrix' (the matrix that
    is factorised) and 'cleanind' (the index used to place the result).
    Rows not selected by 'cleanind' stay NaN in the returned transform.

    Returns a dict with 'transform' (``delta_data.shape[0]`` x
    ``components``) and 'weights' (the transposed ``components_``).
    """
    data = preprocess(delta_data)
    nmf = ProjectedGradientNMF(n_components=components)
    x_nmf = nmf.fit_transform(data['cleanMatrix'])

    # Start from an all-NaN matrix and fill in the rows that were kept.
    nmf_fill = np.full((delta_data.shape[0], components), np.nan)
    nmf_fill[data['cleanind']] = x_nmf

    return {'transform': nmf_fill, 'weights': nmf.components_.T}
Exemplo n.º 14
0
    def __nmf_initialization(A, ncomms):
        """Build initial U and V matrices from an NMF of ``A``.

        Returns ``{'U': ..., 'V': ...}`` where U is the transformed data and
        V the transposed ``components_``, both as ``np.matrix``. If
        scikit-learn's ``ProjectedGradientNMF`` cannot be imported, a
        message is printed and None is returned.
        """
        try:
            from sklearn.decomposition import ProjectedGradientNMF
        except ImportError:
            print("sklearn module is missing.")
            return

        factoriser = ProjectedGradientNMF(n_components=ncomms, init='nndsvd')
        left = np.asmatrix(factoriser.fit_transform(A))
        right = np.asmatrix(factoriser.components_).T
        return {'U': left, 'V': right}
Exemplo n.º 15
0
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches
    initialization:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
    """
    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        self.patch_size, self.max_samples = patch_size, max_samples
        # Nothing is learned or extracted until extract_codes() is called.
        self.D = self.data = self.components = None
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self,
                      X,
                      n_components=16,
                      log_amplitude=True,
                      **nmf_args):
        """Fit an NMF dictionary to patches extracted from a spectrogram.

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...);
                'sparseness', 'init' and 'beta' fall back to 'components',
                'nndsvd' and 0.5 when the caller does not supply them
        outputs:
            self.n_components - the requested number of components
            self.model / self.D - the fitted ProjectedGradientNMF (one object)
        """
        # z-scoring is always disabled for the non-negative factorisation.
        self._extract_data_patches(X, False, log_amplitude)
        self.n_components = n_components
        for option, fallback in (('sparseness', 'components'),
                                 ('init', 'nndsvd'),
                                 ('beta', 0.5)):
            nmf_args.setdefault(option, fallback)
        print("NMF...")
        learner = ProjectedGradientNMF(n_components=self.n_components,
                                       **nmf_args)
        self.model = learner
        learner.fit(self.data)
        self.D = learner

    def reconstruct_spectrum(self, w=None, randomize=False):
        "reconstruct by fitting current NMF 2D dictionary to self.data"
        if w is None:
            # No activations supplied: compute and remember them.
            w = self.w = self.model.transform(self.data)
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
Exemplo n.º 16
0
def test_projgrad_nmf_sparseness():
    # Sparsity constraints must increase sparseness in the part of the
    # factorisation they are applied to, compared with an unconstrained fit.
    tol = 1e-2
    A = np.abs(random_state.randn(10, 10))

    def fitted(**extra):
        # Same estimator settings for every fit; only `sparseness` varies.
        return ProjectedGradientNMF(n_components=5, random_state=0,
                                    tol=tol, **extra).fit(A)

    m = fitted()
    data_sp = fitted(sparseness='data').data_sparseness_
    comp_sp = fitted(sparseness='components').comp_sparseness_
    assert_greater(data_sp, m.data_sparseness_)
    assert_greater(comp_sp, m.comp_sparseness_)
Exemplo n.º 17
0
    def reducedim_nmf(self,factors):
        """Reduce ``self.fullmatrix`` to ``factors`` dimensions with NMF.

        Stores the transformed matrix (left factor w, n*k) in
        ``self.reducedmatrix`` and gives every vector in ``self.vectordict``
        its row of that matrix as a ``sparse.csc_matrix``. When
        ``self.testing`` is set, the matrices and their product are printed.
        """
        # Single-argument print(...) gives the same output on Python 2 and
        # also parses on Python 3, unlike the bare print statement.
        print("Number of factors is " + str(factors))

        model = ProjectedGradientNMF(n_components=factors, init='random', random_state=0)
        self.reducedmatrix = model.fit_transform(self.fullmatrix)  # left factor w (n*k)
        h = model.components_  # right factor h (k*d)

        if self.testing:
            print(self.fullmatrix)
            print(self.reducedmatrix)
            print(h)
            v = numpy.dot(self.reducedmatrix, h)
            print(v)
        print("Completed NMF routine")
        for vector in self.vectordict.values():
            vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
        print("Stored individual vectors")
Exemplo n.º 18
0
 def train_model(self):
     """Build the rating matrix from ``self.train`` and factorise it.

     Columns 1, 2 and 3 of each row in ``self.train.values`` are read as
     row index, column index and rating. The 9238 x 7973 matrix is
     factorised with ``self.n_features`` components; the factors are kept
     in ``self.pu`` / ``self.qi`` and written to the files '50pu' and
     '50qi'. ``self.ValidateF1()`` is called before the files are written.
     """
     print('begin')
     RATE_MATRIX = np.zeros((9238, 7973))
     for line in self.train.values:
         print(line)
         uid = int(float(line[1]))
         iid = int(float(line[2]))
         RATE_MATRIX[uid][iid] = int(float(line[3]))
     V = spr.csr_matrix(RATE_MATRIX)
     model = ProjectedGradientNMF(n_components=self.n_features, max_iter=1000, nls_max_iter=10000)
     # Fit exactly once. The previous fit_transform() + fit() pair ran the
     # optimisation twice and took pu and qi from two separate runs.
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()
     print(model.reconstruction_err_)
     self.ValidateF1()
     pd.DataFrame(np.array(self.pu)).to_csv('50pu')
     pd.DataFrame(np.array(self.qi)).to_csv('50qi')
     print("model generation over")
Exemplo n.º 19
0
def recommend(matrix_3filled, matrix_raw, user, numOfNeighbors=5):
    """Recommend one item (column index) for ``user``.

    ``matrix_3filled`` is factorised with a 2-component NMF and rebuilt as
    the product of the two factors. The ``numOfNeighbors`` rows closest to
    the user's row (squared Euclidean distance, the user excluded) are
    averaged column-wise.

    Returns the column of ``matrix_raw`` that is 0 for ``user`` and has the
    highest positive neighbour average. If no such column exists, the
    column with the highest average overall is returned.
    """
    # The following lines use Scikit-learn. For more information, refer to
    # the documentation link in README.
    model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
    # One fit_transform() is enough; the old separate fit() call only
    # repeated the same factorisation.
    transformed = np.dot(model.fit_transform(matrix_3filled), model.components_)

    # Squared distance from the current user to every row; the user is
    # masked so it cannot be picked as its own neighbour.
    distances = np.sum((transformed - transformed[user]) ** 2, axis=1)
    distances[user] = np.inf
    # Stable sort keeps the lowest index first on ties.
    neighbors = np.argsort(distances, kind='mergesort')[:numOfNeighbors]

    # Average per column over the neighbours. The old code accumulated
    # into a Python list, where `+=` extends the list instead of adding
    # element-wise, so the following division raised TypeError.
    average = transformed[neighbors].mean(axis=0)

    # Columns the current user has not rated yet.
    unratedItems = [x for x in range(np.shape(matrix_raw)[1])
                    if matrix_raw[user][x] == 0]

    # Default to the best-rated item overall; prefer an unrated item
    # whenever one has a positive average.
    item = np.argmax(average)
    maxAverage = 0
    for candidate in unratedItems:
        if average[candidate] > maxAverage:
            maxAverage = average[candidate]
            item = candidate
    return item
Exemplo n.º 20
0
def matrixFactorization(inmatrix, p_components=False):
	"""Fit a ``ProjectedGradientNMF`` to ``inmatrix`` and return the model.

	If ``p_components`` is truthy it is used as the number of components.
	Otherwise a full PCA is fitted and the number of leading components
	whose cumulative explained-variance ratio stays at or below 0.9 is
	used, with a minimum of one.
	"""
	from sklearn.decomposition import PCA
	from sklearn.decomposition import ProjectedGradientNMF
	if p_components:
		p_comp = p_components
	else:
		pca = PCA(n_components=inmatrix.shape[1])
		pca.fit(inmatrix)
		explained_variance = pca.explained_variance_ratio_.cumsum()
		explained_variance = explained_variance[explained_variance <= .9]
		# When the first component alone explains more than 90% the
		# filter leaves nothing; never ask NMF for zero components.
		p_comp = max(1, len(explained_variance))
	model = ProjectedGradientNMF(n_components=p_comp,
		init='nndsvd',
		beta=1,
		sparseness=None)
	model.fit(inmatrix)
	return model
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches
    initialization:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
    """
    def __init__(self, patch_size=(12,12), max_samples=1000000):
        self.patch_size = patch_size
        self.max_samples = max_samples
        # Nothing is learned or extracted until extract_codes() is called.
        self.D = None
        self.data = None
        self.components = None
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...);
                'sparseness', 'init' and 'beta' fall back to 'components',
                'nndsvd' and 0.5 when not supplied
        outputs:
            self.n_components - the requested number of components
            self.model / self.D - the fitted ProjectedGradientNMF (one object)
        """
        # z-scoring is always disabled for the non-negative factorisation.
        zscore = False
        self._extract_data_patches(X, zscore, log_amplitude)
        self.n_components = n_components
        nmf_args.setdefault('sparseness', 'components')
        nmf_args.setdefault('init', 'nndsvd')
        nmf_args.setdefault('beta', 0.5)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
        self.model.fit(self.data)
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        "reconstruct by fitting current NMF 2D dictionary to self.data"
        if w is None:
            # No activations supplied: compute and remember them.
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
Exemplo n.º 22
0
 def extract_codes(self, X, **kwargs):
     """Fit an NMF model to data patches extracted from ``X``.

     ``kwargs`` are passed to ``ProjectedGradientNMF``; 'sparseness',
     'init' and 'beta' fall back to 'components', 'nndsvd' and 0.5.
     The fitted model is stored as both ``self.model`` and ``self.D``.
     Returns ``self``.
     """
     self.standardize = False
     self._extract_data_patches(X)
     kwargs.setdefault('sparseness', 'components')
     kwargs.setdefault('init', 'nndsvd')
     kwargs.setdefault('beta', 0.5)
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print("NMF...")
     self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
     self.model.fit(self.data)
     self.D = self.model
     return self
    def init_rois(self, n_components=100, show=False):
        """Compute initial ROI estimates from ``self.Y`` and store them.

        Runs ``greedyROI2d`` for ``n_components`` regions, optionally plots
        the mean image with the returned centres, fits an AR model with
        ``arpfit`` on the pixels where ``Ain`` has a non-zero row sum, and
        fits a 1-component NMF to the clipped residual.

        Sets ``self.Yr``, ``self.Cin``, ``self.fin``, ``self.Ain``,
        ``self.P`` and ``self.Cn``.
        """
        Ain, Cin, center = greedyROI2d(
            self.Y, nr=n_components, gSig=[2, 2], gSiz=[7, 7],
            use_median=False)
        Cn = np.mean(self.Y, axis=-1)

        if show:
            height, width = self.Y.shape[0], self.Y.shape[1]
            pl.imshow(Cn, interpolation='none')
            pl.colorbar()
            pl.scatter(x=center[:, 1], y=center[:, 0], c='m', s=40)
            pl.axis((-0.5, width - 0.5, -0.5, height - 0.5))
            pl.gca().invert_yaxis()

        active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
        # Flatten the first two axes (column-major) so each row is one pixel.
        flat_shape = (self.Y.shape[0] * self.Y.shape[1], self.Y.shape[2])
        Yr = np.reshape(self.Y, flat_shape, order='F')
        P = arpfit(Yr, p=2, pixels=active_pixels)

        # Whatever the ROIs do not explain, clipped at zero for the NMF.
        residual = np.maximum(Yr - np.dot(Ain, Cin), 0)
        background = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
        background.fit(residual)
        fin = background.components_.squeeze()

        self.Yr = Yr
        self.Cin = Cin
        self.fin = fin
        self.Ain = Ain
        self.P = P
        self.Cn = Cn
Exemplo n.º 24
0
    def reducedim_nmf(self, factors):
        """Reduce ``self.fullmatrix`` to ``factors`` dimensions with NMF.

        Stores the transformed matrix (left factor w, n*k) in
        ``self.reducedmatrix`` and gives every vector in ``self.vectordict``
        its row of that matrix as a ``sparse.csc_matrix``. When
        ``self.testing`` is set, the matrices and their product are printed.
        """
        # Single-argument print(...) gives the same output on Python 2 and
        # also parses on Python 3, unlike the bare print statement.
        print("Number of factors is " + str(factors))

        model = ProjectedGradientNMF(n_components=factors,
                                     init='random',
                                     random_state=0)
        self.reducedmatrix = model.fit_transform(
            self.fullmatrix)  # left factor w (n*k)
        h = model.components_  # right factor h (k*d)

        if self.testing:
            print(self.fullmatrix)
            print(self.reducedmatrix)
            print(h)
            v = numpy.dot(self.reducedmatrix, h)
            print(v)
        print("Completed NMF routine")
        for vector in self.vectordict.values():
            vector.array = sparse.csc_matrix(
                self.reducedmatrix[vector.rowindex])
        print("Stored individual vectors")
Exemplo n.º 25
0
class NMFSpectrum(SparseApproxSpectrum):
    """SparseApproxSpectrum variant that learns its dictionary with NMF."""

    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        """Fit an NMF model to data patches extracted from ``X``.

        ``kwargs`` are passed to ``ProjectedGradientNMF``; 'sparseness',
        'init' and 'beta' fall back to 'components', 'nndsvd' and 0.5.
        The fitted model is stored as both ``self.model`` and ``self.D``.
        Returns ``self``.
        """
        self.standardize = False
        self._extract_data_patches(X)
        for option, fallback in (('sparseness', 'components'),
                                 ('init', 'nndsvd'),
                                 ('beta', 0.5)):
            kwargs.setdefault(option, fallback)
        print("NMF...")
        learner = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
        self.model = learner
        learner.fit(self.data)
        self.D = learner
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct via the parent class, computing ``w`` if omitted."""
        if w is None:
            # No activations supplied: compute and remember them.
            w = self.w = self.model.transform(self.data)
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
def decomposition(V, W, H, n_components, solver='mu', update_H=True):
    """Factorise ``V`` into ``(W, H)`` with the requested solver.

    For any ``solver`` other than 'project' the work is delegated to
    ``non_negative_factorization`` with the given ``W``, ``H`` and
    ``update_H``. For 'project' a ``ProjectedGradientNMF`` is fitted and
    the ``W`` / ``H`` arguments are ignored.

    Returns the tuple ``(W, H)``.
    """
    if solver != 'project':
        W, H, _ = non_negative_factorization(V,
                                             W=W,
                                             H=H,
                                             n_components=n_components,
                                             update_H=update_H,
                                             max_iter=1000,
                                             solver=solver)
    else:
        model = ProjectedGradientNMF(n_components=n_components,
                                     init='random',
                                     random_state=0,
                                     sparseness='data',
                                     beta=0,
                                     max_iter=100000)
        # A single fit_transform() yields both factors. The previous
        # fit() + fit_transform() pair ran the optimisation (up to
        # 100000 iterations) twice.
        W = model.fit_transform(V)
        H = model.components_
    return W, H
Exemplo n.º 27
0
class NMFSpectrum(SparseApproxSpectrum):
    """SparseApproxSpectrum variant that learns its dictionary with NMF."""

    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self,**kwargs)

    def extract_codes(self, X, **kwargs):
        """Fit an NMF model to data patches extracted from ``X``.

        ``kwargs`` are passed to ``ProjectedGradientNMF``; 'sparseness',
        'init' and 'beta' fall back to 'components', 'nndsvd' and 0.5.
        The fitted model is stored as both ``self.model`` and ``self.D``.
        Returns ``self``.
        """
        self.standardize = False
        self._extract_data_patches(X)
        kwargs.setdefault('sparseness', 'components')
        kwargs.setdefault('init', 'nndsvd')
        kwargs.setdefault('beta', 0.5)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct via the parent class, computing ``w`` if omitted."""
        if w is None:
            # No activations supplied: compute and remember them.
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
Exemplo n.º 28
0
 def _nmf_fixed_component(self, i, X):
     """
     Uses sklearn to make the non negative factorization
     input: i, number of clusters for this NMF instance
            X, the matrix to factorise
     output: dict mapping each cluster's top word (read from
             attributFile) to the number of rows of W whose largest
             weight falls in that cluster. Clusters that share the same
             top word collapse into one key; the last one wins.
     author: Arthur Desjardins
     """
     model = ProjectedGradientNMF(n_components=i, init='nndsvd')
     model.fit(X)
     #  H-matrix (clusters x words)
     H = model.components_
     # W-matrix (documents x clusters)
     W = model.transform(X)
     # word list; the context manager closes the file, which the old
     # bare open(...).read() never did explicitly
     with open(attributFile) as word_file:
         words = word_file.read().split()
     # index of the highest-weighted word for each cluster
     most_relevant_words = np.argmax(H, axis=1)
     docs_per_cluster = [0] * i
     for tweet in W:
         docs_per_cluster[np.argmax(tweet)] += 1
     # A distinct loop name avoids shadowing the parameter `i`.
     return dict((words[most_relevant_words[cluster]], docs_per_cluster[cluster])
                 for cluster in range(i))
Exemplo n.º 29
0
    def _nonNegativeFactorization(self):
        """
        Uses sklearn to make the non negative factorization

        Loads the matrix from ``dataFile``, fits a ``ProjectedGradientNMF``
        (init='nndsvd') and writes the components to ``factoredHMatrix``
        and the transformed data to ``factoredWMatrix``. Both are written
        with fmt='%i'. Progress and the reconstruction error are printed.
        """
        # print(...) with one argument behaves the same on Python 2 and 3.
        print('Loading data..')
        X = np.asmatrix(np.loadtxt(dataFile))
        print('Data loaded. Making model..')
        model = ProjectedGradientNMF(init='nndsvd')
        print('Fitting model..')
        model.fit(X)
        print('Model fit')

        print('Error rate is %s' % (model.reconstruction_err_,))

        # The old code wrote `outFile.close` without parentheses, so the
        # files were never closed; `with` closes and flushes them.

        #  H-matrix
        with open(factoredHMatrix, 'w') as outFile1:
            np.savetxt(outFile1, model.components_, fmt='%i')

        # W-matrix
        with open(factoredWMatrix, 'w') as outFile2:
            np.savetxt(outFile2, model.transform(X), fmt='%i')
Exemplo n.º 30
0
def perform_nmf(X, w_dir):
    """Fit a 48-component NMF to ``X`` and plot its components.

    The components are passed to ``visualize_base_rules`` together with
    the output path ``w_dir + "base_rules_48.png"``. The reconstruction
    error is printed. Returns the fitted model.
    """
    # factorize composition into components
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Performing NMF...")
    n_com = 48
    model = ProjectedGradientNMF(n_components=n_com,
                                 sparseness='data',
                                 beta=1,
                                 eta=0.9,
                                 tol=0.000001,
                                 max_iter=2000,
                                 nls_max_iter=5000,
                                 random_state=None)
    model.fit(X)
    print(model.reconstruction_err_)
    nmf_components = model.components_
    print("done.")

    # visualize Base Rules
    f_name = w_dir + "base_rules_48.png"
    visualize_base_rules(nmf_components, n_com, f_name)

    return model
 def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
     """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
     inputs:
         X - spectrogram data (frequency x time)
         n_components - how many components to extract [16]
         log_amplitude - whether to apply log amplitude scaling log(1+X)
         **nmf_args - keyword arguments for ProjectedGradientNMF(...);
             'sparseness', 'init' and 'beta' fall back to 'components',
             'nndsvd' and 0.5 when not supplied
     outputs:
         self.n_components - the requested number of components
         self.model / self.D - the fitted ProjectedGradientNMF (one object)
     """
     # z-scoring is always disabled for the non-negative factorisation.
     zscore = False
     self._extract_data_patches(X, zscore, log_amplitude)
     self.n_components = n_components
     nmf_args.setdefault('sparseness', 'components')
     nmf_args.setdefault('init', 'nndsvd')
     nmf_args.setdefault('beta', 0.5)
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print("NMF...")
     self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
     self.model.fit(self.data)
     self.D = self.model
Exemplo n.º 32
0
class NMFpredictor(Predictor):
    """Preference predictor based on an NMF of the model's data matrix."""

    def __init__(self,model,beta=1, eta=0.1, init='nndsvd', max_iter=500,
        n_components=100, nls_max_iter=2000, random_state=0, sparseness=None,tol=0.0001):
        """Validate ``model``, build the NMF estimator and factorise.

        All arguments except ``model`` are forwarded unchanged to
        ``ProjectedGradientNMF``.
        """
        self.check_non_negtive(model)
        self.model = model
        super(NMFpredictor,self).__init__()

        self.nmf = ProjectedGradientNMF(beta=beta, eta=eta, init=init, max_iter=max_iter,
                   n_components=n_components, nls_max_iter=nls_max_iter, random_state=random_state,
                   sparseness=sparseness,tol=tol)
        self.user_latent_M, self.item_latent_M = self.construct_latent_matrics()

    def construct_latent_matrics(self):
        """Factorise the model's data matrix and print the elapsed time.

        Returns ``(user_latent_M, item_latent_M)``: the transformed data
        and the estimator's ``components_``.
        """
        start = time.time()
        data_matrix = self.model.get_data_matrix()
        user_latent_M = self.nmf.fit_transform(data_matrix)
        item_latent_M = self.nmf.components_
        # %-formatting reproduces the old `print a, b` output (including
        # the double space) on Python 2 and also parses on Python 3.
        print("use time:  %s" % (time.time() - start,))
        return user_latent_M, item_latent_M

    def predict(self,user_id, item_id):
        """Return the predicted preference, clamped to the model's range.

        The prediction is the dot product of the user's latent row and the
        item's latent column, limited to ``[min_pref, max_pref]``.
        """
        user_no = self.model.user_id_to_no[user_id]
        item_no = self.model.item_id_to_no[item_id]
        pref = np.dot(self.user_latent_M[user_no,:], self.item_latent_M[:,item_no])

        if pref > self.model.max_pref:
            pref = self.model.max_pref
        if pref < self.model.min_pref:
            pref = self.model.min_pref

        return pref

    def check_non_negtive(self,model):
        """Raise NotImplementedError if the model allows negative values."""
        if model.min_pref < 0:
            raise NotImplementedError("non_negtive!")
Exemplo n.º 33
0
import numpy


# Dump the movies, users and activity collections of the local
# 'movie_database', then run a small NMF demo on random data.
client = MongoClient('mongodb://localhost:27017/')
mydb = client['movie_database']

# %-formatting is used instead of `+` so that a missing field (None) or a
# non-string _id cannot raise TypeError. print(...) with one argument
# behaves the same on Python 2 and 3.
movies = mydb.movies.find()
for i, movie in enumerate(movies, 1):
    print("%s >> %s--%s" % (i, movie.get("title"), movie.get("_id")))

# enumerate() numbers every row; the old hand-kept counter was never
# incremented in this loop or the next, so every line was labelled 1.
# NOTE(review): this prints each user's stored password to stdout.
# Confirm that this is intended before running against real data.
users = mydb.users.find()
for i, user in enumerate(users, 1):
    print("%s >>%s--%s" % (i, user.get("_id"), user.get("password")))

activities = mydb.activity.find()
for i, activity in enumerate(activities, 1):
    print("%s >>%s" % (i, activity))

# NMF demo: factorise a random 40 x 30 matrix into 5 components.
A = numpy.random.uniform(size=[40, 30])
nmf_model = ProjectedGradientNMF(n_components=5, init='random', random_state=0)
W = nmf_model.fit_transform(A)
H = nmf_model.components_


print(W)
print(H)
Exemplo n.º 34
0
# NMF uses neither the e-mail column nor the total score, so both are
# dropped before the feature matrix is built.
fr = frame.drop('Email', 1).drop('Total Score', 1)

feature_names = fr.columns

# Every remaining column is cast to float for the factorisation.
X = np.array(fr.astype(float))
'''for i in range(60):												#Test error as a function of number of topics

   model = ProjectedGradientNMF(n_components=i, init='nndsvda',random_state=0,max_iter=500)
   model.fit(X)

   print (i,model.reconstruction_err_);'''

# Factorise the feature matrix X into 11 components.
model = ProjectedGradientNMF(n_components=11,
                             init='nndsvda',
                             random_state=0,
                             max_iter=500)  # Perform the NMF
Xtrans = model.fit_transform(X)

# For each component ("topic"), walk its weights from largest to smallest.
for topic_idx, topic in enumerate(
        model.components_
):  # Print the rubric items with the strongest contribution to each topic
    sorte = np.sort(topic)[::-1]  # weights, descending
    sorteargs = np.argsort(topic)[::-1]  # matching feature indices
    i = 0
    print("Topic #%d:" % topic_idx)
    # NOTE(review): no increment of `i` is visible in this excerpt, so the
    # loop below cannot advance as shown. The snippet appears to be
    # truncated - confirm against the full source.
    while (sorte[i] > 1.5
           ):  # Only show items whose contribution is large (1.5 is arbitrary)
        # Prints the feature name and the mean of its column in X divided
        # by the matching entry of ptvals (defined outside this excerpt).
        print feature_names[sorteargs[i]], np.mean(
            np.transpose(X)[sorteargs[i]]) / ptvals[feature_names[
                sorteargs[i]]]
Exemplo n.º 35
0
if ans != "y":
    exit()

from sklearn.cluster import MiniBatchKMeans, KMeans
km = MiniBatchKMeans(n_clusters=k,
                     init='k-means++',
                     n_init=1,
                     init_size=1000,
                     batch_size=1000,
                     verbose=1)
km2 = KMeans(n_clusters=k, init='k-means++', verbose=1)
y2 = km2.fit_transform(X)

# Ten features with the largest absolute centre weight per cluster.
# The centres come from km2, the only estimator fitted in this section.
# Reading km.cluster_centers_ on the unfitted MiniBatchKMeans raised
# AttributeError.
# NOTE(review): if the mini-batch model was meant to be used instead,
# fit `km` on X and switch back - confirm the intent.
topics5 = [[(km2.cluster_centers_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(km2.cluster_centers_[l]))[:10]]
           for l in range(k)]
# Single-argument print(...) behaves the same on Python 2 and 3.
print(topics5)

### NMF #######################
ans = raw_input("Start NMF with Scikit ? ")
if ans != "y":
    exit()

from sklearn.decomposition import ProjectedGradientNMF
# BEWARE : THIS IS COMPUTATIONALLY INTENSIVE
nmf = ProjectedGradientNMF(n_components=k, max_iter=10, nls_max_iter=100)
nmf.fit(X)

# Ten features with the largest absolute weight per NMF component.
topics6 = [[(nmf.components_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(nmf.components_[l]))[:10]]
           for l in range(k)]
def select_features_nmf(train_X, train_y, test_X, k):
    """Project train and test data onto ``k`` NMF components.

    The factorisation is fitted on ``train_X`` only and then applied to
    both sets. ``train_y`` is accepted but not used.

    Returns ``(train_X_transformed, test_X_transformed)``.
    """
    reducer = ProjectedGradientNMF(n_components=k, init='nndsvd', random_state=42)
    reducer.fit(train_X)
    reduced_train = reducer.transform(train_X)
    reduced_test = reducer.transform(test_X)
    return reduced_train, reduced_test
Exemplo n.º 37
0
def main():
    """Cluster syslog messages and index the result in Elasticsearch.

    Recreates the 'slclusters' index, reads JSON lines from
    '../docker/syslog.csv', keeps up to 100000 messages with more than
    three tokens, vectorises them, clusters them with KMeans (30
    clusters), factorises them with a 30-component NMF, and bulk-indexes
    one document per message with both cluster labels.
    """
    es_client = Elasticsearch(hosts = [{ "host" : "localhost", "port" : 9200 }])

    index_name = "slclusters"

    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index = index_name, ignore=[400, 404]))

    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index = index_name))

    import re
    rr = re.compile(r"[\w']+")
    tok = lambda a: rr.findall(a)

    # Parse one JSON record per line; the context manager closes the file,
    # which the previous bare open(...).readlines() never did.
    aa = []
    with open('../docker/syslog.csv') as syslog_file:
        for d in syslog_file:
            try:
                aa.append(json.loads(d))
            except ValueError:
                # Not valid JSON (json's decode error is a ValueError):
                # skip the line. Other exceptions now propagate instead
                # of being hidden by a bare `except:`.
                continue
    print(len(aa))

    docs = []
    other = []
    for row in aa:
        if len(tok(row['syslog_message'])) > 3:
            created_at = datetime.strptime(row["@timestamp"], "%Y-%m-%dT%H:%M:%S.000Z")
            text = row['syslog_message']
            docs.append(text)
            other.append(created_at)
            print(text)
            print(tok(text))
            print()
            if len(docs) >= 100000:
                break

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)

    # `np.float` was only an alias of the builtin float and has been
    # removed from NumPy; the builtin gives the same dtype.
    M = cv.fit_transform(docs).astype(float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=30, init='k-means++', max_iter=200, n_init=5,
                verbose=True)

    km.fit_transform(M2)
    clusters = km.labels_

    # Document indices ordered by their KMeans cluster label.
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=30)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    # Map each document's strongest NMF component to a sequential id,
    # numbered in order of first appearance.
    tDict = {}
    maxInd = 0
    esDocs = []
    for iii in sortInds:
        dd = {}
        dd['message'] = docs[iii]
        dd['cluster'] = int(clusters[iii])

        c2 = tuple(np.argsort(M3[iii, :])[-1:])
        if c2 in tDict:
            cc = tDict[c2]
        else:
            cc = maxInd
            tDict[c2] = maxInd
            maxInd = maxInd + 1

        dd['cluster2'] = cc
        dd['created_at'] = other[iii]
        esDocs.append(dd)

    helpers.bulk(es_client, esDocs, index=index_name, doc_type='syslogmsg', refresh=True)
Exemplo n.º 38
0
def main():
    """Cluster tweets with K-Means and NMF and bulk-index them into Elasticsearch.

    Steps, in order:

    1. Drop (if present) and recreate the ``twclusters`` index on the
       Elasticsearch node at ``localhost:9200``.
    2. Read rows from table ``T`` of ``../data/tweets.sqlite`` and keep the
       text of every tweet that tokenizes to more than two tokens, stopping
       once 100000 tweets have been kept.
    3. Build a normalized bag-of-words matrix, assign each tweet a K-Means
       label (20 clusters) and a second label derived from its two strongest
       NMF components (10 components).
    4. Bulk-index one document per kept tweet, ordered by K-Means label.
    """
    es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

    index_name = "twclusters"

    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index=index_name, ignore=[400, 404]))

    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index=index_name))

    from tokenizers import tokenize_nor, get_nor_stopwords
    tok = lambda a: tokenize_nor(a, get_nor_stopwords())

    docs = []
    other = []  # (created_at, author_id) for each entry of docs, same order
    conn = sqlite3.connect('../data/tweets.sqlite')
    try:
        cur = conn.execute('select * from T')
        try:
            for row in cur:
                # Columns read here: 0 = timestamp, 1 = author id, 4 = text.
                created_at = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S")
                author_id = row[1]
                text = row[4]
                if len(tok(text)) > 2:
                    docs.append(text)
                    other.append((created_at, author_id))
                if len(docs) >= 100000:
                    break
        finally:
            cur.close()
    finally:
        # Release the database handle even when reading fails.
        conn.close()

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)

    # The builtin float replaces np.float, which was a plain alias of it and
    # no longer exists in current NumPy releases.
    M = cv.fit_transform(docs).astype(float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=20, init='k-means++', max_iter=200, n_init=5,
                verbose=True)
    # Only the labels are needed, so the distance matrix is not computed.
    km.fit(M2)
    clusters = km.labels_

    # Document indices ordered by K-Means label (sorted() is stable, so the
    # original order is kept within a label).
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=10)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    # Maps an ordered pair of the two strongest NMF components to a dense id
    # (0, 1, 2, ...) assigned in order of first appearance.
    tDict = {}
    esDocs = []
    for iii in sortInds:
        # Unpack in the same order the tuple was stored above.
        created_at, author_id = other[iii]
        top_two = tuple(np.argsort(M3[iii, :])[-2:])
        esDocs.append({
            'tweet': docs[iii],
            'cluster': int(clusters[iii]),
            'cluster2': tDict.setdefault(top_two, len(tDict)),
            'created_at': created_at,
            'author_id': author_id,
        })

    helpers.bulk(es_client, esDocs, index=index_name, doc_type='tweet', refresh=True)
Exemplo n.º 39
0
# Candidate estimators, each built once when the module is executed.
et = ExtraTreesClassifier()
ab = AdaBoostClassifier()
clf2 = svm.LinearSVC(penalty='l1', loss='l2', C=100, dual=False)
clf = svm.SVC(kernel='rbf')
logreg = linear_model.LogisticRegression(C=100, penalty='l2')
knn = KNeighborsClassifier(n_neighbors=5)
sgdc = SGDClassifier()
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
prcp = Perceptron()
# The RBM hyper-parameters are assigned as attributes after construction.
rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.learning_rate = 0.02
rbm.n_iter = 20
rbm.n_components = 1000
NMF = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
# NOTE(review): the next two lines rebind the names PCA and LDA from the
# classes to instances, so the classes cannot be instantiated through these
# names afterwards.
PCA = PCA()
LDA = LDA()
#ICA = ICA()

# Two-step pipeline: RBM transform followed by logistic regression.
classifier = Pipeline(steps=[('rbm', rbm), ('logreg', logreg)])
# Opened when the module is executed; nothing in the visible code closes it.
file_handler_features = open('feature_vectors_heroes.csv', 'r')


def unique(training_data, test_data):
    for item in training_data:
        if item in test_data:
            print 'Item in test'


def hold_out(training_data, results):
Exemplo n.º 40
0
	model=ProjectedGradientNMF(n_components=latentFactorNum,init='nndsvd',tol=tol,max_iter=max_iter);
	print "nnmf start:",datetime.now();
	W,H=model.fit_transform(mat);
	print "nnmf end:",datetime.now();
	return W,H;
if __name__=="__main__":
	if len(argv)!=3:
		print "usage:",argv[0],"datafile_prefix threshold";
	else:
		t,users=load_index_map(argv[1]+".user");
		t,brands=load_index_map(argv[1]+".brand");
		clickMat=convert(argv[1]+".clk.lbm",len(users),len(brands));
		buyMat=convert(argv[1]+".buy.lbm",len(users),len(brands));
		testUCMat=convert("data/8.clk.lbm",len(users),len(brands)).todense();
		testUBMat=convert("data/8.clk.lbm",len(users),len(brands)).todense();
		model=ProjectedGradientNMF(n_components=50,init='nndsvd',tol=1e-8,max_iter=1000);
		print "nnmf start:",datetime.now();
		#W,H=model.fit_transform(clickMat);
		W,H=model.fit_transform(buyMat);
		print "nnmf end:",datetime.now();
		Y=np.dot(W,H);   # prediction

		#cuMat=np.transpose(clickMat).todense();
		#cbMat=cuMat.dot(buyMat.todense());
		#buyPredict=np.dot(Y,cbMat);
		buyPredict=Y;
		#print "error=",norm(clickMat-Y);
		fout=open("/tmp/score","w");
		for i in range(len(users)):
			content=users[i];
			for j in range(len(brands)):
Exemplo n.º 41
0
def nmf(mat,latentFactorNum=50,tol=1e-8,max_iter=1000):
	model=ProjectedGradientNMF(n_components=latentFactorNum,init='nndsvd',tol=tol,max_iter=max_iter);
	print "nnmf start:",datetime.now();
	W,H=model.fit_transform(mat);
	print "nnmf end:",datetime.now();
	return W,H;
Exemplo n.º 42
0
# Text-classification script: bag-of-words -> tf-idf -> NMF features -> linear SVM.
# answers_train/answers_test and cats_train/cats_test must already be defined
# before this point (the split below is commented out).

# Split into training and test
#answers_train, answers_test, cats_train, cats_test = train_test_split(answers, cats, test_size = 0.3)#, random_state=42)

# Word counts
# The vocabulary is learned on the training answers only and reused for the test answers.
count_vect = CountVectorizer(stop_words = 'english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf
# Fitted on the training counts, then applied unchanged to the test counts.
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
# Note: at this point answers_train holds the tf-idf matrix, not raw counts.
print("Fitting NMF on training word count matrix with shape" + str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components = 100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier
# The classifier is trained on the 100 NMF features produced above.
print("Fitting SVM classifier on matrix with shape" + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)

print("SVM train classification %: " + str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " + str(svc.score(answers_test, cats_test) * 100))
# Baseline: share of test items carrying the most common training label.
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " + str( float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
# Script fragment: Y, center, d1, d2, T, Ain, Cin, Cn2 and the helper functions
# are defined outside the visible code.
# Show the local-correlation image with the component centers overlaid.
Cn = local_correlations(Y)
plt1 = plt.imshow(Cn,interpolation='none')
plt.colorbar()

plt.scatter(x=center[:,1], y=center[:,0], c='m', s=40)
plt.axis((-0.5,d2-0.5,-0.5,d1-0.5))
plt.gca().invert_yaxis()
#%%
crd = plot_contours(coo_matrix(Ain[:,::-1]),Cn,thr=0.9)
#%%
# Rows of Ain with a non-zero sum are treated as active pixels.
active_pixels = np.squeeze(np.nonzero(np.sum(Ain,axis=1)))
# Flatten the two spatial axes (column-major) so that Yr is (d1*d2, T).
Yr = np.reshape(Y,(d1*d2,T),order='F')
# NOTE(review): p is set to 2 here but arpfit is called with the literal p=1
# on the next line — confirm which order is intended.
p = 2;
P = arpfit(Yr,p=1,pixels = active_pixels)
# Residual after removing the initial components; negative values are clipped
# to zero before the one-component NMF fit.
Y_res = Yr - np.dot(Ain,Cin)
model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
model.fit(np.maximum(Y_res,0))

# The single fitted component, squeezed to 1-D, is passed on as fin.
fin = model.components_.squeeze()
#%%
# Time the spatial update; note that it also rebinds Cin.
t1 = time()
A,b,Cin = update_spatial_components(Yr, Cin, fin, Ain, d1=d1, d2=d2, sn = P['sn'],dist=2,max_size=8,min_size=3)
t_elSPATIAL = time() - t1
#%%
crd = plot_contours(A,Cn2,thr=0.9,cmap=pl.cm.gray)
#%%
# Time the temporal update.
t1 = time()
C,f,Y_res,Pnew = update_temporal_components(Yr,A,b,Cin,fin,ITER=2,deconv_method = 'spgl1')
t_elTEMPORAL2 = time() - t1
#%%
t1 = time()
Exemplo n.º 44
0

# Demo script: factorize a small 6x4 matrix with ProjectedGradientNMF, first
# without and then with sparseness='components'.
import numpy as np
X = np.array([[1,1,2,3], [2, 1,4,5], [3, 2,4,5], [4, 1,2,1], [5, 4,3,1], [6, 1,4,3]])
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)

# fit() returns the estimator, so this prints its repr.
print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#        n_components=2, nls_max_iter=2000, random_state=0, sparseness=None,
#        tol=0.0001)
print model.components_
# NOTE(review): the sample output below has two columns while X has four, so
# it was not produced from this X.
#array([[ 0.77032744,  0.11118662],
#       [ 0.38526873,  0.38228063]])
print model.reconstruction_err_
#0.00746...

# Refit and keep both factors: W from fit_transform, H from components_.
W = model.fit_transform(X);
H = model.components_;

print 'w: ' + str(W)
print 'h: ' + str(H)

# Same factorization with the sparseness option set to 'components'.
model = ProjectedGradientNMF(n_components=2, sparseness='components', init='random', random_state=0)


print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#            n_components=2, nls_max_iter=2000, random_state=0,
#            sparseness='components', tol=0.0001)
Exemplo n.º 45
0
    # Fragment of a function body (its def line is not part of this excerpt).
    # Parses four positional command-line arguments, factorizes the data
    # matrix with NMF and writes sorted factor listings under nmf/.
    global indexes

    parser = argparse.ArgumentParser(description='Compute Non-negative Matrix Factorization')
    parser.add_argument('data_matrix', help='path to data file, should be readable by numpy')
    parser.add_argument('k', type=int, help='number of components to keep')
    parser.add_argument('feature_list', help='path to file containing list of feature names')
    parser.add_argument('index_file', help='path to array_index for this dataset')
    
    args = vars(parser.parse_args())
    data = np.loadtxt(args['data_matrix'])
    k = args['k']
    # One feature name per line; trailing whitespace is stripped.
    with open(args['feature_list']) as f:
        feature_list = map(str.rstrip, f.readlines())
    # Stored in the module-level name declared global above.
    indexes = np.loadtxt(args['index_file'])

    model = ProjectedGradientNMF(n_components=k, init='random', random_state=0)
    H = model.fit_transform(data) # H is submissions(row) by factors(cols)
    W = model.components_    # W is factors(row) by features(cols)
    # Per factor: (column sum of H) * (row sum of W).
    magnitude = np.prod([np.sum(H, axis = 0), np.sum(W, axis = 1)], axis = 0)

    # Only the first 20 entries along the second axis are saved per factor.
    savetxt_3d(np.array(sort_by_row(W))[:, 0:20, :], 'nmf/factors_and_sorted_features.np', "factor")
    show_feature_name('nmf/factors_and_sorted_features.np', feature_list)

    subs_and_sorted_factors = sort_by_row(H)
    # Add 1 to the first field of every entry before saving
    # (presumably turning 0-based factor indices into 1-based — TODO confirm
    # against sort_by_row).
    for sub in subs_and_sorted_factors:
        for factor in sub:
            factor[0] += 1
    savetxt_3d(subs_and_sorted_factors, 'nmf/subs_and_sorted_factors.np', "submission")

    print "\n-------------- pattern of dominating factors ----------------\n"
Exemplo n.º 46
0
# Demo script: fit ProjectedGradientNMF on a 6x2 matrix and print projections.
import numpy as np
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
from sklearn.decomposition import ProjectedGradientNMF
# Note: 10 components are requested although X has only 2 columns.
model = ProjectedGradientNMF(n_components=10, init='random', random_state=0)
model.fit(X)

print model.components_
# U is X multiplied by the transposed components; this is not the W that
# fit_transform would return.
U = X.dot(model.components_.T)
print U
print U.dot(model.components_)
# Bare expression: evaluated and discarded when run as a script.
model.reconstruction_err_

# Refit with 2 components and sparseness='components'.
model = ProjectedGradientNMF(
    n_components=2, sparseness='components', init='random', random_state=0)
model.fit(X)
# The following call builds a second estimator and discards it; it looks like
# pasted repr output rather than intended code.
ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
                     n_components=2, nls_max_iter=2000, random_state=0,
                     sparseness='components', tol=0.0001)
# Bare expressions again; they have no effect in a script.
model.components_
model.reconstruction_err_
Exemplo n.º 47
0
# Script fragment: hide part of `matrix`, fit NMF on the masked copy and write
# the reconstruction W.H to rHat.txt. `matrix` is defined outside this excerpt.
####THEIRS- not needed
# Example data matrix X

###MINE
X = DataFrame(matrix)
X_imputed = X.copy()
# X is rebuilt here through the `pa` alias; X_imputed stays the copy made above.
X = pa.DataFrame(matrix)# DataFrame(toy_vals, index = range(nrows), columns = range(ncols))
### TODO: find a way to mask only a few values, each either 0 or 1
# X.values is added and subtracted again, so up to rounding the mask is simply
# "standard-normal noise < 0.8"; entries where it is False are zeroed below.
msk = (X.values + np.random.randn(*X.shape) - X.values) < 0.8
# NOTE(review): this relies on .values exposing the frame's own buffer so the
# assignment writes through — confirm for the pandas version in use.
X_imputed.values[~msk] = 0


##THEIRS

# Hiding values to test imputation
# Initializing model
nmf_model = ProjectedGradientNMF(n_components = 600, init='nndsvda', random_state=0,max_iter=300, eta=0.01, alpha = 0.01)
nmf_model.fit(X_imputed.values)

# iterate model
#while nmf_model.reconstruction_err_**2 > 10:
    #nmf_model = NMF( n_components = 600, init='nndsvda', random_state=0,max_iter=300, eta=0.01, alpha = 0.01)
# The model is fitted a second time here; W comes from this fit.
W = nmf_model.fit_transform(X_imputed.values)
# Fill the hidden entries with the corresponding reconstructed values.
X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
print nmf_model.reconstruction_err_

H = nmf_model.components_
# Full reconstruction, saved as text.
rHat = np.dot(W,H)
np.savetxt("rHat.txt" ,rHat) 
Exemplo n.º 48
0
# Demo script: fit ProjectedGradientNMF on a 6x2 matrix and print projections.
import numpy as np
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
from sklearn.decomposition import ProjectedGradientNMF
# Note: 10 components are requested although X has only 2 columns.
model = ProjectedGradientNMF(n_components=10, init='random', random_state=0)
model.fit(X)

print model.components_
# U is X multiplied by the transposed components; this is not the W that
# fit_transform would return.
U = X.dot(model.components_.T)
print U
print U.dot(model.components_)
# Bare expression: evaluated and discarded when run as a script.
model.reconstruction_err_

# Refit with 2 components and sparseness='components'.
model = ProjectedGradientNMF(n_components=2,
                             sparseness='components',
                             init='random',
                             random_state=0)
model.fit(X)
# The following call builds a second estimator and discards it; it looks like
# pasted repr output rather than intended code.
ProjectedGradientNMF(beta=1,
                     eta=0.1,
                     init='random',
                     max_iter=200,
                     n_components=2,
                     nls_max_iter=2000,
                     random_state=0,
                     sparseness='components',
                     tol=0.0001)
# Bare expressions again; they have no effect in a script.
model.components_
model.reconstruction_err_
Exemplo n.º 49
0
def driver_movie_data_test_sklearn(train_filename,test_filename,k):

    (A,movie_ids,user_ids,m_count,u_count) = read_data(train_filename)

    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)

    model = ProjectedGradientNMF(n_components=k)

    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print A.shape
    print U1.shape
    print V1.shape
    # Read test data
    (A,movie_ids,user_ids,m_count,u_count) = read_data(test_filename,movie_ids,user_ids,m_count,u_count,discard=True)
    (error,del_U,del_V,random_pairs) =  evaluate_gradients(A,U1,V1,.07,16*A.nnz,hard=True)

    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)
    
    # Test on Ratings!
    outfile = open("test.sklearn.predictions","w")
    print ("Doing %d test ratings" % A.nnz)
    (n,m) = A.shape
    for row in xrange(n):
        for row_col_index in xrange(A.indptr[row],A.indptr[row+1]):
            col = A.indices[row_col_index]
            elt = A.data[row_col_index]
            print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],reverse_user[col], nd.dot(U1[row,:],V1[:,col])) 

    # Test on completely random pairs
    outfile = open("test.sklearn.rndpairs.predictions","w")
    for n_pairs in xrange(1000):
        row = r.randint(0,n-1)
        col = r.randint(0,m)
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],reverse_user[col], nd.dot(U1[row,:],V1[:,col])) 
    
    # Test on difficult distribution that ephasizes non-rated pairs where movies and users
    # are chosen based on rating count.
    outfile = open("test.sklearn.hard.rndpairs.predictions","w")
    for n_pairs in xrange(1000):
        i = r.randint(0,A.nnz -1)
        row = find_index(A.indptr,i)
        j = r.randint(0,A.nnz -1)
        col = A.indices[j]
        if (row > A.shape[0]-1):
            print row, A.shape, "what is going on"
            continue
        if (col > A.shape[1]-1):
            print col, A.shape, "what is going on"
            continue
        #print "shape,row,col", A.shape,row,col
        # if (A[row][col] > 0):
        #    continue
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],reverse_user[col], nd.dot(U1[row,:],V1[:,col])) 


    print ("test rsme", math.sqrt(error))
    for i in xrange(k):
        print ("Factor:", i)
        print_movie_factor(U1,reverse_movie, i)
    return(U1,V1,reverse_movie,reverse_user)
Exemplo n.º 50
0
def filter_1sigma_nmf_new(dma, iter_date, df, header_df):
    print 'Get the 1-sigma filtered data'
    print df.shape[1]
    idx_vt = df.shape[1] - 1
    mean_viewtime = df[idx_vt].mean()
    std_viewtime = df[idx_vt].std()

    print mean_viewtime / 3600.0, std_viewtime / 3600.0

    reduced_df = df[(df[idx_vt] >= LOW_LIMIT)
                    & (df[idx_vt] <= HIGH_LIMIT)].reset_index()
    print reduced_df.shape

    reduced_df[range(1, idx_vt)] = reduced_df[range(1, idx_vt)].div(
        1.0 * reduced_df[idx_vt], 'index')
    dev_id_list = reduced_df[0]

    reduced_df_vsum = reduced_df[range(1, idx_vt)].sum()
    reduced_df_vsum = reduced_df_vsum[reduced_df_vsum > 0.00]
    idx_list = reduced_df_vsum.index.tolist()
    reduced_df_1 = reduced_df[range(1, idx_vt)][reduced_df_vsum.index.tolist()]

    # Select the header accordingly
    reduced_header_df = header_df[idx_list]

    #program_viewtime_array = np.array(reduced_df[range(1,idx_vt)].astype(np.float))
    program_viewtime_array = np.array(reduced_df_1.astype(np.float))
    program_name_array = np.array(reduced_header_df)

    t_program_viewtime_array = program_viewtime_array.transpose()

    cluster_num = 14
    # Non-negative Matrix Factorization
    model = ProjectedGradientNMF(n_components=cluster_num,
                                 sparseness='data',
                                 init='nndsvd',
                                 max_iter=400,
                                 random_state=0)
    WW = model.fit_transform(t_program_viewtime_array)
    t_WW = WW.transpose()
    HH = model.components_
    t_HH = HH.transpose()
    #print t_HH.shape
    #print pd.DataFrame(t_HH).head()
    membership = [-1 for item in range(0, t_HH.shape[0])]
    # Assign the membership
    for i in range(0, t_HH.shape[0]):
        membership[i] = np.argmax(t_HH[i])

    dd = reduced_header_df
    print dd.shape
    print program_name_array.shape
    print program_viewtime_array.shape

    file = open(
        'decompose_results_clusters_%s_%s_%s.csv' %
        (iter_date.month, iter_date.day, dma), 'w')
    file.write(
        'Cluster_id,Dev_num,Household_num,Feature_val,Feature_fraction,Program_name\n'
    )
    file.write(
        '-1,%s,%s,,,\n' %
        (len(dev_id_list), get_household_num(dma, dev_id_list.tolist())))
    cluster_num = t_WW.shape[0]

    for i in range(0, cluster_num):
        dev_indices = [index for index, v in enumerate(membership) if v == i]
        dev_in_cluster = dev_id_list[dev_indices]
        dev_num = len(dev_in_cluster)
        household_num = get_household_num(dma, dev_in_cluster.tolist())

        #print heapq.nlargest(10,t_WW[i])
        feature_val = np.sort(t_WW[i])
        feature_val = feature_val[::-1]
        #print 't_WW:',t_WW[i]
        #print 'sorted t_WW:',feature_val
        val_sum = np.sum(feature_val)
        feature_frac = feature_val * 1.0 / val_sum
        accumulated_frac = 0
        cut_ind = 0
        for frac in feature_frac:
            accumulated_frac += frac
            cut_ind += 1
            if accumulated_frac > 0.6:
                break
        idx_list = np.argsort(t_WW[i])[::-1][:cut_ind]
        program_list = program_name_array[0][idx_list]
        for j in range(0, cut_ind):
            file.write('%s,%s,%s,%s,%s,%s\n' %
                       (i, dev_num, household_num, feature_val[j],
                        feature_frac[j], program_list[j]))
        #file.write(' '.join(program_name_array[0][idx_list]))
        #file.write('\n')
    file.close()
    #income_analysis(dma, dev_id_list, cluster_num, membership)
    #child_present_analysis(dma, dev_id_list, cluster_num, membership)
    #age_analysis(dma, dev_id_list, cluster_num, membership)
    clusters_obj = all_clusters(dma, cluster_num, dev_id_list, membership)
    return clusters_obj
Exemplo n.º 51
0
    # Body of a loop over `row` (the loop header is not part of this excerpt):
    # copy one training record into the question x expert label matrix.
    row_info = train_data.iloc[row]
    curr_q, curr_u, label = row_info[0], row_info[1], row_info[2]

    #print curr_q, curr_u
    # list.index is a linear search on every record.
    question_index = question_list.index(curr_q)
    user_index = expert_list.index(curr_u)

    matrix[question_index][user_index] = label
#print matrix

# In[57]:
print 'running model...'

# Factorize the label matrix; W.H is kept as the reconstructed matrix rHat.
model = ProjectedGradientNMF(n_components=50,
                             init='nndsvda',
                             random_state=0,
                             max_iter=300,
                             eta=0.01,
                             alpha=0.01)
W = model.fit_transform(matrix)
H = model.components_
rHat = np.dot(W, H)
print 'recon error: ', model.reconstruction_err_

#np.savetxt("rHat.txt",rHat)

#pickle.dump(question_list, 'qList.txt')
# np.savetxt("qList.txt",question_list)
#np.savetxt( user_list,"uList.txt")

# In[61]:
# In[61]:
Exemplo n.º 52
0
# Script fragment: `out`, `pca`, `app` and `catBarr` are defined outside this excerpt.
# Split the 2-D projection into separate coordinate lists.
LC = out.tolist()
X = []
Y = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])

cpmC = pca.components_

# Print the entries whose squared loading on the second component exceeds
# 0.04 (i.e. |loading| > 0.2).
for i in range(len(cpmC[1])):
    if cpmC[1][i] * cpmC[1][i] > 0.04:
        print app[i]
        print i

from sklearn.decomposition import ProjectedGradientNMF
# Note: the name `pca` is reused here for a 2-component NMF model.
pca = ProjectedGradientNMF(n_components=2)

out = pca.fit_transform(catBarr)

# Same coordinate split as above, now for the NMF output.
LC = out.tolist()
X = []
Y = []
Z = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])

cpmC = pca.components_
lis1 = cpmC[0].tolist()

for i in range(len(lis1)):
Exemplo n.º 53
0
def driver_movie_data_test_sklearn(train_filename, test_filename, k):

    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)

    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)

    model = ProjectedGradientNMF(n_components=k)

    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print A.shape
    print U1.shape
    print V1.shape
    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(test_filename,
                                                           movie_ids,
                                                           user_ids,
                                                           m_count,
                                                           u_count,
                                                           discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(A,
                                                             U1,
                                                             V1,
                                                             .07,
                                                             16 * A.nnz,
                                                             hard=True)

    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    # Test on Ratings!
    outfile = open("test.sklearn.predictions", "w")
    print("Doing %d test ratings" % A.nnz)
    (n, m) = A.shape
    for row in xrange(n):
        for row_col_index in xrange(A.indptr[row], A.indptr[row + 1]):
            col = A.indices[row_col_index]
            elt = A.data[row_col_index]
            print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],
                                               reverse_user[col],
                                               nd.dot(U1[row, :], V1[:, col]))

    # Test on completely random pairs
    outfile = open("test.sklearn.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        row = r.randint(0, n - 1)
        col = r.randint(0, m)
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],
                                           reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))

    # Test on difficult distribution that ephasizes non-rated pairs where movies and users
    # are chosen based on rating count.
    outfile = open("test.sklearn.hard.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        i = r.randint(0, A.nnz - 1)
        row = find_index(A.indptr, i)
        j = r.randint(0, A.nnz - 1)
        col = A.indices[j]
        if (row > A.shape[0] - 1):
            print row, A.shape, "what is going on"
            continue
        if (col > A.shape[1] - 1):
            print col, A.shape, "what is going on"
            continue
        #print "shape,row,col", A.shape,row,col
        # if (A[row][col] > 0):
        #    continue
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row],
                                           reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))

    print("test rsme", math.sqrt(error))
    for i in xrange(k):
        print("Factor:", i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)
Exemplo n.º 54
0
Arquivo: nmf.py Projeto: kuntzer/sclas
	def __init__(self, params):
		"""Keep the keyword settings and build the wrapped NMF estimator from them."""
		self.params = params
		# The stored mapping is expanded into constructor keyword arguments.
		self.dec = ProjectedGradientNMF(**self.params)
Exemplo n.º 55
0
genreMat4 = np.vstack(genreMat4)
print genreMat4

index = filmsbygenre['Action']
E = y[index, :]

### K-Means ###################
ans = raw_input("Start K-Means with Scikit ? ")
if ans != "y":
    exit()

from sklearn.cluster import MiniBatchKMeans, KMeans
km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=1)
km2 = KMeans(n_clusters = k, init='k-means++', verbose=1)
y2 = km2.fit_transform(X)

topics5 = [[(km.cluster_centers_[l][i], feature_names[i]) for i in np.argsort(-np.abs(km.cluster_centers_[l]))[:10]] for l in range(k)]
print topics5


### NMF #######################
ans = raw_input("Start NMF with Scikit ? ")
if ans != "y":
    exit()

from sklearn.decomposition import ProjectedGradientNMF
# BEWARE : THIS IS COMPUTATIONNALY INTENSIVE
nmf = ProjectedGradientNMF(n_components=k, max_iter = 10, nls_max_iter=100)
nmf.fit(X)

topics6 = [[(nmf.components_[l][i], feature_names[i]) for i in np.argsort(-np.abs(nmf.components_[l]))[:10]] for l in range(k)]
Exemplo n.º 56
0
# Demo script: factorize a 6x2 matrix into w (6x2) and h (2x2) and print the
# factors and their product.
__author__ = 'juliewe'


import numpy as np
#import sklearn.decomposition.NMF as NMF
#implements C.J.Lin's projected gradient methods for NMF

X = np.array([[1,1],[2,1],[3,1.2],[4,1],[5,0.8],[6,1]])  #n*d
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2,init='random',random_state=0)
w= model.fit_transform(X)  #left factor w (n*k)
h= model.components_ #right factor h (k*d)

print w
print h
# v is the reconstruction of X from the two factors.
v = np.dot(w,h)
print v



Exemplo n.º 57
0
# Script fragment: Y, nr, d1, d2, T and the helper functions are defined
# outside this excerpt. Each stage is timed with time().
t1 = time()
Ain,Cin,center = greedyROI2d(Y, nr = nr, gSig = [4,4], gSiz = [9,9])
t_elGREEDY = time()-t1

#%% arpfit


# Rows of Ain with a non-zero sum are treated as active pixels.
active_pixels = np.squeeze(np.nonzero(np.sum(Ain,axis=1)))
# Flatten the two spatial axes (column-major) so that Yr is (d1*d2, T).
Yr = np.reshape(Y,(d1*d2,T),order='F')
# p is assigned here, but the call below passes the literal 2 rather than p.
p = 2;
P = arpfit(Yr,p=2,pixels = active_pixels)

#%% nmf

# Residual after removing the initial components; negative values are clipped
# to zero before the one-component NMF fit.
Y_res = Yr - np.dot(Ain,Cin)
model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
model.fit(np.maximum(Y_res,0)) 

# The single fitted component, squeezed to 1-D, is passed on as fin.
fin = model.components_.squeeze()

#%% update spatial components

t1 = time()
A,b = update_spatial_components(Yr, Cin, fin, Ain, d1=d1, d2=d2, sn = P['sn'])
t_elSPATIAL = time() - t1

#%% 
t1 = time()
C,f,Y_res,Pnew = update_temporal_components(Yr,A,b,Cin,fin,ITER=2)
t_elTEMPORAL1 = time() - t1
Exemplo n.º 58
0
# Demo script: factorize a 6x2 matrix into w (6x2) and h (2x2) and print the
# factors and their product.
__author__ = 'juliewe'

import numpy as np
#import sklearn.decomposition.NMF as NMF
#implements C.J.Lin's projected gradient methods for NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])  #n*d
from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
w = model.fit_transform(X)  #left factor w (n*k)
h = model.components_  #right factor h (k*d)

print w
print h
# v is the reconstruction of X from the two factors.
v = np.dot(w, h)
print v