Exemplo n.º 1
0
Arquivo: nmf.py Projeto: kuntzer/sclas
class NMF(method.Method):
	"""Adapter exposing a ProjectedGradientNMF estimator via train/encode/decode."""

	def __init__(self, params):
		# ``params`` is unpacked as the estimator's keyword arguments and is
		# also kept on the instance.
		self.dec = ProjectedGradientNMF(**params)
		self.params = params

	def __str__(self):
		return "Non-Negative matrix factorization by Projected Gradient (NMF)"

	def train(self, data):
		"""
		Fit the wrapped estimator on ``data``.

		:param data: whitened data, ready to use
		"""
		estimator = self.dec
		estimator.fit(data)

	def encode(self, data):
		"""
		Project ready-to-use data with the wrapped estimator's ``transform``.

		:returns: encoded data with dimension n_components
		"""
		encoded = self.dec.transform(data)
		return encoded

	def decode(self, components):
		"""
		Map encoded components back through ``inverse_transform``.

		:returns: reconstructed (whitened) data
		"""
		reconstructed = self.dec.inverse_transform(components)
		return reconstructed
Exemplo n.º 2
0
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches.

    Constructor arguments:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
    """

    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        # Caller-supplied configuration.
        self.patch_size, self.max_samples = patch_size, max_samples
        # Nothing has been extracted or learned yet.
        self.D = self.data = self.components = None
        # Both preprocessing flags start switched off.
        self.zscore = self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Learn a dictionary of 2D patch atoms from a spectrogram.

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
        outputs:
            self.data - 2D patches of input spectrogram
            self.D.components_ - dictionary of 2D NMF components
        """
        # The z-score argument is always passed as False here.
        self._extract_data_patches(X, False, log_amplitude)
        self.n_components = n_components

        # Fill in solver options the caller did not specify.
        for option, fallback in (('sparseness', 'components'),
                                 ('init', 'nndsvd'),
                                 ('beta', 0.5)):
            nmf_args.setdefault(option, fallback)

        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
        self.model.fit(self.data)
        # ``D`` is the same object as ``model``.
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct by fitting the current NMF 2D dictionary to self.data.

        When ``w`` is omitted, the activations are computed from ``self.data``
        and cached on ``self.w`` before delegating to the parent class.
        """
        if w is None:
            w = self.w = self.model.transform(self.data)
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches.

    initialization:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
    """

    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        self.patch_size = patch_size
        self.max_samples = max_samples
        # Learned model, extracted patches and components are filled in later.
        self.D = None
        self.data = None
        self.components = None
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data.

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
        outputs:
            self.data - 2D patches of input spectrogram
            self.D.components_ - dictionary of 2D NMF components
        """
        # z-scoring is always disabled for this extraction.
        zscore = False
        self._extract_data_patches(X, zscore, log_amplitude)
        self.n_components = n_components
        # Defaults apply only to options the caller left unspecified.
        nmf_args.setdefault('sparseness', 'components')
        nmf_args.setdefault('init', 'nndsvd')
        nmf_args.setdefault('beta', 0.5)
        # Call form prints the same text on Python 2 and Python 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
        self.model.fit(self.data)
        # ``D`` is an alias of the fitted model.
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct by fitting current NMF 2D dictionary to self.data.

        If ``w`` is None, it is computed with ``self.model.transform(self.data)``
        and stored on ``self.w`` before delegating to the parent implementation.
        """
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
Exemplo n.º 4
0
class NMFSpectrum(SparseApproxSpectrum):
    """SparseApproxSpectrum variant whose dictionary is learned with ProjectedGradientNMF."""

    def __init__(self, **kwargs):
        # All construction options are forwarded unchanged to the parent.
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        """Extract patches from ``X``, fit the NMF model on them and return ``self``.

        ``kwargs`` are forwarded to ProjectedGradientNMF; ``sparseness``,
        ``init`` and ``beta`` receive defaults when not supplied.
        """
        self.standardize = False
        self._extract_data_patches(X)

        for option, fallback in (('sparseness', 'components'),
                                 ('init', 'nndsvd'),
                                 ('beta', 0.5)):
            kwargs.setdefault(option, fallback)

        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)
        # ``D`` refers to the same fitted object as ``model``.
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Delegate reconstruction to the parent, computing ``w`` from ``self.data`` if omitted."""
        if w is None:
            w = self.w = self.model.transform(self.data)
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
Exemplo n.º 5
0
class NMFSpectrum(SparseApproxSpectrum):
    """SparseApproxSpectrum subclass that learns its dictionary with ProjectedGradientNMF."""

    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        """Extract patches from ``X`` and fit an NMF model on ``self.data``.

        :param X: input handed to ``self._extract_data_patches``
        :param kwargs: keyword arguments for ProjectedGradientNMF; defaults are
            supplied for ``sparseness``, ``init`` and ``beta`` when missing
        :returns: ``self``
        """
        self.standardize = False
        self._extract_data_patches(X)
        kwargs.setdefault('sparseness', 'components')
        kwargs.setdefault('init', 'nndsvd')
        kwargs.setdefault('beta', 0.5)
        # Call form prints the same text on Python 2 and Python 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)
        # ``D`` is an alias of the fitted model.
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct via the parent class.

        If ``w`` is None, it is computed with ``self.model.transform(self.data)``
        and stored on ``self.w`` first.
        """
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
Exemplo n.º 6
0
 def _nmf_fixed_component(self, i, X):
     """
     Uses sklearn to make the non negative factorization and summarise
     each cluster by its top word and document count.

     input: i, number of clusters for this NMF instance
            X, matrix to factorize (documents x words, per the W/H notes below)
     returns: dict mapping each cluster's highest-weighted word to the number
              of rows of W whose largest entry falls in that cluster; clusters
              sharing the same top word collapse into one key (last one wins)
     author: Arthur Desjardins
     """
     model = ProjectedGradientNMF(n_components=i, init='nndsvd')
     model.fit(X)
     #  H-matrix (clusters x words)
     H = model.components_
     # W-matrix (documents x clusters)
     W = model.transform(X)
     # word list: whitespace-separated tokens of the attribute file; the
     # file is closed as soon as it has been read
     with open(attributFile) as attribute_handle:
         words = attribute_handle.read().split()
     # index of the highest-weighted word for every cluster
     most_relevant_words = np.argmax(H, axis=1)
     # count, per cluster, the documents whose strongest component it is
     docs_per_cluster = [0] * i
     for tweet in W:
         docs_per_cluster[np.argmax(tweet)] += 1
     # a distinct loop name avoids shadowing the ``i`` parameter
     return {words[most_relevant_words[cluster]]: docs_per_cluster[cluster]
             for cluster in range(i)}
Exemplo n.º 7
0
    def _nonNegativeFactorization(self):
        """
        Uses sklearn to make the non negative factorization.

        Loads the matrix from ``dataFile``, fits a ProjectedGradientNMF
        (``init='nndsvd'``), prints the model's ``reconstruction_err_`` and
        writes the two factors to ``factoredHMatrix`` (``components_``) and
        ``factoredWMatrix`` (``transform(X)``), both with the integer
        format ``'%i'``.
        """
        print('Loading data..')
        X = np.asmatrix(np.loadtxt(dataFile))
        print('Data loaded. Making model..')
        model = ProjectedGradientNMF(init='nndsvd')
        print('Fitting model..')
        model.fit(X)
        print('Model fit')

        # Single pre-formatted argument prints the same on Python 2 and 3.
        print('Error rate is %s' % (model.reconstruction_err_,))

        # The original referenced ``close`` without calling it, so the files
        # were never explicitly closed; ``with`` guarantees flush and close.

        #  H-matrix
        with open(factoredHMatrix, 'w') as h_matrix_file:
            np.savetxt(h_matrix_file, model.components_, fmt='%i')

        # W-matrix
        with open(factoredWMatrix, 'w') as w_matrix_file:
            np.savetxt(w_matrix_file, model.transform(X), fmt='%i')
def select_features_nmf(train_X, train_y, test_X, k):
    """Project train and test features onto ``k`` NMF components.

    The factorization is fit on ``train_X`` only (``init='nndsvd'``,
    ``random_state=42``) and then applied to both sets. ``train_y`` is
    accepted but not used.

    :returns: ``(transformed_train_X, transformed_test_X)``
    """
    nmf = ProjectedGradientNMF(n_components=k, init='nndsvd', random_state=42)
    nmf.fit(train_X)
    # Tuple elements evaluate left to right: train first, then test.
    return nmf.transform(train_X), nmf.transform(test_X)
Exemplo n.º 9
0
def driver_movie_data_test_sklearn(train_filename,test_filename,k):
    """Factorize training ratings with sklearn NMF and write test predictions.

    Steps performed:
      1. ``read_data(train_filename)`` supplies the training matrix and id
         maps; a ``ProjectedGradientNMF`` with ``k`` components is fit on it,
         giving ``U1 = transform(A)`` and ``V1 = components_``.
      2. The test file is read with the training id maps (``discard=True``)
         and ``evaluate_gradients`` supplies the error printed at the end.
      3. Three files are written in the current directory, one
         ``movie,user,prediction`` line per pair:
         ``test.sklearn.predictions`` (stored entries of the test matrix),
         ``test.sklearn.rndpairs.predictions`` (1000 random pairs) and
         ``test.sklearn.hard.rndpairs.predictions`` (up to 1000 pairs drawn
         through the stored entries).
      4. ``print_movie_factor`` is called for each of the ``k`` factors.

    The matrix returned by ``read_data`` is accessed through ``indptr``,
    ``indices``, ``data`` and ``nnz`` -- presumably a scipy CSR matrix with
    movies as rows and users as columns; confirm against ``read_data``.

    :param train_filename: passed to ``read_data`` for the training set
    :param test_filename: passed to ``read_data`` for the test set
    :param k: number of NMF components
    :returns: ``(U1, V1, reverse_movie, reverse_user)``
    """
    (A,movie_ids,user_ids,m_count,u_count) = read_data(train_filename)

    # Do nnmf (an earlier variant used hack_nmf_iter(A,k,.07,16*A.nnz) here)
    model = ProjectedGradientNMF(n_components=k)

    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print(A.shape)
    print(U1.shape)
    print(V1.shape)
    # Read test data
    (A,movie_ids,user_ids,m_count,u_count) = read_data(test_filename,movie_ids,user_ids,m_count,u_count,discard=True)
    (error,del_U,del_V,random_pairs) =  evaluate_gradients(A,U1,V1,.07,16*A.nnz,hard=True)

    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    def prediction_line(row, col):
        # One output record: movie id, user id, predicted rating, newline.
        return "%s,%s,%0.5f\n" % (reverse_movie[row],reverse_user[col], nd.dot(U1[row,:],V1[:,col]))

    # Test on Ratings!
    # ``with`` closes each output file; the original never closed them.
    with open("test.sklearn.predictions","w") as outfile:
        print ("Doing %d test ratings" % A.nnz)
        (n,m) = A.shape
        for row in range(n):
            for row_col_index in range(A.indptr[row],A.indptr[row+1]):
                col = A.indices[row_col_index]
                outfile.write(prediction_line(row, col))

    # Test on completely random pairs
    with open("test.sklearn.rndpairs.predictions","w") as outfile:
        for n_pairs in range(1000):
            row = r.randint(0,n-1)
            # NOTE(review): if ``r`` is the stdlib ``random`` module, randint
            # is inclusive and ``col`` can equal ``m`` (out of range); the row
            # bound above uses n-1. Left unchanged -- confirm what ``r`` is.
            col = r.randint(0,m)
            outfile.write(prediction_line(row, col))

    # Test on difficult distribution that emphasizes non-rated pairs where movies and users
    # are chosen based on rating count.
    with open("test.sklearn.hard.rndpairs.predictions","w") as outfile:
        for n_pairs in range(1000):
            i = r.randint(0,A.nnz -1)
            row = find_index(A.indptr,i)
            j = r.randint(0,A.nnz -1)
            col = A.indices[j]
            # Skip (and report) indices that fall outside the test matrix.
            if (row > A.shape[0]-1):
                print("%s %s what is going on" % (row, A.shape))
                continue
            if (col > A.shape[1]-1):
                print("%s %s what is going on" % (col, A.shape))
                continue
            outfile.write(prediction_line(row, col))

    print ("test rsme", math.sqrt(error))
    for i in range(k):
        print ("Factor:", i)
        print_movie_factor(U1,reverse_movie, i)
    return(U1,V1,reverse_movie,reverse_user)
Exemplo n.º 10
0
# Pipeline: bag-of-words -> tf-idf -> NMF features -> linear SVM.
# ``answers_train`` / ``answers_test`` / ``cats_train`` / ``cats_test`` are
# defined before this fragment; the answers_* names are rebound at every stage.

# Word counts: vocabulary is learned on the training set only, then applied
# to the test set.
count_vect = CountVectorizer(stop_words = 'english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf: weights are fit on the training counts and reused for the test set.
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
# (the matrix is tf-idf weighted at this point, although the message below
# calls it a word count matrix)
print("Fitting NMF on training word count matrix with shape" + str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components = 100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier on the NMF features
print("Fitting SVM classifier on matrix with shape" + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)

# Scores are scaled by 100 to print percentages.
print("SVM train classification %: " + str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " + str(svc.score(answers_test, cats_test) * 100))
# Baseline: share of test labels equal to the most common training label.
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " + str( float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
cats_pred = svc.predict(answers_test)
# NOTE(review): the disabled line below refers to labels_test, csvm and
# data_test, none of which are defined in this fragment.
#c = metrics.confusion_matrix(labels_test, csvm.predict(data_test))
Exemplo n.º 11
0
def driver_movie_data_test_sklearn(train_filename, test_filename, k):
    """Fit sklearn NMF on the training ratings and dump predictions for test data.

    The training matrix from ``read_data(train_filename)`` is factorized with
    ``ProjectedGradientNMF(n_components=k)`` into ``U1`` (``transform``) and
    ``V1`` (``components_``). The test file is then read with the training id
    maps, ``evaluate_gradients`` provides the error that is printed at the
    end, and predictions ``nd.dot(U1[row, :], V1[:, col])`` are written as
    ``movie,user,prediction`` lines to three files in the current directory:

    * ``test.sklearn.predictions`` -- every stored entry of the test matrix
    * ``test.sklearn.rndpairs.predictions`` -- 1000 random (row, col) pairs
    * ``test.sklearn.hard.rndpairs.predictions`` -- up to 1000 pairs sampled
      through the stored entries

    The matrix is accessed through ``indptr`` / ``indices`` / ``data`` /
    ``nnz`` -- presumably scipy CSR with movies as rows and users as columns;
    verify against ``read_data``.

    :param train_filename: passed to ``read_data`` for the training set
    :param test_filename: passed to ``read_data`` for the test set
    :param k: number of NMF components; also the number of factors printed
    :returns: ``(U1, V1, reverse_movie, reverse_user)``
    """
    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)

    # Do nnmf (an earlier variant used hack_nmf_iter(A,k,.07,16*A.nnz) here)
    model = ProjectedGradientNMF(n_components=k)

    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print(A.shape)
    print(U1.shape)
    print(V1.shape)
    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(test_filename,
                                                           movie_ids,
                                                           user_ids,
                                                           m_count,
                                                           u_count,
                                                           discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(A,
                                                             U1,
                                                             V1,
                                                             .07,
                                                             16 * A.nnz,
                                                             hard=True)

    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    def write_prediction(handle, row, col):
        # Emit one "movie,user,prediction" record followed by a newline.
        handle.write("%s,%s,%0.5f\n" % (reverse_movie[row],
                                        reverse_user[col],
                                        nd.dot(U1[row, :], V1[:, col])))

    # Test on Ratings!
    # Each output file is managed by ``with`` so it is flushed and closed;
    # the original left all three handles open.
    with open("test.sklearn.predictions", "w") as outfile:
        print("Doing %d test ratings" % A.nnz)
        (n, m) = A.shape
        for row in range(n):
            for row_col_index in range(A.indptr[row], A.indptr[row + 1]):
                write_prediction(outfile, row, A.indices[row_col_index])

    # Test on completely random pairs
    with open("test.sklearn.rndpairs.predictions", "w") as outfile:
        for n_pairs in range(1000):
            row = r.randint(0, n - 1)
            # NOTE(review): with stdlib ``random.randint`` the upper bound is
            # inclusive, so ``col`` may equal ``m`` and index out of range;
            # the row bound uses n - 1. Unchanged -- confirm what ``r`` is.
            col = r.randint(0, m)
            write_prediction(outfile, row, col)

    # Test on difficult distribution that emphasizes non-rated pairs where movies and users
    # are chosen based on rating count.
    with open("test.sklearn.hard.rndpairs.predictions", "w") as outfile:
        for n_pairs in range(1000):
            i = r.randint(0, A.nnz - 1)
            row = find_index(A.indptr, i)
            j = r.randint(0, A.nnz - 1)
            col = A.indices[j]
            # Report and skip indices outside the test matrix.
            if (row > A.shape[0] - 1):
                print("%s %s what is going on" % (row, A.shape))
                continue
            if (col > A.shape[1] - 1):
                print("%s %s what is going on" % (col, A.shape))
                continue
            write_prediction(outfile, row, col)

    print("test rsme", math.sqrt(error))
    for i in range(k):
        print("Factor:", i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)