Example #1
def tfidf_nmf(release_texts, n_components=10, max_features=None):
    '''
        Creates and fits tfidf and NMF models.

        INPUT:
        - release_texts: iterable of document strings to vectorize
        - n_components: number of latent features for the NMF model to find
        - max_features: max number of features (vocabulary size) for the tfidf model to consider

        OUTPUT:
        - tfidf_vectorizer: tfidf model object
        - tfidf_sparse: tfidf sparse matrix
        - nmf: NMF model object
        - W: Feature matrix output from NMF factorization into W and H matrices
    '''
    # tfidf model
    custom_stop_words = make_stop_words()
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words, max_features=max_features)
    tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts)

    # normalize row-wise so each row sums to one
    tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1')

    # nmf model
    nmf = NMF(n_components=n_components, random_state=1)
    nmf.fit(tfidf_sparse)
    W = nmf.transform(tfidf_sparse)
    return tfidf_vectorizer, tfidf_sparse, nmf, W
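A minimal usage sketch for tfidf_nmf (not from the source project): the documents and the make_stop_words stub below are illustrative assumptions, and the sklearn names used inside the function (TfidfVectorizer, NMF, normalize) are assumed to be imported.

def make_stop_words():
    return 'english'  # illustrative stand-in for the project's custom stop-word list

docs = [
    "cloud revenue growth beat expectations this quarter",
    "phone launch delayed by supply chain issues",
    "strong cloud revenue despite the delayed phone launch",
]
vectorizer, matrix, nmf_model, W = tfidf_nmf(docs, n_components=2)
print(W.shape)  # (3, 2): one row of latent-topic weights per document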
Example #2
def produceEncoding(trainX, nComponents):
    '''Produces an NMF encoding from the training
    data matrix'''
    model = NMF(n_components=nComponents, solver='cd',
                tol=1e-4, max_iter=200, alpha=0.0)
    model.fit(trainX)
    return model
Example #3
	def __Factorize_NMF(self, K):
		model = NMF(n_components=K, max_iter=self._iteration)
		# fit_transform fits the model and returns the user factor matrix in one pass
		user_fmat = model.fit_transform(self._mat)
		item_fmat = model.components_.T

		return user_fmat, item_fmat
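The two factor matrices returned above multiply back into a low-rank estimate of the ratings matrix. A hedged, self-contained sketch (the toy matrix and K=2 are assumptions):

import numpy as np
from sklearn.decomposition import NMF

mat = np.array([[5, 3, 0, 1],
                [4, 0, 0, 1],
                [1, 1, 0, 5],
                [0, 1, 5, 4]], dtype=float)
model = NMF(n_components=2, max_iter=500, random_state=0)
user_fmat = model.fit_transform(mat)   # (n_users, K)
item_fmat = model.components_.T        # (n_items, K)
approx = user_fmat @ item_fmat.T       # low-rank reconstruction of mat
print(np.round(approx, 1))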
Example #4
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')

    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
Example #5
def get_topics_nmf(urls, num_topics):
    '''Input: URL containing links to each document (pdf) in the
    corpus (i.e. arxiv)  Output: the num_topics most important latent
    topics from the corpus (via NMF)
    '''
    article_info = []
    for url in urls:
        article_info.append(get_text(url))

    text = []
    for thing in article_info:
        text.extend(thing[0])
    text = clean_pdf_text(text)

    tfidf_math = TfidfVectorizer(max_features=100, stop_words=math_stop(),
                                 ngram_range=(1, 1), decode_error='ignore')
    M = tfidf_math.fit_transform(text)

    feature_names = tfidf_math.get_feature_names()
    feature_names = [WordNetLemmatizer().lemmatize(word)
                     for word in feature_names]
    nmf = NMF(n_components=num_topics)
    nmf.fit(M)
    topics = []
    for topic_idx, topic in enumerate(nmf.components_):
        topics.append(" ".join([feature_names[i] for i in
                                topic.argsort()[:-10 - 1:-1]]))
    return M, topics, text, urls
Example #6
def fit_nmf(tfidf):
    '''Takes in a tfidf sparse matrix and finds the top topics.
    Relies on module-level n_topics, tfidf_vectorizer, n_top_words, and print_top_words.'''
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    nmf.fit(tfidf)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    nmf_topic_dict = print_top_words(nmf, tfidf_feature_names, n_top_words)
    return nmf, nmf_topic_dict
Example #7
class NMFReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_iris(self):
        sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        # apply the fitted NMF reducer (not the scaler) to both splits
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
Example #8
def NMF_train(X_train, X_test, n):
    nmf_model = NMF(n_components=n)
    # fit_transform fits on the training data and returns W in one pass
    W = nmf_model.fit_transform(X_train)
    H = nmf_model.components_
    W_test = nmf_model.transform(X_test)
    
    return H, W, W_test
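A quick shape check for NMF_train on random nonnegative data (the toy matrix, split, and n=4 are assumptions; NMF is assumed imported as in the function):

import numpy as np

X = np.abs(np.random.RandomState(0).rand(30, 12))
H, W, W_test = NMF_train(X[:24], X[24:], n=4)
print(H.shape, W.shape, W_test.shape)  # (4, 12) (24, 4) (6, 4)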
Example #9
def nmf_error(eta):
    '''
    Decompose the ETAs using nonnegative matrix factorization with 1 component.
    The reconstruction error is an estimate of the non-inspiratory activity.
    '''
    nmf = NMF(n_components=1)
    nmf.fit(eta)
    reconstruction_err = nmf.reconstruction_err_
    return float(reconstruction_err)
Example #10
File: rolx.py  Project: Bubblbu/Circulo
def get_factorization(V, num_roles):
    """ Obtains a nonnegative matrix factorization of the matrix V with num_roles intermediate roles. """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)
    
    node_roles = model.transform(V)
    role_features = model.components_
    
    return np.matrix(node_roles), np.matrix(role_features)
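A hedged sketch of calling get_factorization on a toy matrix (in RolX, V would hold nonnegative node features; the random matrix here is an assumption, and NMF is assumed imported as in the function):

import numpy as np

V = np.abs(np.random.RandomState(0).randn(6, 4))  # toy node-feature matrix
node_roles, role_features = get_factorization(V, num_roles=2)
print(node_roles.shape, role_features.shape)  # (6, 2) (2, 4)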
Example #11
File: NFMplot.py  Project: Gurado/pipelines
def performNMF(M, fragmentsLookupTable, fragmentsCount):

    if (args.verbose):
        print >> sys.stdout, "- %s START   : calculating NMF" % (timeStamp())

    t0 = time()
    model = NMF(
        n_components=args.components,
        init='nndsvd',
        beta=10000.0,
        max_iter=1000,
        tol=5e-3,
        sparseness='components')
    model.fit(M)
    train_time = (time() - t0)
    components_ = model.components_

    # print >> sys.stdout, components_
    N = model.transform(M)

    if (args.verbose):
        print >> sys.stdout, "- %s FINISH  : calculating NMF" % (timeStamp())

    if (args.verbose):
        print >> sys.stdout, "- %s START   : mapping components" % (
            timeStamp())

    # convert components into locations
    for i in xrange(args.components):
        output = gzip.open(
            args.outdir + "/NMF_component_" + str(i) + ".txt.gz", 'wb')
        if (args.verbose):
            print >> sys.stdout, "-            : processing component %d" % (i)

        try:

            for j in xrange(model.components_[i].shape[0]):
                # print >> sys.stdout, model.components_[i]
                # print >> sys.stdout, "Max value %f" (numpy.max(model.components_[i]))
                # if (model.components_[i][j] != 0):
                fragment1 = j / fragmentsCount
                fragment2 = j % fragmentsCount

                (chr1, midpoint1) = fragmentsLookupTable[fragment1]
                (chr2, midpoint2) = fragmentsLookupTable[fragment2]
                output.write(
                    "%s\t%i\t%s\t%i\t%f\n" % (chr1, midpoint1, chr2, midpoint2,
                                              model.components_[i][j]))

        finally:
            output.close()

    if (args.verbose):
        print >> sys.stdout, "- %s FINISH  : mapping components" % (
            timeStamp())

    return (N, model)
Example #12
    def fit(self):
        nmf = NMF(**self.fit_parameters)
        nmf.fit(self.input_data)

        self.output_data = nmf.transform(self.input_data)
        self.mapper_data = nmf.components_
        self.model_attributes = {"n_topics": nmf.n_components,
                                 }
        self._log_model_results()
        return self
Example #13
  def _get(self, index, block, shape):

      offset = (asarray(index[1]) * asarray(shape))[1:]
      dims = block.shape[1:]
      max_size = prod(dims) / 2 if self.max_size == 'full' else self.max_size

      # reshape to t x spatial dimensions
      data = block.reshape(block.shape[0], -1)

      # build and apply NMF model to block
      model = SKNMF(self.k, max_iter=self.max_iter)
      model.fit(clip(data, 0, inf))

      # reconstruct sources as spatial objects in one array
      components = model.components_.reshape((self.k,) + dims)

      # convert from basis functions into shape
      # by median filtering (optional), applying a percentile threshold,
      # finding connected components and removing small objects
      combined = []
      for component in components:
          tmp = component > percentile(component, self.percentile)
          regions = remove_small_objects(label(tmp), min_size=self.min_size)
          ids = unique(regions)
          ids = ids[ids > 0]
          for ii in ids:
              r = regions == ii
              r = median_filter(r, 2)
              coords = asarray(where(r)).T + offset
              if (size(coords) > 0) and (size(coords) < max_size):
                  combined.append(one(coords))

      # merge overlapping sources
      if self.overlap is not None:

          # iterate over source pairs and find a pair to merge
          def merge(sources):
              for i1, s1 in enumerate(sources):
                  for i2, s2 in enumerate(sources[i1+1:]):
                      if s1.overlap(s2) > self.overlap:
                          return i1, i1 + 1 + i2
              return None

          # merge pairs until none left to merge
          pair = merge(combined)
          testing = True
          while testing:
              if pair is None:
                  testing = False
              else:
                  combined[pair[0]] = combined[pair[0]].merge(combined[pair[1]])
                  del combined[pair[1]]
                  pair = merge(combined)

      return combined
Example #14
def get_latent_vector(X):
    # for language: n_components=150
    # for repo: n_components=?
    model = NMF(n_components=150, init='nndsvd', max_iter=1000, random_state=1126)
    print('NMF', model)
    model.fit(X)
    W = model.transform(X)
    H = model.components_

    normalized_matrix = normalize(W, axis=1, norm='l2')
    return normalized_matrix
Example #15
def reduce_dimensions(total_mat, n_topics):
    """
    Calculates and returns the NMF document-topic matrix W.
    Input is data matrix, shape (n_samples, n_features)
    returns W array, shape (n_samples, n_components)
    """
    nmf = NMF(n_components=n_topics, random_state=42, alpha=.2, l1_ratio=0.5)
    W = nmf.fit_transform(total_mat)
    return W
Example #16
class MatrixFactorization:
    def __init__(self):
        self.nmf = NMF()

    def fit(self, X):
        self.nmf.fit(X)
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)

    def predict(self, X):
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)
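A hedged usage sketch for MatrixFactorization (the toy data is an assumption). With n_components left at its default, sklearn's NMF keeps one component per feature, so the reconstruction is nearly exact:

import numpy as np

mf = MatrixFactorization()
X = np.abs(np.random.RandomState(1).rand(20, 8))
X_hat = mf.fit(X)               # fit returns the reconstruction W @ H
print(abs(X - X_hat).mean())    # small mean reconstruction error
print(mf.predict(X[:5]).shape)  # (5, 8): reconstructions for the given rows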
Example #17
File: features.py  Project: linhr/dighub
class DescriptionNMF(DescriptionVector):
    def __init__(self, dataset, n_topics=None, vocabulary_size=None):
        super(DescriptionNMF, self).__init__(dataset, vocabulary_size)
        self.n_topics = n_topics
        
        self.transformer = NMF(n_components=n_topics)
        self.transformer.fit(self.features)
        
    def get_components(self):
        words = self.vectorizer.get_feature_names()
        words = numpy.array(words, dtype=object)
        indices = numpy.argsort(-numpy.absolute(self.transformer.components_))
        return words[indices]
Example #18
def nmf(X, n, binary, d):
	name = "nmf" + str(n)
	if binary:
		name = name + "_binary_" + d
	print name
	model = NMF(n_components = n)
	model.fit(X)
	A = model.components_
	A_T = A.transpose()
	Z = model.fit_transform(X)
	def get_prob(i, j):
		return np.sum((Z[i,:]*A[:,j]))
	display_results(name, get_prob, binary, d)
Example #19
def learnNMFDict(features, components=25):
	from sklearn.decomposition import NMF

	nmfHOG = NMF(n_components=components)
	nmfHOF = NMF(n_components=components)

	nmfHOG.fit(np.array([x['hog'] for x in features]).T)
	nmfHOF.fit(np.array([x['hof'] for x in features]).T)

	hogComponents = nmfHOG.components_.T
	hofComponents = nmfHOF.components_.T

	return hogComponents, hofComponents
Example #20
def nmf_new(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # Numerical solver to use: ‘pg’ is a Projected Gradient solver (deprecated).
    # ‘cd’ is a Coordinate Descent solver (recommended).
    model = NMF(n_components=n_components, init=init,
                random_state=random_state)
    # TODO: refactor the four blocks below into a loop
    # fit_transform is more efficient than calling fit followed by transform,
    # so each matrix is factorized exactly once
    patient_strat = np.argmax(model.fit_transform(mut_final), axis=1).copy()
    gene_comp = model.components_.copy()

    patient_strat_diff = np.argmax(
        model.fit_transform(mut_diff), axis=1).copy()
    gene_comp_diff = model.components_.copy()

    patient_strat_mean_qn = np.argmax(
        model.fit_transform(mut_mean_qn), axis=1).copy()
    gene_comp_mean_qn = model.components_.copy()

    patient_strat_median_qn = np.argmax(
        model.fit_transform(mut_median_qn), axis=1).copy()
    gene_comp_median_qn = model.components_.copy()

    return (gene_comp, patient_strat,
            gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
Example #21
def nmf_old(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # fit followed by transform
    model = NMF(n_components=n_components, init=init,
                random_state=random_state)
    # TODO: refactor the four blocks below into a loop
    model.fit(mut_final)
    gene_comp = model.components_.copy()
    patient_strat = np.argmax(model.transform(mut_final), axis=1).copy()

    model.fit(mut_diff)
    gene_comp_diff = model.components_.copy()
    patient_strat_diff = np.argmax(
        model.transform(mut_diff), axis=1).copy()

    model.fit(mut_mean_qn)
    gene_comp_mean_qn = model.components_.copy()
    patient_strat_mean_qn = np.argmax(
        model.transform(mut_mean_qn), axis=1).copy()

    model.fit(mut_median_qn)
    gene_comp_median_qn = model.components_.copy()
    patient_strat_median_qn = np.argmax(
        model.transform(mut_median_qn), axis=1).copy()

    return (gene_comp, patient_strat,
            gene_comp_diff, patient_strat_diff,
            gene_comp_mean_qn, patient_strat_mean_qn,
            gene_comp_median_qn, patient_strat_median_qn)
Example #22
def nmf_faces(X_train, X_test):
    # Build NMF models with 10, 50, 100 and 500 components
    # this list will hold the back-transformed test data
    reduced_images = []
    for n_components in [10, 50, 100, 500]:
        # build the NMF model
        nmf = NMF(n_components=n_components, random_state=0)
        nmf.fit(X_train)
        # transform the test data (afterwards has n_components many dimensions)
        X_test_nmf = nmf.transform(X_test)
        # back-transform the transformed test-data
        # (afterwards it's in the original space again)
        X_test_back = np.dot(X_test_nmf, nmf.components_)
        reduced_images.append(X_test_back)
    return reduced_images
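The back-transform step above is just the product W_test @ H. A standalone sketch on random data (the shapes are assumptions, not the faces dataset):

import numpy as np
from sklearn.decomposition import NMF

X_train = np.abs(np.random.RandomState(0).rand(50, 30))
X_test = np.abs(np.random.RandomState(1).rand(10, 30))
nmf = NMF(n_components=10, random_state=0, max_iter=500)
nmf.fit(X_train)
X_test_nmf = nmf.transform(X_test)                 # (10, 10) component weights
X_test_back = np.dot(X_test_nmf, nmf.components_)  # (10, 30), back in input space
print(X_test_back.shape)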
Example #23
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0,
               max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert_less(pnmf.fit(X).reconstruction_err_, 0.1)
Example #24
def get_topics(n_components=10, n_top_words=15, print_output=True):
	custom_stop_words = make_stop_words(new_stop_words)
	tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
	tfidf = tfidf_vectorizer.fit_transform(release_texts)
	tfidf = row_normalize_tfidf(tfidf)

	nmf = NMF(n_components=n_components, random_state=1)
	# nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
	nmf.fit(tfidf)
	W = nmf.transform(tfidf)
	

	if print_output:
		print("\nTopics in NMF model:")
		tfidf_feature_names = tfidf_vectorizer.get_feature_names()
		print_top_words(nmf, tfidf_feature_names, n_top_words)
	
	return tfidf, nmf, W
Example #25
def nmf(x, n_topics):
  print("Non Negative Matrix Factorization (NMF), topics={}".format(n_topics))
  nmf = NMF(
    n_components=n_topics, 
    random_state=1, 
    alpha=.1, 
    l1_ratio=.5
  )
  x = to_ndarray(x)
  nmf_fitted = nmf.fit( x )
  return nmf_fitted.transform(x), nmf_fitted
Example #26
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(
        5,
        solver=solver,
        init="nndsvdar",
        random_state=0,
        max_iter=600,
    )
    X = np.abs(rng.randn(6, 5))
    assert pnmf.fit(X).reconstruction_err_ < 0.1
Example #27
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean, pca.components_])

    # ICA treats sequential observations as related.  Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean, ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be greater than zero
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp
Example #28
class NMFSpectrum(SparseApproxSpectrum):
    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self,**kwargs)

    def extract_codes(self, X, **kwargs):
        self.standardize=False
        self._extract_data_patches(X)
        kwargs.setdefault('sparseness','components')
        kwargs.setdefault('init','nndsvd')
        kwargs.setdefault('beta',0.5)
        print("NMF...")
        self.model = NMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)        
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
Example #29
def plot_reconstruction_error(matrix, lower, upper):
    """Plotting function for reconstruction error of NMF models vs the number of components
    Parameters
    ----------
    matrix : pivoted input matrix for NMF fitting
    lower : lower bound on number of components
    upper : upper bound on number of components
    Returns
    -------
    ax : scatter plot of reconstruction error vs. number of components
    """
    nmf_results = []
    for k in range(lower, upper + 1):
        model = NMF(n_components=k, init='random', random_state=0)
        model.fit(matrix)
        nmf_results.append((k, model.reconstruction_err_))
    ax = plt.scatter(*zip(*nmf_results))
    plt.xlabel('N Clusters')
    plt.ylabel('Reconstruction Error')
    plt.title('N Clusters Vs Associated Reconstruction Error')
    return ax
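A hedged usage sketch (the random matrix and bounds are assumptions; NMF and plt are assumed imported as in the function). Reconstruction error falls monotonically with k, so the elbow of the curve is what usually guides the choice:

import numpy as np
import matplotlib.pyplot as plt

matrix = np.abs(np.random.RandomState(0).rand(40, 12))
ax = plot_reconstruction_error(matrix, lower=1, upper=8)
plt.savefig('reconstruction_error.png')  # persist the figure described in the docstring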
Example #30
class NMFFeatures(object):
    def __init__(self, num_components=128, features=None):
        self.nmf = None
        self.feature_extractor = None
        self.num_components = num_components
        self.features = features
    
    def fit(self, X, y=None):
        self.nmf = NMF(n_components=self.num_components, max_iter=50, random_state=42)
        self.feature_extractor = TfidfVectorizer(tokenizer=utils.get_tokens, ngram_range=(1, 1), stop_words='english', vocabulary=self.features)
        
        transformed_features = self.feature_extractor.fit_transform(X)
        self.nmf.fit(transformed_features)
        
    def transform(self, X):
        transformed_features = self.feature_extractor.transform(X)
        return self.nmf.transform(transformed_features)
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
Example #31
File: spy.py  Project: Jash271/YouGlance
    def segregate_topic(self, thresh=None):
        # thresh overrides the instance's topic count; otherwise use self.topic
        n_components = self.topic if thresh is None else thresh
        d = {}
        tfidf = TfidfVectorizer(max_df=0.96,
                                min_df=2,
                                stop_words="english")
        x = tfidf.fit_transform(self.df["cleaned_text"])
        nmf_model = NMF(n_components=n_components, random_state=21)
        nmf_model.fit(x)
        for index, topic in enumerate(nmf_model.components_):
            d[index] = [
                tfidf.get_feature_names()[i] for i in topic.argsort()[-20:]
            ]

        result = nmf_model.transform(x)

        y = result.argmax(axis=1)
        self.df["topic_label"] = y

        return (self.df, d)
Example #32
    def fit(self, num_factors=100,
            l1_ratio = 0.5,
            solver = "multiplicative_update",
            init_type = "random",
            beta_loss = "frobenius",
            verbose = False,
            random_seed = None):


        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        if solver not in self.SOLVER_VALUES:
           raise ValueError("Value for 'solver' not recognized. Acceptable values are {}, provided was '{}'".format(self.SOLVER_VALUES.keys(), solver))

        if init_type not in self.INIT_VALUES:
           raise ValueError("Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'".format(self.INIT_VALUES, init_type))

        if beta_loss not in self.BETA_LOSS_VALUES:
           raise ValueError("Value for 'beta_loss' not recognized. Acceptable values are {}, provided was '{}'".format(self.BETA_LOSS_VALUES, beta_loss))

        start_time = time.time()
        self._print("Computing NMF decomposition...")

        nmf_solver = NMF(n_components  = num_factors,
                         init = init_type,
                         solver = self.SOLVER_VALUES[solver],
                         beta_loss = beta_loss,
                         random_state = random_seed,
                         l1_ratio = l1_ratio,
                         shuffle = True,
                         verbose = verbose,
                         max_iter = 500)

        nmf_solver.fit(self.URM_train)

        self.ITEM_factors = nmf_solver.components_.copy().T
        self.USER_factors = nmf_solver.transform(self.URM_train)

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time()-start_time)
        self._print("Computing NMF decomposition... done in {:.2f} {}".format( new_time_value, new_time_unit))
Example #33
    def fit(self, data):
        """
        fit searches for the background profile using NMF decomposition
        - (N, M, M) image series --> (N, M**2) flattened images
        - (N, M**2) = (N, n_components) @ (n_components, M**2) NMF decomposition
        - background: (n_components, M**2) --> (important_components, M**2)

        Parameters
        ----------
        data : np.ndarray
            Input data (series of 2D images, 3D total)
        self.n_components : int, optional
            n_components for dimensionality reduction, by default 5
        self.important_components : int, optional
            number of components to account for, by default 1

        Returns
        -------
        np.ndarray
            Background profile
        """
        X = data.reshape(data.shape[0], -1)

        nmf = NMF(
            n_components=self.n_components,
        )

        nmf.fit(X)
        coeffs = nmf.transform(X)
        bg_full = nmf.components_[: self.important_components, :].reshape(
            (-1, *data.shape[1:])
        )

        # memorize scalefactors and background
        self._scales = coeffs[:, : self.important_components].reshape(1, -1)
        self._bg = bg_full

        return bg_full
Example #34
def predict_new_input(mat, mv, mv_re, id_list,
                      rate_list):  # mv is the hashing dictionary
    n, m = mat.shape
    new_row = np.zeros(m).reshape((1, m))
    for id, rate in zip(id_list, rate_list):
        new_row[0, mv_re[id]] = rate
    mat = np.append(mat, new_row, axis=0)
    n = n + 1
    # check whether to discard the two lines below:
    known = [x for x in range(m) if mat[-1, x] != 0]
    desired = [
        x for x in range(m) if mat[-1, x] == 0
    ]  # stores all the index of movies in the matrix that is to predict ratings

    predicts = dict()  # key is movie_id and value is the predicted ratings

    T1 = mat[:, known]
    model = NMF(n_components=3,
                tol=0.005)  # without n_components, NMF keeps one component per column
    W1 = model.fit_transform(T1)  # fit_transform avoids fitting twice
    H1 = model.components_
    #print model.reconstruction_err_
    #print 'W1', W1[-1,:]

    for i in desired:  # each time focus on an even smaller matrix with only one desired entry to predict
        cols = known + [i]  # build a fresh column list so `known` is not mutated across iterations
        T2 = mat[:-1, cols]
        model2 = NMF(
            n_components=3, tol=0.005
        )  # if n_components is not specified, there will be 23846 components
        W2 = model2.fit_transform(T2)
        H2 = model2.components_
        #print model2.reconstruction_err_
        #print 'H2', H2[:,-1]
        predicts[mv[i]] = np.dot(H2[:, -1], W1[-1, :])
    #print predicts
    return predicts
Example #35
    def denoise(self, data, center, radius=45):
        """
        nmf_denoise performs NMF-decomposition based denoising
        - (N, M, M) image series --> (N, M**2) flattened images
        - (N, M**2) = (N, n_components) @ (n_components, M**2) NMF decomposition
        - background: (n_components, M**2) --> (important_components, M**2)
        - scales: (N, n_components) --> (N, important_components)
        - scaled_background = scales @ background
        - return arr - scaled_background

        Parameters
        ----------
        data : np.ndarray
            Input data (series of 2D images, 3D total)
        center : tuple
            (corner_x, corner_y) tuple
        radius : int, optional
            mask radius passed to apply_mask, by default 45
        self.n_components : int, optional
            n_components for dimensionality reduction, by default 5
        self.important_components : int, optional
            number of components to account for, by default 1

        Returns
        -------
        np.ndarray
            Denoised data
        """
        img_shape = data.shape[1:]
        X = data.reshape(data.shape[0], -1)

        nmf = NMF(n_components=self.n_components)
        nmf.fit(X)
        coeffs = nmf.transform(X)

        bg_full = nmf.components_
        bg_scaled = (coeffs[:, :self.important_components]
                     @ bg_full[:self.important_components, :]).reshape(
                         data.shape[0], *img_shape)

        return apply_mask(data - bg_scaled, radius=radius, center=center)
Example #36
def get_NMF(fileData,normalized_axis=1,norm='l2'):
    data = np.load(fileData)

    n_components = np.size(np.unique(data['labels']))
    features = data['data']
    print features
    features = as_float_array(features)
    if normalized_axis != None:
        features = normalize(features,norm=norm,axis=normalized_axis)
        
    model = NMF(n_components = n_components)
    print n_components
    print model
    model.fit(np.transpose(features))
    G = model.components_
    print G
    labels_pred = np.zeros(features.shape[0])
    for i in xrange(G.shape[1]):
        temp2= np.argmax(G[:,i])
        labels_pred[i] = temp2
    labels_true = data['labels']
    return labels_true,labels_pred,features
Example #37
def in36():
    from sklearn.datasets import fetch_lfw_people
    people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
    mask = np.zeros(people.target.shape, dtype=bool)
    for target in np.unique(people.target):
        mask[np.where(people.target == target)[0][:50]] = 1
    x_people = people.data[mask]
    y_people = people.target[mask]
    x_people = x_people / 255
    from sklearn.neighbors import KNeighborsClassifier
    x_train, x_test, y_train, y_test = train_test_split(x_people,
                                                        y_people,
                                                        stratify=y_people,
                                                        random_state=0)
    from sklearn.decomposition import NMF
    nmf = NMF(n_components=15, random_state=0)
    nmf.fit(x_train)
    x_train_nmf = nmf.transform(x_train)
    x_test_nmf = nmf.transform(x_test)
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(x_train_nmf, y_train)
    print(knn.score(x_test_nmf, y_test))
Example #38
def retrain_nmf():
    #this is a function which retrains periodically my nmf model
    #it should be trained on the latest user-ratings matrix available
    R = np.array(session.query(umr).all()).T
    #create a model and set the hyperparameters
    # model assumes R ~ PQ'
    model = NMF(
        n_components=10,
        init='random',
        random_state=10,
    )
    model.fit(R)
    Q = model.components_  # movie-genre matrix
    P = model.transform(R)  # user-genre matrix
    error = model.reconstruction_err_  #reconstruction error
    nR = np.dot(P, Q)  #the reconstructed matrix
    #pickle my model
    list_pickle_path = os.path.dirname(os.path.abspath(__file__)) + '/nmf.pkl'
    nmf_pickle = open(list_pickle_path, 'wb')
    picklerick.dump(model, nmf_pickle)
    nmf_pickle.close()
    return
Example #39
class NMFImpl:
    def __init__(
        self,
        n_components=None,
        init=None,
        solver="cd",
        beta_loss="frobenius",
        tol=0.0001,
        max_iter=200,
        random_state=None,
        alpha=0.0,
        l1_ratio=0.0,
        verbose=0,
        shuffle=False,
    ):
        self._hyperparams = {
            "n_components": n_components,
            "init": init,
            "solver": solver,
            "beta_loss": beta_loss,
            "tol": tol,
            "max_iter": max_iter,
            "random_state": random_state,
            "alpha": alpha,
            "l1_ratio": l1_ratio,
            "verbose": verbose,
            "shuffle": shuffle,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #40
def get_NMF(fileData, normalized_axis=1, norm='l2'):
    data = np.load(fileData)

    n_components = np.size(np.unique(data['labels']))
    features = data['data']
    print features
    features = as_float_array(features)
    if normalized_axis != None:
        features = normalize(features, norm=norm, axis=normalized_axis)

    model = NMF(n_components=n_components)
    print n_components
    print model
    model.fit(np.transpose(features))
    G = model.components_
    print G
    labels_pred = np.zeros(features.shape[0])
    for i in xrange(G.shape[1]):
        temp2 = np.argmax(G[:, i])
        labels_pred[i] = temp2
    labels_true = data['labels']
    return labels_true, labels_pred, features
Example #41
    def __init__(self, HAll, NComp):
        HAll = np.vstack(HAll)
        # print(len(data_samples))
        # print((tfidf.shape))
        #Fit the NMF model
        nmf_model = NMF(n_components=NComp,
                        random_state=1,
                        alpha=.1,
                        l1_ratio=.5)

        nmf_model.fit(HAll)
        self.U = nmf_model.transform(HAll)
        self.L = nmf_model.components_
Example #42
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0, n_components=2)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].set_xlim([0, 12])
    axes[0].set_ylim([0, 12])
    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1,
                  head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].set_xlim([0, 12])
    axes[1].set_ylim([0, 12])
    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
                  head_width=.3, color='k')

    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
Example #43
def nmf(DATA, nTOPICS, nWORDS):
    # ----- Topic Modelling using Non-negative Matrix Factorisation(NMF)
    # Instantiate Tfidf model
    tfidf = TfidfVectorizer(max_df=0.9, min_df=2, stop_words='english')
    # Create a document-term matrix with the tfidf model
    dtm = tfidf.fit_transform(DATA)

    # Instantiate NMF model
    nmf_model = NMF(n_components=nTOPICS)
    # Apply non-negative matrix factorisation to the document-term matrix
    nmf_model.fit(dtm)
    # nmf_model.transform() returns a matrix with coefficients that shows how much each document belongs to a topic
    topicResults = nmf_model.transform(dtm)

    # Store the top nWORDS words per topic in the topics dict. These are the words that describe the topic.
    topics = {}
    for i, t in enumerate(nmf_model.components_):
        # Negating an array causes the highest value to be the lowest value and vice versa
        topWordsIndex = (-t).argsort()[:nWORDS]
        topics[i] = [tfidf.get_feature_names()[i] for i in topWordsIndex]

    return topicResults, topics
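A hedged usage sketch for the nmf helper above (the four documents are illustrative; assumes a scikit-learn version that still provides get_feature_names):

docs = [
    "the cat sat on the mat and the cat slept",
    "a cat and a dog played on the mat",
    "stock markets fell as interest rates rose",
    "investors sold stock as rates and markets fell",
]
topicResults, topics = nmf(docs, nTOPICS=2, nWORDS=3)
print(topics)                       # top words per topic, e.g. {0: [...], 1: [...]}
print(topicResults.argmax(axis=1))  # dominant topic per document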
Example #44
def df_with_clean_text(data, number):
    df = pd.read_json(data, orient='split')
    df['clean_text'] = df['Text'].apply(lambda x: clean_text2(x))
    # use tfidf, removing tokens that don't appear in at least 3 documents
    vect = TfidfVectorizer(min_df=3, stop_words='english')
    X = vect.fit_transform(df.clean_text)
    # NMF
    model = NMF(n_components=number, random_state=5)

    # Fit the model to TF-IDF
    model.fit(X)

    components_df = pd.DataFrame(model.components_, columns=vect.get_feature_names())
    print(components_df)
    Topic = []
    for i in range(number):
        Topic.append(i + 1)

    components_df["Topic"] = Topic
    return components_df.to_json(date_format='iso', orient='split')
Example #45
def grid_search_nmf_ncomponents(tfidf, folds, low, high):
	tfidf_dense = tfidf.toarray()
	y_mse_tts = []  # collects (n_components, mean MSE) pairs
	mse_min = 99
	mse_min_ncomponents = -1
	for i in xrange(low, high + 1):
		print 'Fitting n_components = %d ...' %i
		mse_arr = []
		for j in xrange(1, folds + 1):
			print 'Testing fold # %d' %j
			test_size = 1./folds
			A_train, A_test = tfidf_traintestsplit(tfidf, test_size=test_size)
			nmf_temp = NMF(n_components=i, random_state=1)
			nmf_temp.fit(A_train)
			W = nmf_temp.transform(A_train)
			H = nmf_temp.components_
			tfidf_pred = np.dot(W, H)
			mse_fold = mean_squared_error(A_test.toarray(), tfidf_pred)
			mse_arr.append(mse_fold)
		mse_temp = np.mean(mse_arr)
		y_mse_tts.append((i, mse_temp))
		if mse_temp < mse_min:
			mse_min = mse_temp
			mse_min_ncomponents = i

		# cv = cross_val_score(nmf_temp, tfidf, scoring='mean_squared_error', cv=5)
		# nmf_temp.fit(tfidf)
		# W = nmf_temp.transform(tfidf)
		# H = nmf_temp.components_
		# tfidf_pred = np.dot(W, H)
		# mse_temp = mean_squared_error(tfidf_dense, tfidf_pred)
		# y_mse.append(mse_temp)
		# x_range.append(i)
		print 'MSE of n_components = %d: %.10f' %(i, mse_temp)
		print '-------------------------------'
		# if mse_temp < mse_min:
		# 	mse_min = mse_temp
		# 	mse_min_ncomponents = i
	return mse_min_ncomponents
Example #46
def grid_search_nmf_ncomponents(tfidf, folds, low, high):
    tfidf_dense = tfidf.toarray()
    y_mse_tts = []  # collects (n_components, mean MSE) pairs
    mse_min = 99
    mse_min_ncomponents = -1
    for i in xrange(low, high + 1):
        print 'Fitting n_components = %d ...' % i
        mse_arr = []
        for j in xrange(1, folds + 1):
            print 'Testing fold # %d' % j
            test_size = 1. / folds
            A_train, A_test = tfidf_traintestsplit(tfidf, test_size=test_size)
            nmf_temp = NMF(n_components=i, random_state=1)
            nmf_temp.fit(A_train)
            W = nmf_temp.transform(A_train)
            H = nmf_temp.components_
            tfidf_pred = np.dot(W, H)
            mse_fold = mean_squared_error(A_test.toarray(), tfidf_pred)
            mse_arr.append(mse_fold)
        mse_temp = np.mean(mse_arr)
        y_mse_tts.append((i, mse_temp))
        if mse_temp < mse_min:
            mse_min = mse_temp
            mse_min_ncomponents = i

        # cv = cross_val_score(nmf_temp, tfidf, scoring='mean_squared_error', cv=5)
        # nmf_temp.fit(tfidf)
        # W = nmf_temp.transform(tfidf)
        # H = nmf_temp.components_
        # tfidf_pred = np.dot(W, H)
        # mse_temp = mean_squared_error(tfidf_dense, tfidf_pred)
        # y_mse.append(mse_temp)
        # x_range.append(i)
        print 'MSE of n_components = %d: %.10f' % (i, mse_temp)
        print '-------------------------------'
        # if mse_temp < mse_min:
        # 	mse_min = mse_temp
        # 	mse_min_ncomponents = i
    return mse_min_ncomponents
Example #47
    def Quick_nmf(self, k=5, top=10, tfidf=None, print_tops=True, stop_words=[]):

        text = self.text
        labels = self.beer_names

        if tfidf is None:
            stopwords = set(list(ENGLISH_STOP_WORDS) + stop_words)
            tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=.8, min_df=.2,
                                    stop_words=stopwords, max_features=10000)

        X = tfidf.fit_transform(text)
        bag = np.array(tfidf.get_feature_names())
        self.bag_of_words = bag

        nmf = NMF(n_components = k)
        # nmf = TruncatedSVD(n_components = k)
        nmf.fit(X)
        W = nmf.transform(X) #len(beers),k
        H = nmf.components_ #k,len(beers)

        all_words = []
        for group in range(k):
            #idx of the top ten words for each group
            i_words = np.argsort(H[group,:])[::-1][:top]
            words = bag[i_words]
            all_words.append(words)

            i_label = np.argsort(W[:,group])[::-1][:top]

            if print_tops:
                print('-'*10)
                print('Group:',group)
                print('WORDS')
                for word in words:
                    print('-->',word)
                print('LABELS')
                for i in i_label:
                    print('==>',labels[i])
        return W,H,nmf,tfidf,all_words
Example #48
    def fit(self, num_factors=10,
            l1_ratio = 0.5,
            solver = "multiplicative_update",
            init_type = "random",
            beta_loss = "frobenius"):

        print('|{}| training |'.format(self.NAME))
        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        if solver not in self.SOLVER_VALUES:
           raise ValueError("Value for 'solver' not recognized. Acceptable values are {}, provided was '{}'".format(self.SOLVER_VALUES.keys(), solver))

        if init_type not in self.INIT_VALUES:
           raise ValueError("Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'".format(self.INIT_VALUES, init_type))

        if beta_loss not in self.BETA_LOSS_VALUES:
           raise ValueError("Value for 'beta_loss' not recognized. Acceptable values are {}, provided was '{}'".format(self.BETA_LOSS_VALUES, beta_loss))

        print("|{}| Computing NMF decomposition ... |".format(self.NAME))

        nmf_solver = NMF(n_components  = num_factors,
                         init = init_type,
                         solver = self.SOLVER_VALUES[solver],
                         beta_loss = beta_loss,
                         random_state = None,
                         l1_ratio = l1_ratio,
                         shuffle = True,
                         verbose=True,
                         max_iter = 500)

        nmf_solver.fit(self.URM_train)

        self.ITEM_factors = nmf_solver.components_.copy().T
        self.USER_factors = nmf_solver.transform(self.URM_train)

        self.r_hat = self.USER_factors.dot(self.ITEM_factors)

        print("|{}| Done |".format(self.NAME))
Example #49
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean,
                          pca.components_])

    # ICA treats sequential observations as related.  Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean,
                          ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be greater than zero
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp
Example #50
def run_nmf(X, vectorizer, n_topics=4, print_top_words=False):
    '''
    INPUT: Vectorized word array, vectorizer object, number of latent 
    features to uncover, whether to print the top words from each latent
    feature
    OUTPUT: Saves pickled NMF model, returns latent weights matrix that
    can be concatenated with our dataset as additional features  
    '''
    nmf = NMF(n_components=n_topics)
    nmf.fit(X)
    cPickle.dump(nmf, open('models/nmf.pkl', 'wb'))
    H = nmf.transform(X)

    if print_top_words==True:
        feature_names = vectorizer.get_feature_names()
        n_top_words   = 10    
        for topic_idx, topic in enumerate(nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    return H 
Example #51
def setup_nmf(all_ratings=None, engine=None, number_of_genres=10):
    '''Fit an NMF model on the user-movie ratings matrix.'''
    print('Start loading NMF')
    # Create a sparse matrix of user, movie ids and their ratings (User Movie Ratings UMR)
    user_movie_ratings = pd.pivot_table(all_ratings, values='rating', index='userId', columns='movieId')

    # Fill sparse matrix' NaNs with 0 to make it dense
    user_movie_id_ratings_matrix = user_movie_ratings.fillna(0)

    # Create and fit an NMF model
    m = NMF(n_components=number_of_genres)
    m.fit(user_movie_id_ratings_matrix)

    # Create a movie-genre matrix
    # Q: movie-matrix. Each genre (row) has a coefficient for each movie (columns). Number of genres is set by the NMF hyperparameter n_components=2. Q is the film submatrix.
    genre_movie_matrix = m.components_

    # P: Create a user submatrix
    # P = m.transform(user_movie_id_ratings_matrix)
    print('Done loading NMF')

    return m, genre_movie_matrix, user_movie_id_ratings_matrix
Example #52
def ncimages(images, n_components=10, n_eigen=10):
    """Non-negtive images

    apply NMF
    
    Arguments:
        images {List[Image]} -- list of images
    
    Keyword Arguments:
        n_components {number} -- number of components (default: {10})
        n_eigen {number} -- number of eigen images (default: {10})
    """

    size = images[0].size
    mode = images[0].mode
    data = PIL_ext.tomatrix(images, 'col')
    nmf = NMF(n_components=n_components)
    nmf.fit(data)
    eigens = nmf.transform(data)
    eigens *= 256
    return [
        PIL_ext.toimage(eigens[:, i], size, mode=mode) for i in range(n_eigen)
    ]
Example #53
File: nmf.py  Project: vjlbym/thunder
    def extract(self, block):
        from numpy import clip, inf, percentile, asarray, where, size, prod
        from sklearn.decomposition import NMF
        from skimage.measure import label
        from skimage.morphology import remove_small_objects

        # get dimensions
        n = self.componentsPerBlock
        dims = block.shape[1:]

        # handle maximum size
        if self.maxArea == "block":
            maxArea = prod(dims) / 2
        else:
            maxArea = self.maxArea

        # reshape to be t x all spatial dimensions
        data = block.reshape(block.shape[0], -1)

        # build and apply NMF model to block
        model = NMF(n, max_iter=self.maxIter)
        model.fit(clip(data, 0, inf))

        # reconstruct sources as spatial objects in one array
        comps = model.components_.reshape((n, ) + dims)

        # convert from basis functions into shape
        # by finding connected components and removing small objects
        combined = []
        for c in comps:
            tmp = c > percentile(c, self.percentile)
            shape = remove_small_objects(label(tmp), min_size=self.minArea)
            coords = asarray(where(shape)).T
            if (size(coords) > 0) and (size(coords) < maxArea):
                combined.append(Source(coords))

        return combined
Example #54
def var_embedding(a,
                  b,
                  df=None,
                  n_components=2,
                  ret_b_emb=False,
                  log_cts=False):
    """
    Get embeddings of a wrt b.
    """
    if df is None:
        df = conf.df
    sdf = wcooc.pairs_to_cooc_sparse_df(df[[a, b]].itertuples(index=False,
                                                              name=None))
    if log_cts:
        sdf = np.log10(sdf + 1)
    X = sdf.sparse.to_coo()

    nmf = NMF(n_components=n_components)
    nmf.fit(X)
    emb_a = pd.DataFrame(
        nmf.components_.T,
        index=sdf.columns,
        columns=[f"n{i}" for i in range(1, n_components + 1)],
    )
    emb_a.index.name = a
    emb_a.columns.name = b

    if ret_b_emb:
        emb_b = pd.DataFrame(
            nmf.fit_transform(X),
            index=sdf.index,
            columns=[f"n{i}" for i in range(1, n_components + 1)],
        )
        emb_b.index.name = b
        emb_b.columns.name = a
        return emb_a, emb_b
    return emb_a
Example #55
def my_cross_val_score(CZ,
                       n_comp=3,
                       model='pca',
                       k=5,
                       random_state=False,
                       shuffle=True):
    if shuffle == True:
        save = np.copy(CZ)
        np.random.shuffle(CZ)
    kf = KFold(n_splits=k)
    cv = []
    pca = decomposition.PCA(n_components=n_comp, random_state=random_state)
    nmf = NMF(n_components=n_comp, random_state=random_state)
    kmeans = KMeans(n_clusters=n_comp, random_state=random_state)
    ksvd = ApproximateKSVD(n_components=n_comp)
    for train, test in kf.split(CZ):
        cz_test = CZ[test]
        cz_train = CZ[train]
        if model == 'pca':
            pca.fit(cz_train)
            CZ_reconstructed = pca.inverse_transform(pca.transform(cz_test))
        elif model == 'nmf':
            nmf.fit(cz_train)
            CZ_reconstructed = np.dot(nmf.transform(cz_test), nmf.components_)
        elif model == 'k_means':
            kmeans.fit(cz_train)
            CZ_reconstructed = kmeans.cluster_centers_[kmeans.predict(cz_test)]
        elif model == 'k_svd':
            meantr = np.mean(cz_train, axis=1)[:, np.newaxis]
            meantest = np.mean(cz_test, axis=1)[:, np.newaxis]
            dictionary = ksvd.fit(cz_train - meantr).components_
            gamma = ksvd.transform(cz_test - meantest)
            CZ_reconstructed = gamma.dot(dictionary) + meantest
        cv.append(mean_squared_error(CZ_reconstructed, cz_test))
    if shuffle == True:
        CZ = save
    return cv
Example #56
File: nmf.py  Project: lnxpy/lale
class NMFImpl():
    def __init__(self,
                 n_components=None,
                 init=None,
                 solver='cd',
                 beta_loss='frobenius',
                 tol=0.0001,
                 max_iter=200,
                 random_state=None,
                 alpha=0.0,
                 l1_ratio=0.0,
                 verbose=0,
                 shuffle=False):
        self._hyperparams = {
            'n_components': n_components,
            'init': init,
            'solver': solver,
            'beta_loss': beta_loss,
            'tol': tol,
            'max_iter': max_iter,
            'random_state': random_state,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'verbose': verbose,
            'shuffle': shuffle
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #57
def zero_nmf_model(ratings):
    # Create a sparse matrix of all user ratings for all movies
    reviews = pd.pivot_table(ratings, 'rating', 'userid', 'movieid')
    reviews = reviews.fillna(0)

    # instantiate the NMF model
    model = NMF(n_components=42, init='random', random_state=42)

    model.fit(reviews)

    # Note: R matrix is reviews
    Q = model.components_  # movie-feature matrix
    P = model.transform(reviews)  # user-feature matrix

    # dot product of P and Q is our Rhat (Rpredictions)
    Rhat = np.dot(P, Q)

    # 610 users, 9724 movies
    # Rhat.shape

    # only useful to compare against other models
    # model.reconstruction_err_

    return Rhat, model
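A hedged sketch of calling zero_nmf_model on a toy ratings frame (the data is illustrative; pd, np, and NMF are assumed imported as in the function). Note that init='random' lets the function's n_components=42 exceed the toy matrix's dimensions:

import numpy as np
import pandas as pd

ratings = pd.DataFrame({'userid':  [1, 1, 2, 2, 3, 3],
                        'movieid': [10, 20, 10, 30, 20, 30],
                        'rating':  [5., 3., 4., 2., 1., 4.]})
Rhat, model = zero_nmf_model(ratings)
print(np.round(Rhat, 2))  # predicted ratings: one row per user, one column per movie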
Example #58
def run_nmf(X, vectorizer, n_topics=4, print_top_words=False):
    '''
    INPUT: Vectorized word array, vectorizer object, number of latent
    features to uncover, whether to print the top words from each latent
    feature
    OUTPUT: Saves pickled NMF model, returns latent weights matrix that
    can be concatenated with our dataset as additional features
    '''
    nmf = NMF(n_components=n_topics)
    nmf.fit(X)
    cPickle.dump(nmf, open('../models/nmf.pkl', 'wb'))
    H = nmf.transform(X)

    if print_top_words == True:
        feature_names = vectorizer.get_feature_names()
        n_top_words = 15
        for topic_idx, topic in enumerate(nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]))
            print()

    return H
Example #59
def decomp(mix_met, component=0, method=2, comps=3):
    if method == 0:
        nmf = NMF(n_components=8, init='nndsvd')  # , random_state=0)
        nmf.fit(mix_met)
        # recover the mixing matrix from the fitted components so that
        # A1/A2/A3 are defined on this branch as well
        A = nmf.components_
        A1 = A[0, :].reshape(len(A[0, :]), 1)
        A2 = A[1, :].reshape(len(A[1, :]), 1)
        if comps > 2: A3 = A[2, :].reshape(len(A[2, :]), 1)
    elif method == 1:
        nmf = FastICA(n_components=comps, random_state=0)
        nmf_1 = nmf.fit_transform(mix_met)
        A = nmf.mixing_
        A1 = A[:, 0].reshape(len(A[:, 0]), 1)
        A2 = A[:, 1].reshape(len(A[:, 0]), 1)
        if comps > 2: A3 = A[:, 2].reshape(len(A[:, 0]), 1)
    else:
        nmf = PCA(n_components=comps)
        nmf.fit(mix_met)
        A = nmf.components_
        A1 = A[0, :].reshape(len(A[0, :]), 1)
        A2 = A[1, :].reshape(len(A[1, :]), 1)
        A3 = A[2, :].reshape(len(A[2, :]), 1)
    nmf_1 = nmf.fit_transform(mix_met)
    sygnal = nmf_1[:, component]
    frst_comp = np.dot(nmf_1[:, 0].reshape(len(nmf_1), 1), A1.T)
    scnd_comp = np.dot(nmf_1[:, 1].reshape(len(nmf_1), 1), A2.T)
    if comps > 2: thrd_comp = np.dot(nmf_1[:, 2].reshape(len(nmf_1), 1), A3.T)
    if comps > 2 and len(nmf_1[:, 0]) < len(A1):
        return sygnal, np.array([frst_comp, scnd_comp, thrd_comp]), np.array(
            [nmf_1[:, 0], nmf_1[:, 1], nmf_1[:, 2]]), np.array([A1, A2, A3])
    elif comps > 2 and len(nmf_1[:, 0]) > len(A1):
        return sygnal, np.array([frst_comp, scnd_comp, thrd_comp]), np.array(
            [A1, A2, A3]), np.array([nmf_1[:, 0], nmf_1[:, 1], nmf_1[:, 2]])
    elif comps == 2 and len(nmf_1[:, 0]) > len(A1):
        return sygnal, np.array([frst_comp, scnd_comp]), np.array(
            [A1, A2]), np.array([nmf_1[:, 0], nmf_1[:, 1]])
    else:
        return sygnal, np.array([frst_comp, scnd_comp
                                 ]), np.array([nmf_1[:, 0],
                                               nmf_1[:,
                                                     1]]), np.array([A1, A2])
Example #60
def nonnegative_matrix_factorization(parameters,
                                     X_train,
                                     X_val,
                                     y_train,
                                     characterize=False):
    vec = StemmedTfidfVectorizer(**filter_parameters(parameters, 'vec'))

    vec.fit(X_train, y_train)
    X_train = vec.transform(X_train)
    X_val = vec.transform(X_val)

    nmf = NMF(random_state=0, **filter_parameters(parameters, 'nmf'))
    nmf.fit(X_train, y_train)
    X_train = nmf.transform(X_train)
    X_val = nmf.transform(X_val)

    if characterize:
        characterize_topics(nmf, vec.get_feature_names())

    model = parameters['model'](**filter_parameters(parameters, 'clf'))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return model, y_pred