def test_nmf_underflow():
    # Regression test for an underflow issue in _beta_divergence
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 10, 2, 2
    X = np.abs(rng.randn(n_samples, n_features)) * 10
    W = np.abs(rng.randn(n_samples, n_components)) * 10
    H = np.abs(rng.randn(n_components, n_features))

    X[0, 0] = 0
    ref = nmf._beta_divergence(X, W, H, beta=1.0)
    X[0, 0] = 1e-323
    res = nmf._beta_divergence(X, W, H, beta=1.0)
    assert_almost_equal(res, ref)
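As context for the magic number above: 1e-323 is a subnormal double, and dividing it by a typical entry of WH underflows to exactly zero, so a naive x * log(x / wh) term in the beta=1 (Kullback-Leibler) loss would blow up. A minimal sketch of that failure mode (illustrative only, not part of the test):

import numpy as np

# 1e-323 is a subnormal float64; dividing it by a moderate WH entry
# underflows to 0.0, and log(0.0) is -inf, which would poison a naive
# KL term of the form x * log(x / wh).
x = np.float64(1e-323)
wh = np.float64(50.0)
print(x / wh)          # 0.0
print(np.log(x / wh))  # -inf (with a RuntimeWarning)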
def compute_factorization_error(target, left_factor, right_factor, link,
                                beta_loss):
    if target is None:
        return 0
    elif link == "linear":
        return _beta_divergence(target, left_factor, right_factor, beta_loss,
                                square_root=True)
    elif link == "logit":
        return np.linalg.norm(target - sigmoid(np.dot(left_factor,
                                                      right_factor)))
def fit_transform(self, X, y=None, W=None, H=None):
    """Learn a NMF model for the data X and returns the transformed data.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Data matrix to be decomposed

    y : Ignored

    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.

    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.

    Returns
    -------
    W : array, shape (n_samples, n_components)
        Transformed data.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)

    W, H, n_iter_ = non_negative_factorization(
        X=X, W=W, H=H, n_components=self.n_components, init=self.init,
        update_H=True, solver=self.solver, beta_loss=self.beta_loss,
        tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
        l1_ratio=self.l1_ratio, regularization='both',
        random_state=self.random_state, verbose=self.verbose,
        shuffle=self.shuffle, distribution=self.distribution,
        N=self.N, D=self.D)

    self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss,
                                                square_root=True)

    self.n_components_ = H.shape[0]
    self.components_ = H
    self.n_iter_ = n_iter_

    return W
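This fit_transform mirrors the stock scikit-learn estimator apart from the extra distribution, N and D arguments. For the standard sklearn.decomposition.NMF, the same consistency check between reconstruction_err_ and _beta_divergence can be done by hand; a hedged sketch, using the private import path that older scikit-learn versions expose and that other examples on this page also use:

import numpy as np
from sklearn.decomposition import NMF
from sklearn.decomposition.nmf import _beta_divergence  # private helper, older path

X = np.abs(np.random.RandomState(0).randn(30, 8))
model = NMF(n_components=4, init='nndsvd', random_state=0)
W = model.fit_transform(X)

# reconstruction_err_ is the square-rooted Frobenius beta-divergence of X from W @ H
manual_err = _beta_divergence(X, W, model.components_, 'frobenius', square_root=True)
print(model.reconstruction_err_, manual_err)  # the two values should agree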
def nmf_prepare_tag(self, data_preprocessed, no_topics=32):
    '''prepare nmf, topic and tf vectorizer from data preprocessed'''
    from sklearn.decomposition.nmf import _beta_divergence

    documents = data_preprocessed.unique()[0:self.precision]
    nmf_tfidf, nmf_tfidf_vectorizer = self.nmf_init(documents)
    nmf = self.nmf_train(nmf_tfidf, no_topics)
    print('original reconstruction error automatically calculated -> TRAIN: ',
          nmf.reconstruction_err_)

    """ Manual reconstruction_err_ calculation
        -> use transform to get W
        -> ask fitted NMF to get H
        -> use available _beta_divergence-function to calculate desired metric
    """
    W_train = nmf.transform(nmf_tfidf)
    rec_error = _beta_divergence(nmf_tfidf, W_train, nmf.components_,
                                 'frobenius', square_root=True)
    print('Manually calculated rec-error train: ', rec_error)

    nmf_topicnames = ["Topic" + str(i) for i in range(nmf.n_components)]

    # Topic-Keyword Matrix
    nmf_df_topic_keyword = self.pd.DataFrame(nmf.components_)

    # Assign Column and Index
    nmf_df_topic_keyword.columns = nmf_tfidf_vectorizer.get_feature_names()
    nmf_df_topic_keyword.index = nmf_topicnames

    return nmf, nmf_df_topic_keyword, nmf_tfidf_vectorizer
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert_greater(previous_loss, loss)
                previous_loss = loss
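The guard in the inner loop reflects that the coordinate-descent ('cd') solver only minimizes the Frobenius norm (beta_loss=2); the remaining beta losses are therefore exercised with the multiplicative-update ('mu') solver only.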
def score(self, X):
    '''Returns the Kullback-Leibler divergence.

    Parameters
    ----------
    X : array-like (str), shape [n_samples,]
        The data to encode.

    Returns
    -------
    kl_divergence : float.
        The Kullback-Leibler divergence.
    '''
    unq_X, lookup = np.unique(X, return_inverse=True)
    unq_V = self.ngrams_count.transform(unq_X)

    if self.add_words:
        unq_V2 = self.word_count.transform(unq_X)
        unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

    self._add_unseen_keys_to_H_dict(unq_X)
    unq_H = self._get_H(unq_X)
    for slice in gen_batches(n=unq_H.shape[0], batch_size=self.batch_size):
        unq_H[slice] = _multiplicative_update_h(
            unq_V[slice], self.W_, unq_H[slice],
            epsilon=1e-3, max_iter=self.max_iter_e_step,
            rescale_W=self.rescale_W,
            gamma_shape_prior=self.gamma_shape_prior,
            gamma_scale_prior=self.gamma_scale_prior)
    kl_divergence = _beta_divergence(
        unq_V[lookup], unq_H[lookup], self.W_,
        'kullback-leibler', square_root=False)
    return kl_divergence
def test_beta_divergence():
    # Compare _beta_divergence with the reference _beta_divergence_dense
    n_samples = 20
    n_features = 10
    n_components = 5
    beta_losses = [0., 0.5, 1., 1.5, 2.]

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.clip(X, 0, None, out=X)
    X_csr = sp.csr_matrix(X)
    W, H = nmf._initialize_nmf(X, n_components, init='random',
                               random_state=42)

    for beta in beta_losses:
        ref = _beta_divergence_dense(X, W, H, beta)
        loss = nmf._beta_divergence(X, W, H, beta)
        loss_csr = nmf._beta_divergence(X_csr, W, H, beta)

        assert_almost_equal(ref, loss, decimal=7)
        assert_almost_equal(ref, loss_csr, decimal=7)
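_beta_divergence_dense is a reference helper defined alongside this test in scikit-learn's test module and is not reproduced here. As a rough, hedged sketch of the math it checks against (the element-wise beta-divergence of X from W @ H, ignoring the zero- and epsilon-handling details the real helper applies), one could write:

import numpy as np

def beta_divergence_dense_sketch(X, W, H, beta):
    """Illustrative element-wise beta-divergence of X from W @ H (dense only).

    Assumes beta != 0 and non-negative X; the reference helper in the
    scikit-learn test suite clamps WH and treats zeros more carefully.
    """
    WH = W @ H
    if beta == 2:                      # Frobenius: 0.5 * ||X - WH||_F^2
        return 0.5 * np.sum((X - WH) ** 2)
    if beta == 1:                      # generalized Kullback-Leibler
        mask = X > 0                   # convention: 0 * log 0 = 0
        return np.sum(X[mask] * np.log(X[mask] / WH[mask])) - X.sum() + WH.sum()
    # generic case, beta not in {0, 1}
    return np.sum(X ** beta + (beta - 1) * WH ** beta
                  - beta * X * WH ** (beta - 1)) / (beta * (beta - 1))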
def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
              n_components, random_state):
    W = W0.copy()
    H = H0.copy()

    clf = clf_type(**clf_params)

    st = time()
    W = clf.fit_transform(X, W=W, H=H)
    end = time()
    H = clf.components_

    this_loss = _beta_divergence(X, W, H, 2.0, True)
    duration = end - st
    return this_loss, duration
def test_loss_decreasing():
    # test that the objective function for at least one of the matrices
    # is decreasing at each iteration
    n_components = 10
    alpha = 0.1
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(20, 15))
    Y = np.abs(rng.randn(15, 10))
    U0, V0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)
    V0_, Z0 = nmf._initialize_nmf(Y, n_components, init='random',
                                  random_state=42)
    V0 = (V0.T + V0_) / 2

    U, V, Z = U0.copy(), V0.copy(), Z0.copy()

    # since Hessian is being perturbed, might not have to work
    # for newton-raphson solver
    for solver in ['mu']:
        previous_x_loss = nmf._beta_divergence(X, U, V.T, 2)
        previous_y_loss = nmf._beta_divergence(Y, V, Z.T, 2)
        for _ in range(30):
            # one more iteration starting from the previous results
            U, V, Z, _ = collective_matrix_factorization(
                X, Y, U, V, Z, x_init='custom', y_init='custom',
                n_components=n_components, max_iter=1, solver=solver,
                tol=tol, verbose=0, random_state=0)

            x_loss = nmf._beta_divergence(X, U, V.T, 2)
            y_loss = nmf._beta_divergence(Y, V, Z.T, 2)
            max_loss_decrease = max(previous_x_loss - x_loss,
                                    previous_y_loss - y_loss)
            assert_greater(max_loss_decrease, 0)
            previous_x_loss = x_loss
            previous_y_loss = y_loss
def score(self, X):
    """Returns the Kullback-Leibler divergence between the n-grams counts
    matrix V of X, and its non-negative factorization HW.

    Parameters
    ----------
    X : array-like (str), shape (n_samples, )
        The data to encode.

    Returns
    -------
    kl_divergence : float.
        The Kullback-Leibler divergence.
    """
    # Build n-grams/word counts matrix
    unq_X, lookup = np.unique(X, return_inverse=True)
    unq_V = self.ngrams_count_.transform(unq_X)
    if self.add_words:
        unq_V2 = self.word_count_.transform(unq_X)
        unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

    self._add_unseen_keys_to_H_dict(unq_X)
    unq_H = self._get_H(unq_X)
    # Given the learnt topics W, optimize the activations H to fit V = HW
    for slice in gen_batches(n=unq_H.shape[0],
                             batch_size=self.batch_size):
        unq_H[slice] = _multiplicative_update_h(
            unq_V[slice], self.W_, unq_H[slice],
            epsilon=1e-3, max_iter=self.max_iter_e_step,
            rescale_W=self.rescale_W,
            gamma_shape_prior=self.gamma_shape_prior,
            gamma_scale_prior=self.gamma_scale_prior)
    # Compute the KL divergence between V and HW
    kl_divergence = _beta_divergence(
        unq_V[lookup], unq_H[lookup], self.W_,
        'kullback-leibler', square_root=False)
    return kl_divergence
def k_fold(run_id, k_folds):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K
    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    if stat.method == "LD":
        if stat.max_iter == 200:
            stat.max_iter = 10
        if stat.max_iter > 100:
            stat.max_iter = 90
    n_samples = stat.max_iter

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex='\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex='\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    tfidf_vectorizer = TfidfVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng, ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )

    count_vectorizer = CountVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng, ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )

    abstracts, docsizes, ids = proc_docs(docs, stoplist, stat.fulltext)
    doc_ids = ids
    random.shuffle(doc_ids)

    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer

    for k in range(k_folds):
        train_set = [i for i, x in enumerate(doc_ids) if i % k_folds != k]
        test_set = [i for i, x in enumerate(doc_ids) if i % k_folds == k]
        X_train = tfidf[train_set, ]
        X_test = tfidf[test_set, ]

        if stat.method == "NM":
            model = NMF(
                n_components=K, random_state=1, alpha=alpha, l1_ratio=.1,
                verbose=False, init='nndsvd', max_iter=n_samples
            ).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(
                X_test, w_test, model.components_,
                'frobenius', square_root=True
            )
        else:
            # fit on the training fold so the error is measured on held-out data
            model = LDA(
                n_components=K, doc_topic_prior=stat.alpha,
                max_iter=stat.max_iter, n_jobs=6
            ).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(
                X_test, w_test, model.components_,
                'frobenius', square_root=True
            )

        kf, created = KFold.objects.get_or_create(
            model=stat, K=k
        )
        kf.error = rec_error
        kf.save()

    return
""" ============================== Beta-divergence loss functions ============================== A plot that compares the various Beta-divergence loss functions supported by the Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`. """ import numpy as np import matplotlib.pyplot as plt from sklearn.decomposition.nmf import _beta_divergence print(__doc__) x = np.linspace(0.001, 4, 1000) y = np.zeros(x.shape) colors = 'mbgyr' for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)): for i, xi in enumerate(x): y[i] = _beta_divergence(1, xi, 1, beta) name = "beta = %1.1f" % beta plt.plot(x, y, label=name, color=colors[j]) plt.xlabel("x") plt.title("beta-divergence(1, x)") plt.legend(loc=0) plt.axis([0, 4, 0, 3]) plt.show()
def score(self, X, y=None):
    H = self.components_
    W = self.transform(X)
    return -_beta_divergence(X, W, H, self.beta_loss, square_root=True)
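Because the square-rooted beta-divergence is non-negative, this score is always at most 0, with values closer to 0 indicating a better reconstruction. Note that transform re-solves for W on the given X, so on the training data the score need not exactly equal the negative of reconstruction_err_.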