def test_nmf_underflow():
    # Regression test for an underflow issue in _beta_divergence
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 10, 2, 2
    X = np.abs(rng.randn(n_samples, n_features)) * 10
    W = np.abs(rng.randn(n_samples, n_components)) * 10
    H = np.abs(rng.randn(n_components, n_features))

    X[0, 0] = 0
    ref = nmf._beta_divergence(X, W, H, beta=1.0)

    X[0, 0] = 1e-323  # subnormal double; should not change the divergence
    res = nmf._beta_divergence(X, W, H, beta=1.0)
    assert_almost_equal(res, ref)
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert previous_loss > loss
                previous_loss = loss
def test_beta_divergence():
    # Compare _beta_divergence with the reference _beta_divergence_dense
    n_samples = 20
    n_features = 10
    n_components = 5
    beta_losses = [0., 0.5, 1., 1.5, 2.]

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.clip(X, 0, None, out=X)
    X_csr = sp.csr_matrix(X)
    W, H = nmf._initialize_nmf(X, n_components, init='random',
                               random_state=42)

    for beta in beta_losses:
        ref = _beta_divergence_dense(X, W, H, beta)
        loss = nmf._beta_divergence(X, W, H, beta)
        loss_csr = nmf._beta_divergence(X_csr, W, H, beta)

        assert_almost_equal(ref, loss, decimal=7)
        assert_almost_equal(ref, loss_csr, decimal=7)
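The dense reference `_beta_divergence_dense` is not shown above. A minimal sketch of what such a reference can look like, assuming the standard element-wise definition of the beta-divergence; the `1e-9` clipping and the handling of zero entries are illustrative choices, not necessarily what the test suite uses:

import numpy as np


def _beta_divergence_dense(X, W, H, beta):
    """Naive element-wise beta-divergence between X and W @ H (dense only)."""
    WH = np.dot(W, H)

    if beta == 2:
        # Frobenius case: half the squared error
        return 0.5 * np.sum((X - WH) ** 2)

    # Restrict the log and power terms to entries where X is non-zero;
    # zero entries of X contribute only through the WH sums below.
    X_nz = X[X != 0]
    WH_nz = np.maximum(WH[X != 0], 1e-9)  # avoid log(0) and division by zero

    if beta == 1:
        # generalized Kullback-Leibler divergence
        return np.sum(X_nz * np.log(X_nz / WH_nz)) - X.sum() + WH.sum()
    if beta == 0:
        # Itakura-Saito divergence
        div = X_nz / WH_nz
        return np.sum(div) - np.sum(np.log(div)) - X.size

    # general beta
    res = np.sum(X_nz ** beta)
    res += (beta - 1) * np.sum(WH ** beta)
    res -= beta * np.sum(X_nz * WH_nz ** (beta - 1))
    return res / (beta * (beta - 1))

Restricting the logarithm and power terms to the non-zero entries of X mirrors how a sparse implementation iterates over stored values only, which is what makes the dense reference comparable to both code paths of `nmf._beta_divergence`.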
def test_nmf_decreasing(solver):
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.0

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5):
        if solver != "mu" and beta_loss != 2:
            # not implemented
            continue
        W, H = W0.copy(), H0.copy()
        previous_loss = None
        for _ in range(30):
            # one more iteration starting from the previous results
            W, H, _ = non_negative_factorization(
                X,
                W,
                H,
                beta_loss=beta_loss,
                init="custom",
                n_components=n_components,
                max_iter=1,
                alpha_W=alpha,
                solver=solver,
                tol=tol,
                l1_ratio=l1_ratio,
                verbose=0,
                random_state=0,
                update_H=True,
            )

            loss = (
                nmf._beta_divergence(X, W, H, beta_loss)
                + alpha * l1_ratio * n_features * W.sum()
                + alpha * l1_ratio * n_samples * H.sum()
                + alpha * (1 - l1_ratio) * n_features * (W**2).sum()
                + alpha * (1 - l1_ratio) * n_samples * (H**2).sum()
            )
            if previous_loss is not None:
                assert previous_loss > loss
            previous_loss = loss
def calc_rec_error(self, df, date_range):
    """
    Calculate reconstruction error.

    For the data of one trading day, take the previous day's NMF model,
    apply its transform method to the data, and calculate the
    reconstruction error.

    Parameters:
        df: pandas DataFrame
            Data for a particular date range, as returned by the
            read_raw_data() method in modules.tweet_data
        date_range: DatetimeIndex
            Dates which serve as the range for fitting the data

    Returns:
        list, list
            List of reconstruction errors for the fitted models
            List of reconstruction errors for the transformed data
    """
    model_err = []
    new_err = []
    for i in range(len(date_range) - 1):
        str_date = str(date_range[i + 1].date())
        prev_str_date = str(date_range[i].date())
        print("Working on : ", str_date, end="\r")

        # Take the portion of df covering one trading day
        sub_df = df[date_range[i]:(date_range[i + 1]
                                   - dt.timedelta(seconds=1))].tweet

        # Tokenize with the spaCy NLP pipe. Disable the tagger, parser
        # and NER components for faster processing.
        sub_df = [
            self.twitter_tokenizer(text)
            for text in nlp.pipe(sub_df, disable=["tagger", "parser", "ner"])
        ]

        # Use the previous day's tf-idf model to transform the data into
        # the tf-idf representation used to fit the NMF model
        tfidf_vecs = self.tfidf_dict[prev_str_date].transform(sub_df)

        # Calculate the reconstruction error using the helper from
        # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/decomposition/_nmf.py
        new_rec_err = _beta_divergence(
            tfidf_vecs,
            self.nmf_dict[prev_str_date].transform(tfidf_vecs),
            self.nmf_dict[prev_str_date].components_,
            'frobenius',
            square_root=True)

        # Reconstruction error from the model fitted on the day itself
        rec_err = self.nmf_dict[str_date].reconstruction_err_

        model_err.append(rec_err)
        new_err.append(new_rec_err)

    print("\nFinished")
    return model_err, new_err
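The same pattern, comparing a model's `reconstruction_err_` on its training data against the `_beta_divergence` of that model applied to unseen data, can be reproduced standalone. A minimal sketch on random non-negative matrices; the shapes and hyperparameters are illustrative only:

import numpy as np
from sklearn.decomposition import NMF
from sklearn.decomposition._nmf import _beta_divergence

rng = np.random.RandomState(0)
X_train = np.abs(rng.randn(100, 50))  # stand-in for yesterday's tf-idf matrix
X_new = np.abs(rng.randn(20, 50))     # stand-in for today's tf-idf matrix

model = NMF(n_components=10, init='nndsvd', max_iter=500).fit(X_train)

# Error of the fitted model on its own training data ...
print(model.reconstruction_err_)

# ... versus the error of the same model applied to new data
W_new = model.transform(X_new)
print(_beta_divergence(X_new, W_new, model.components_,
                       'frobenius', square_root=True))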
def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
              n_components, random_state):
    W = W0.copy()
    H = H0.copy()

    clf = clf_type(**clf_params)
    st = time()
    W = clf.fit_transform(X, W=W, H=H)
    end = time()
    H = clf.components_

    this_loss = _beta_divergence(X, W, H, 2.0, True)
    duration = end - st
    return this_loss, duration
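A hypothetical invocation of `bench_one`, assuming its own dependencies (`time` from the standard library and scikit-learn's private `_beta_divergence`) are already in scope; the estimator, shapes, and parameters below are illustrative only:

import numpy as np
from sklearn.decomposition import NMF
from sklearn.decomposition._nmf import _initialize_nmf

rng = np.random.RandomState(0)
X = np.abs(rng.randn(200, 100))  # random non-negative data
W0, H0 = _initialize_nmf(X, n_components=10, init='nndsvd', random_state=0)

clf_params = dict(n_components=10, init='custom', solver='cd',
                  tol=1e-4, max_iter=200)
loss, duration = bench_one("NMF-CD", X, W0, H0, X.shape, NMF, clf_params,
                           'custom', 10, 0)
print("loss: %r, duration: %.3fs" % (loss, duration))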
""" ============================== Beta-divergence loss functions ============================== A plot that compares the various Beta-divergence loss functions supported by the Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`. """ import numpy as np import matplotlib.pyplot as plt from sklearn.decomposition._nmf import _beta_divergence print(__doc__) x = np.linspace(0.001, 4, 1000) y = np.zeros(x.shape) colors = 'mbgyr' for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)): for i, xi in enumerate(x): y[i] = _beta_divergence(1, xi, 1, beta) name = "beta = %1.1f" % beta plt.plot(x, y, label=name, color=colors[j]) plt.xlabel("x") plt.title("beta-divergence(1, x)") plt.legend(loc=0) plt.axis([0, 4, 0, 3]) plt.show()
def k_fold(run_id, k_folds):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K
    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000  # effectively unlimited
    limit = stat.limit
    ng = stat.ngram

    if stat.method == "LD":
        if stat.max_iter == 200:
            stat.max_iter = 10
        if stat.max_iter > 100:
            stat.max_iter = 90
    n_samples = stat.max_iter

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    tfidf_vectorizer = TfidfVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng, ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )
    count_vectorizer = CountVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng, ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )

    abstracts, docsizes, ids = proc_docs(docs, stoplist, stat.fulltext)

    doc_ids = ids
    random.shuffle(doc_ids)

    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer

    for k in range(k_folds):
        train_set = [i for i, x in enumerate(doc_ids) if i % k_folds != k]
        test_set = [i for i, x in enumerate(doc_ids) if i % k_folds == k]
        X_train = tfidf[train_set, ]
        X_test = tfidf[test_set, ]

        if stat.method == "NM":
            model = NMF(
                n_components=K, random_state=1, alpha=alpha, l1_ratio=.1,
                verbose=False, init='nndsvd', max_iter=n_samples
            ).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(X_test, w_test, model.components_,
                                         'frobenius', square_root=True)
        else:
            model = LDA(
                n_components=K, doc_topic_prior=stat.alpha,
                max_iter=stat.max_iter, n_jobs=6
            ).fit(X_train)  # fit on the training fold, not the held-out fold
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(X_test, w_test, model.components_,
                                         'frobenius', square_root=True)

        kf, created = KFold.objects.get_or_create(model=stat, K=k)
        kf.error = rec_error
        kf.save()
    return
def contrastive_NMF(v, w_init, h_init, h_tilde, delta=0, mu=0, beta=0,
                    n_iter=100, nr_src=2):
    """
    Parameters
    ----------
    v: [array of shape (F, N)]
        magnitude spectrogram of the mixture
    w_init: [array of shape (F, K)]
        initialization of the dictionary w
    h_init: [array of shape (K, N)]
        initialization of the activations h
    h_tilde: [array of shape (K1, N)]
        activations of the source to enhance
    delta: [float > 0]
        weight of the contrastive term
    mu: [float > 0]
        weight of the l1 regularizer on h
    beta: [float > 0]
        weight of the l1 regularizer on w
    n_iter: [int > 0]
        number of NMF iterations
    nr_src: [int > 0]
        number of sources in the mixture

    Returns
    -------
    The dictionary w and the corresponding activations h resulting from
    the factorization of v, and a list containing the total cost at each
    iteration.
    """
    flr = 1e-9
    cost = []

    # initial values
    x = v.copy()
    w = w_init.copy()
    h = h_init.copy()

    # avoid too small values
    x[x <= flr] = flr
    w[w <= flr] = flr
    h[h <= flr] = flr

    # normalize h_tilde
    hn_tilde = np.sqrt(np.sum(h_tilde ** 2, axis=1))
    h_tilde = h_tilde / hn_tilde[:, None]

    # normalize h and rescale w
    hn = np.sqrt(np.sum(h ** 2, axis=1))
    h = h / hn[:, None]
    w = w * hn[None, :]

    # NMF iterations
    for i in range(n_iter):
        # update H
        WH = np.maximum(w @ h, flr)
        h, contrast = update_h(w, h, x, h_tilde, WH, delta, mu, flr, nr_src)
        h[h <= flr] = flr

        # normalize h and rescale w
        hn = np.sqrt(np.sum(h ** 2, axis=1))
        h = h / hn[:, None]
        w = w * hn[None, :]

        # update W
        WH = np.maximum(w @ h, flr)
        w = update_w(w, h, x, WH, beta, flr)
        w[w <= 0] = flr

        # keep track of the cost
        cost.append(_beta_divergence(x, w, h, 'kullback-leibler',
                                     square_root=True)
                    - delta * contrast
                    + mu * np.linalg.norm(h)
                    + beta * np.linalg.norm(w))

    return w, h, cost
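A hypothetical call on random data, assuming `numpy` is imported as `np`, `_beta_divergence` comes from `sklearn.decomposition._nmf`, and the `update_h`/`update_w` helpers used inside the loop are defined alongside `contrastive_NMF`. The shapes are illustrative, with K1 taken here as K // nr_src:

import numpy as np

rng = np.random.RandomState(0)
F, N, K = 64, 200, 8  # frequency bins, time frames, NMF components

v = np.abs(rng.randn(F, N))             # mixture magnitude spectrogram
w_init = np.abs(rng.randn(F, K))        # dictionary initialization
h_init = np.abs(rng.randn(K, N))        # activation initialization
h_tilde = np.abs(rng.randn(K // 2, N))  # activations of the source to enhance

w, h, cost = contrastive_NMF(v, w_init, h_init, h_tilde,
                             delta=0.1, mu=0.01, beta=0.01, n_iter=50)
print("final cost:", cost[-1])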