Example #1
def test_nmf_underflow():
    # Regression test for an underflow issue in _beta_divergence
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 10, 2, 2
    X = np.abs(rng.randn(n_samples, n_features)) * 10
    W = np.abs(rng.randn(n_samples, n_components)) * 10
    H = np.abs(rng.randn(n_components, n_features))

    X[0, 0] = 0
    ref = nmf._beta_divergence(X, W, H, beta=1.0)
    X[0, 0] = 1e-323
    res = nmf._beta_divergence(X, W, H, beta=1.0)
    assert_almost_equal(res, ref)
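
The assertion holds because, for beta=1 (the generalized Kullback-Leibler loss), an entry of X that is zero or denormally small contributes essentially nothing to the divergence. A minimal dense sketch of that loss, for illustration only (this is not scikit-learn's implementation, which also handles sparse input):

import numpy as np

def kl_divergence_dense(X, W, H, eps=np.finfo(np.float64).eps):
    # Generalized Kullback-Leibler divergence D(X || WH), written densely:
    #     sum(X * log(X / WH) - X + WH)
    # Entries of X at or below machine epsilon are treated as exact zeros
    # (the limit x*log(x) -> 0), so a denormal value such as 1e-323 gives
    # numerically the same result as 0 -- which is what the test asserts.
    WH = W @ H
    mask = X > eps
    div = np.sum(X[mask] * np.log(X[mask] / WH[mask]))
    return div - X.sum() + WH.sum()

For the arrays used in the test this should agree with nmf._beta_divergence(X, W, H, beta=1.0) up to floating-point error.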
Example #3
def compute_factorization_error(target, left_factor, right_factor, link, beta_loss):
    if target is None:
        return 0
    elif link == "linear":
        return _beta_divergence(target, left_factor, right_factor, beta_loss, square_root=True)
    elif link == "logit":
        return np.linalg.norm(target - sigmoid(np.dot(left_factor, right_factor)))
Example #4
    def fit_transform(self, X, y=None, W=None, H=None):
        """Learn a NMF model for the data X and returns the transformed data.
        This is more efficient than calling fit followed by transform.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Data matrix to be decomposed
        y : Ignored
        W : array-like, shape (n_samples, n_components)
            If init='custom', it is used as initial guess for the solution.
        H : array-like, shape (n_components, n_features)
            If init='custom', it is used as initial guess for the solution.
        Returns
        -------
        W : array, shape (n_samples, n_components)
            Transformed data.
        """
        X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)

        W, H, n_iter_ = non_negative_factorization(
            X=X, W=W, H=H, n_components=self.n_components, init=self.init,
            update_H=True, solver=self.solver, beta_loss=self.beta_loss,
            tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
            l1_ratio=self.l1_ratio, regularization='both',
            random_state=self.random_state, verbose=self.verbose,
            shuffle=self.shuffle, distribution=self.distribution, N=self.N, D=self.D)

        self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss,
                                                    square_root=True)

        self.n_components_ = H.shape[0]
        self.components_ = H
        self.n_iter_ = n_iter_

        return W
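
The distribution, N and D keyword arguments come from the project this snippet was taken from; with stock scikit-learn, the same init='custom' warm-start pattern looks roughly like the sketch below (shapes and values are arbitrary):

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X = np.abs(rng.randn(10, 4))
W0 = np.abs(rng.randn(10, 2))   # initial guess for the transformed data
H0 = np.abs(rng.randn(2, 4))    # initial guess for the components

# init='custom' makes fit_transform start from the supplied W and H.
model = NMF(n_components=2, init='custom', max_iter=500)
W = model.fit_transform(X, W=W0, H=H0)
H = model.components_
print(model.reconstruction_err_)  # Frobenius norm of X - WH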
Example #5
    def nmf_prepare_tag(self, data_preprocessed, no_topics=32):
        '''
        Prepare the NMF model, topic-keyword dataframe and tf-idf vectorizer from the preprocessed data.
        '''
        from sklearn.decomposition.nmf import _beta_divergence
        documents = data_preprocessed.unique()[0:self.precision]
        
        nmf_tfidf, nmf_tfidf_vectorizer = self.nmf_init(documents)
        nmf = self.nmf_train(nmf_tfidf, no_topics)
        
        print('original reconstruction error automatically calculated -> TRAIN: ', nmf.reconstruction_err_)

        """ Manual reconstruction_err_ calculation
            -> use transform to get W
            -> ask fitted NMF to get H
            -> use available _beta_divergence-function to calculate desired metric
        """
        W_train = nmf.transform(nmf_tfidf)
        rec_error = _beta_divergence(nmf_tfidf, W_train, nmf.components_, 'frobenius', square_root=True)
        print('Manually calculated rec-error train: ', rec_error)

        nmf_topicnames = ["Topic" + str(i) for i in range(nmf.n_components)]

        # Topic-Keyword Matrix
        nmf_df_topic_keyword = self.pd.DataFrame(nmf.components_)

        # Assign Column and Index
        nmf_df_topic_keyword.columns = nmf_tfidf_vectorizer.get_feature_names()
        nmf_df_topic_keyword.index = nmf_topicnames
        return nmf, nmf_df_topic_keyword, nmf_tfidf_vectorizer
Example #6
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert_greater(previous_loss, loss)
                previous_loss = loss
Example #7
    def score(self, X):
        '''
        Returns the Kullback-Leibler divergence.

        Parameters
        ----------
        X : array-like (str), shape [n_samples,]
            The data to encode.

        Returns
        -------
        kl_divergence : float
            The Kullback-Leibler divergence.
        '''

        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count.transform(unq_X)
        if self.add_words:
            unq_V2 = self.word_count.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        for slice in gen_batches(n=unq_H.shape[0],
                                 batch_size=self.batch_size):
            unq_H[slice] = _multiplicative_update_h(
                unq_V[slice], self.W_, unq_H[slice],
                epsilon=1e-3, max_iter=self.max_iter_e_step,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        kl_divergence = _beta_divergence(
            unq_V[lookup], unq_H[lookup], self.W_,
            'kullback-leibler', square_root=False)
        return kl_divergence
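
This score() method (and its more documented variant in Example #14 below) relies on an indexing trick: np.unique(X, return_inverse=True) encodes each distinct input string once, and unq_V[lookup] / unq_H[lookup] later expand the per-unique-value rows back to one row per sample. A tiny standalone illustration:

import numpy as np

X = np.array(['cat', 'dog', 'cat', 'bird', 'dog'])
unq_X, lookup = np.unique(X, return_inverse=True)
print(unq_X)          # ['bird' 'cat' 'dog']  (sorted unique values)
print(lookup)         # [1 2 1 0 2]           (position of each sample in unq_X)
print(unq_X[lookup])  # ['cat' 'dog' 'cat' 'bird' 'dog']  -- reconstructs X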
Example #9
def test_beta_divergence():
    # Compare _beta_divergence with the reference _beta_divergence_dense
    n_samples = 20
    n_features = 10
    n_components = 5
    beta_losses = [0., 0.5, 1., 1.5, 2.]

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.clip(X, 0, None, out=X)
    X_csr = sp.csr_matrix(X)
    W, H = nmf._initialize_nmf(X, n_components, init='random', random_state=42)

    for beta in beta_losses:
        ref = _beta_divergence_dense(X, W, H, beta)
        loss = nmf._beta_divergence(X, W, H, beta)
        loss_csr = nmf._beta_divergence(X_csr, W, H, beta)

        assert_almost_equal(ref, loss, decimal=7)
        assert_almost_equal(ref, loss_csr, decimal=7)
Example #11
def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
              n_components, random_state):
    W = W0.copy()
    H = H0.copy()

    clf = clf_type(**clf_params)
    st = time()
    W = clf.fit_transform(X, W=W, H=H)
    end = time()
    H = clf.components_

    this_loss = _beta_divergence(X, W, H, 2.0, True)
    duration = end - st
    return this_loss, duration
Example #13
def test_loss_decreasing():
    # test that the objective function for at least one of the matrices is decreasing
    n_components = 10
    alpha = 0.1
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(20, 15))
    Y = np.abs(rng.randn(15, 10))
    U0, V0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)
    V0_, Z0 = nmf._initialize_nmf(Y, n_components, init='random',
                                  random_state=42)
    V0 = (V0.T + V0_) / 2

    U, V, Z = U0.copy(), V0.copy(), Z0.copy()

    # the Hessian is perturbed in the newton-raphson solver, so the decreasing-loss
    # check is only expected to hold for 'mu'
    for solver in ['mu']:

        previous_x_loss = nmf._beta_divergence(X, U, V.T, 2)
        previous_y_loss = nmf._beta_divergence(Y, V, Z.T, 2)
        for _ in range(30):
            # one more iteration starting from the previous results
            U, V, Z, _ = collective_matrix_factorization(
                X, Y, U, V, Z, x_init='custom', y_init='custom',
                n_components=n_components, max_iter=1,
                solver=solver, tol=tol, verbose=0, random_state=0)

            x_loss = nmf._beta_divergence(X, U, V.T, 2)
            y_loss = nmf._beta_divergence(Y, V, Z.T, 2)
            max_loss_decrease = max(previous_x_loss - x_loss, previous_y_loss - y_loss)
            assert_greater(max_loss_decrease, 0)
            previous_x_loss = x_loss
            previous_y_loss = y_loss
Example #14
    def score(self, X):
        """
        Returns the Kullback-Leibler divergence between the n-grams counts
        matrix V of X, and its non-negative factorization HW.

        Parameters
        ----------
        X : array-like (str), shape (n_samples, )
            The data to encode.

        Returns
        -------
        kl_divergence : float.
            The Kullback-Leibler divergence.
        """
        # Build n-grams/word counts matrix
        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count_.transform(unq_X)
        if self.add_words:
            unq_V2 = self.word_count_.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        # Given the learnt topics W, optimize the activations H to fit V = HW
        for slice in gen_batches(n=unq_H.shape[0], batch_size=self.batch_size):
            unq_H[slice] = _multiplicative_update_h(
                unq_V[slice],
                self.W_,
                unq_H[slice],
                epsilon=1e-3,
                max_iter=self.max_iter_e_step,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        # Compute the KL divergence between V and HW
        kl_divergence = _beta_divergence(unq_V[lookup],
                                         unq_H[lookup],
                                         self.W_,
                                         'kullback-leibler',
                                         square_root=False)
        return kl_divergence
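
For the 'kullback-leibler' loss used here, _beta_divergence evaluates the generalized (unnormalized) KL divergence between the count matrix V and its reconstruction HW:

\[
D_{\mathrm{KL}}(V \,\|\, HW) \;=\; \sum_{i,j} \Big( V_{ij} \log \frac{V_{ij}}{(HW)_{ij}} \;-\; V_{ij} \;+\; (HW)_{ij} \Big),
\]

with the convention that terms with V_{ij} = 0 reduce to (HW)_{ij}.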
Example #15
def k_fold(run_id,k_folds):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K
    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    if stat.method=="LD":
        if stat.max_iter == 200:
            stat.max_iter = 10
        if stat.max_iter > 100:
            stat.max_iter = 90

    n_samples = stat.max_iter  # reused below as the NMF max_iter

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]


    tfidf_vectorizer = TfidfVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng,ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )

    count_vectorizer = CountVectorizer(
        max_df=stat.max_df,
        min_df=stat.min_freq,
        max_features=n_features,
        ngram_range=(ng,ng),
        tokenizer=snowball_stemmer(),
        stop_words=stoplist
    )

    abstracts, docsizes, ids = proc_docs(docs, stoplist, stat.fulltext)

    doc_ids = ids
    random.shuffle(doc_ids)

    if stat.method=="NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer

    for k in range(k_folds):
        train_set = [i for i, x in enumerate(doc_ids) if i % k_folds != k]
        test_set = [i for i, x in enumerate(doc_ids) if i % k_folds == k]

        X_train = tfidf[train_set, :]
        X_test = tfidf[test_set, :]

        if stat.method=="NM":
            model = NMF(
                n_components=K, random_state=1,
                alpha=alpha, l1_ratio=.1, verbose=False,
                init='nndsvd', max_iter=n_samples
            ).fit(X_train)
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(
                X_test,
                w_test,
                model.components_,
                'frobenius',
                square_root=True
            )

        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                max_iter=stat.max_iter,
                n_jobs=6
            ).fit(X_train)  # fit on the training folds, as in the NMF branch above
            w_test = model.transform(X_test)
            rec_error = _beta_divergence(
                X_test,
                w_test,
                model.components_,
                'frobenius',
                square_root=True
            )
        kf, created = KFold.objects.get_or_create(
            model=stat,
            K=k
        )
        kf.error = rec_error
        kf.save()

    return
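
The held-out reconstruction-error idea used above can be sketched without the Django models, using plain scikit-learn objects (sklearn.model_selection.KFold here is the splitter, unrelated to the Django KFold model above; the private _beta_divergence import path follows the other examples on this page, newer releases keep it in sklearn.decomposition._nmf):

from sklearn.decomposition import NMF
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.model_selection import KFold

def nmf_cv_errors(tfidf, n_components=10, n_splits=5):
    # Frobenius reconstruction error of the held-out rows, one value per fold.
    errors = []
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    for train_idx, test_idx in splitter.split(tfidf):
        model = NMF(n_components=n_components, init='nndsvd',
                    random_state=1).fit(tfidf[train_idx])
        W_test = model.transform(tfidf[test_idx])
        errors.append(_beta_divergence(tfidf[test_idx], W_test,
                                       model.components_,
                                       'frobenius', square_root=True))
    return errors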
Example #16
"""
==============================
Beta-divergence loss functions
==============================

A plot that compares the various Beta-divergence loss functions supported by
the Multiplicative-Update ('mu') solver in :class:`sklearn.decomposition.NMF`.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition.nmf import _beta_divergence

print(__doc__)

x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)

colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
    for i, xi in enumerate(x):
        y[i] = _beta_divergence(1, xi, 1, beta)
    name = "beta = %1.1f" % beta
    plt.plot(x, y, label=name, color=colors[j])

plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])
plt.show()
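
For reference, the element-wise beta-divergence plotted above follows the standard definition (the quantity that _beta_divergence sums over all matrix entries):

\[
d_\beta(x \mid y) \;=\;
\begin{cases}
\dfrac{x^\beta + (\beta - 1)\, y^\beta - \beta\, x\, y^{\beta - 1}}{\beta\,(\beta - 1)}, & \beta \notin \{0, 1\}, \\[2ex]
x \log \dfrac{x}{y} - x + y, & \beta = 1 \ \text{(generalized Kullback-Leibler)}, \\[2ex]
\dfrac{x}{y} - \log \dfrac{x}{y} - 1, & \beta = 0 \ \text{(Itakura-Saito)},
\end{cases}
\]

so beta = 2 reduces to the half squared error (x - y)^2 / 2, i.e. the Frobenius loss.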
Example #17
    def score(self, X, y=None):
        H = self.components_
        W = self.transform(X)
        return -_beta_divergence(X, W, H, self.beta_loss, square_root=True)
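
Returning the negative divergence makes larger scores mean better reconstructions, so an estimator with this score() can be plugged into scikit-learn's model-selection tools. A hypothetical, self-contained sketch (ScoredNMF is an illustrative subclass, not a name from the snippet; the private import path matches Example #16 above, newer releases keep it in sklearn.decomposition._nmf):

import numpy as np
from sklearn.decomposition import NMF
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.model_selection import GridSearchCV

class ScoredNMF(NMF):
    # Adds the score() pattern from the snippet above, so GridSearchCV
    # (which maximizes score) prefers the rank with the smallest divergence.
    def score(self, X, y=None):
        W = self.transform(X)
        return -_beta_divergence(X, W, self.components_, self.beta_loss,
                                 square_root=True)

X_counts = np.abs(np.random.RandomState(0).randn(50, 20))
search = GridSearchCV(ScoredNMF(init='nndsvda', max_iter=300),
                      param_grid={'n_components': [5, 10, 15]}, cv=3)
search.fit(X_counts)
print(search.best_params_)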
Example #18
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition.nmf import _beta_divergence

# nodebox section (truncated in this snippet): in the original script an
# if-branch defines a nodebox-specific pltshow() that saves the figure to a
# temporary image and displays it through nodebox; outside nodebox, pltshow()
# simply falls back to matplotlib's show().
def pltshow(mplpyplot):
    mplpyplot.show()
# nodebox section end


x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)

colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
    for i, xi in enumerate(x):
        y[i] = _beta_divergence(1, xi, 1, beta)
    name = "beta = %1.1f" % beta
    plt.plot(x, y, label=name, color=colors[j])

plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])
# plt.show()
pltshow(plt)