Example #1
    def vi(self, i, ids, cts, words_no, expElogbetad, no_iter=1000):
        alpha = self.G_0.G_0 * self.m_gamma

        gamma = np.ones(len(alpha))
        expElogtheta = np.exp(dirichlet_expectation(gamma))

        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        counts = np.array(cts)
        for _ in range(no_iter):
            lastgamma = gamma

            gamma = alpha + expElogtheta * np.dot(counts / phinorm,
                                                  expElogbetad.T)
            expElogtheta = np.exp(dirichlet_expectation(gamma))

            phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
            meanchange = mean_absolute_difference(gamma, lastgamma)
            if meanchange < meanchangethresh:
                break

        pro_mat = np.outer(expElogtheta.T, 1 / phinorm) * expElogbetad

        mat_z = my_multinomial(pro_mat)

        self.mat_z[i][self.effe_list] = mat_z
        self.mat_z_sum[i][self.effe_list] = np.dot(mat_z, cts)
Example #2
    def testDirichletExpectation(self):
        # test dirichlet_expectation
        rs = self.random_state

        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                # 1 dimensional case
                input_1d = rs.uniform(.01, 10000, size=(self.num_topics, ))
                known_good = dirichlet_expectation(input_1d)
                test_values = matutils.dirichlet_expectation(input_1d)

                msg = "dirichlet_expectation_1d failed for dtype={}".format(
                    dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)

                # 2 dimensional case
                input_2d = rs.uniform(.01, 10000, size=(
                    1,
                    self.num_topics,
                ))
                known_good = dirichlet_expectation(input_2d)
                test_values = matutils.dirichlet_expectation(input_2d)

                msg = "dirichlet_expectation_2d failed for dtype={}".format(
                    dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)
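For context, the quantity these tests exercise is the Dirichlet expectation E[log theta_k] = psi(alpha_k) - psi(sum_j alpha_j). Below is a minimal NumPy sketch of that definition, intended only as a reference point for the tests above, not as gensim's optimized matutils implementation:

import numpy as np
from scipy.special import psi  # digamma

def dirichlet_expectation_reference(alpha):
    # E[log theta] for theta ~ Dirichlet(alpha); handles the 1-d and 2-d
    # inputs used in the tests above.
    alpha = np.asarray(alpha, dtype=float)
    if alpha.ndim == 1:
        return psi(alpha) - psi(np.sum(alpha))
    return psi(alpha) - psi(np.sum(alpha, axis=1))[:, np.newaxis]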
Example #3
    def bound(self, corpus, gamma=None, subsample_ratio=1.0):
        """
        Estimate the variational bound of documents from `corpus`:
        E_q[log p(corpus)] - E_q[log q(corpus)]

        `gamma` are the variational parameters on topic weights for each `corpus`
        document (=2d matrix=what comes out of `inference()`).
        If not supplied, will be inferred from the model.

        """
        score = 0.0
        _lambda = self.state.get_lambda()
        Elogbeta = dirichlet_expectation(_lambda)

        for d, doc in enumerate(
                corpus
        ):  # stream the input doc-by-doc, in case it's too large to fit in RAM
            if d % self.chunksize == 0:
                logger.debug("bound: at document #%i", d)
            if gamma is None:
                gammad, _ = self.inference([doc])
            else:
                gammad = gamma[d]
            Elogthetad = dirichlet_expectation(gammad)

            # E[log p(doc | theta, beta)]
            score += np.sum(cnt *
                            logsumexp(Elogthetad + Elogbeta[:, int(id)])
                            for id, cnt in doc)

            # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
            score += np.sum((self.alpha - gammad) * Elogthetad)
            score += np.sum(gammaln(gammad) - gammaln(self.alpha))
            score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad))

        # Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures
        # that the likelihood is always roughly on the same scale.
        score *= subsample_ratio

        # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
        score += np.sum((self.eta - _lambda) * Elogbeta)
        score += np.sum(gammaln(_lambda) - gammaln(self.eta))

        if np.ndim(self.eta) == 0:
            sum_eta = self.eta * self.num_terms
        else:
            sum_eta = np.sum(self.eta)

        score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

        return score
Example #4
    def delete_empty_list(self, delete_list):
        delete_no = np.sum(delete_list)

        delete_list2 = self.effe_list[delete_list]

        if delete_no != 0:
            self.m_K -= delete_no
            self.effe_list = self.effe_list[np.logical_not(delete_list)]

            self.m_lambda[delete_list2] = np.zeros_like(
                self.m_lambda[delete_list2]) * self.m_lambda[0, 0]
            self.m_dir_exp_lambda[delete_list2] = np.exp(
                dirichlet_expectation(self.m_lambda[delete_list2] +
                                      self.m_beta))

            self.mat_phi[delete_list2] = np.zeros(
                (delete_no, self.chunk_doc_no))

            self.G_0.G_0 = self.G_0.G_0[np.logical_not(delete_list)]
            self.G_0.m_K = int(len(self.G_0.G_0) - 1)

            self.G_0.G_0[0] = 1 - np.sum(self.G_0.G_0[1:])

            for i in range(self.chunk_doc_no):
                self.mat_z[i][delete_list2] = np.zeros_like(
                    self.mat_z[i][delete_list2])
                self.mat_z_avrg[i][delete_list2] = np.zeros_like(
                    self.mat_z_avrg[i][delete_list2])
Example #5
    def get_initial(self):
        self.G_0 = Global_Prior(self.m_alpha, self.m_K, self.m_gamma, self.m_D,
                                self.m_W, self.chunksize)
        self.effe_list = np.arange(self.m_K + 1)
        self.m_lambda = np.zeros((self.max_K + 1, self.m_W))
        self.m_dir_exp_lambda = np.exp(
            dirichlet_expectation(self.m_lambda + self.m_beta))
Example #6
    def bound(self, corpus, gamma=None, subsample_ratio=1.0):
        """
        Estimate the variational bound of documents from `corpus`:
        E_q[log p(corpus)] - E_q[log q(corpus)]

        `gamma` are the variational parameters on topic weights for each `corpus`
        document (=2d matrix=what comes out of `inference()`).
        If not supplied, will be inferred from the model.

        """
        score = 0.0
        _lambda = self.state.get_lambda()
        Elogbeta = dirichlet_expectation(_lambda)

        for d, doc in enumerate(corpus):  # stream the input doc-by-doc, in case it's too large to fit in RAM
            if d % self.chunksize == 0:
                logger.debug("bound: at document #%i", d)
            if gamma is None:
                gammad, _ = self.inference([doc])
            else:
                gammad = gamma[d]
            Elogthetad = dirichlet_expectation(gammad)

            # E[log p(doc | theta, beta)]
            score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)

            # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
            score += np.sum((self.alpha - gammad) * Elogthetad)
            score += np.sum(gammaln(gammad) - gammaln(self.alpha))
            score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad))

        # Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures
        # that the likelihood is always roughly on the same scale.
        score *= subsample_ratio

        # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
        score += np.sum((self.eta - _lambda) * Elogbeta)
        score += np.sum(gammaln(_lambda) - gammaln(self.eta))

        if np.ndim(self.eta) == 0:
            sum_eta = self.eta * self.num_terms
        else:
            sum_eta = np.sum(self.eta)

        score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

        return score
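As a usage note, the bound computed above is often reported per word; a short sketch of that calculation, assuming `model` is a trained LdaModel and `corpus` a bag-of-words corpus (the same recipe the author-topic `bound` docstring shows further below):

import numpy as np

corpus_words = sum(cnt for document in corpus for _, cnt in document)
per_word_bound = model.bound(corpus) / corpus_words  # average variational bound per token, in nats
perplexity_estimate = np.exp(-per_word_bound)        # pessimistic perplexity estimate derived from the bound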
Example #7
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    """Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : list of int
        IDs of the words that occur in the document.
    doc_word_counts : list of int
        Corresponding counts of those words in the document.
    alpha : numpy.ndarray
        LDA document-topic prior (one weight per topic).
    beta : numpy.ndarray
        LDA topic-word distributions, shape (num_topics, vocabulary_size).
    max_iter : int, optional
        Maximum number of E-step iterations per document.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\\gamma`).

    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if meanchange < meanchangethresh:
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
Example #8
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    r"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.

    Parameters
    ----------
    doc_word_ids : list of int
        IDs of the words that occur in the document.
    doc_word_counts : list of int
        Corresponding counts of those words in the document.
    alpha : numpy.ndarray
        LDA document-topic prior (one weight per topic).
    beta : numpy.ndarray
        LDA topic-word distributions, shape (num_topics, vocabulary_size).
    max_iter : int, optional
        Maximum number of E-step iterations per document.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Computed (:math:`likelihood`, :math:`\gamma`).

    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in range(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return likelihood, gamma
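A toy invocation sketch for the function above, with hypothetical shapes and values; it assumes the module-level helpers referenced in the body (`dirichlet_expectation`, `gammaln`, `mean_absolute_difference`, `meanchangethresh`) are in scope:

import numpy as np

num_topics, vocab_size = 3, 5
alpha = np.full(num_topics, 1.0 / num_topics)                # symmetric document-topic prior
beta = np.random.dirichlet(np.ones(vocab_size), num_topics)  # topic-word matrix, shape (3, 5)

doc_word_ids = [0, 3]     # the document contains vocabulary items 0 and 3 ...
doc_word_counts = [2, 1]  # ... with counts 2 and 1

likelihood, gamma = lda_e_step(doc_word_ids, doc_word_counts, alpha, beta)
topic_proportions = gamma / gamma.sum()  # normalized document-topic weights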
Example #9
    def update_eta(self, lambdat, rho):
        """
        Update parameters for the Dirichlet prior on the per-topic
        word weights `eta` given the last `lambdat`.
        """
        N = float(lambdat.shape[0])
        logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))

        self.eta = update_dir_prior(self.eta, N, logphat, rho)

        return self.eta
Example #10
    def update_eta(self, lambdat, rho):
        """
        Update parameters for the Dirichlet prior on the per-topic
        word weights `eta` given the last `lambdat`.
        """
        N = float(lambdat.shape[0])
        logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))

        self.eta = update_dir_prior(self.eta, N, logphat, rho)

        return self.eta
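For reference, the `update_dir_prior` call used above performs a damped Newton step for the Dirichlet maximum-likelihood update (following Huang, "Maximum Likelihood Estimation of Dirichlet Distribution Parameters"). A sketch of that step is shown below; it mirrors the usual formulation and may differ in details from the library's actual implementation:

import numpy as np
from scipy.special import psi, polygamma

def update_dir_prior_sketch(prior, N, logphat, rho):
    # One Newton step on the Dirichlet log-likelihood, given the averaged
    # sufficient statistic logphat (mean of E[log theta] or E[log beta]).
    gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)
    c = N * polygamma(1, np.sum(prior))
    q = -N * polygamma(1, prior)
    b = np.sum(gradf / q) / (1.0 / c + np.sum(1.0 / q))
    dprior = -(gradf - b) / q
    if np.all(rho * dprior + prior > 0):  # accept the step only if the prior stays positive
        prior = prior + rho * dprior
    return prior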
Example #11
def lda_e_step(ids, cts, alpha, expElogbetad, max_iter=1000):
    """
    the function to update global parameters
    """
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))

    phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
    counts = np.array(cts)
    for _ in range(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, expElogbetad.T)
        expElogtheta = np.exp(dirichlet_expectation(gamma))

        phinorm = np.dot(expElogtheta, expElogbetad) + 1e-100
        meanchange = mean_absolute_difference(gamma, lastgamma)
        if meanchange < meanchangethresh:
            break

    return gamma / np.sum(gamma)
Example #12
    def testDirichletExpectation(self):
        # test dirichlet_expectation
        rs = self.random_state

        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                # 1 dimensional case
                input_1d = rs.uniform(.01, 10000, size=(self.num_topics,))
                known_good = dirichlet_expectation(input_1d)
                test_values = matutils.dirichlet_expectation(input_1d)

                msg = "dirichlet_expectation_1d failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)

                # 2 dimensional case
                input_2d = rs.uniform(.01, 10000, size=(1, self.num_topics,))
                known_good = dirichlet_expectation(input_2d)
                test_values = matutils.dirichlet_expectation(input_2d)

                msg = "dirichlet_expectation_2d failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)
Example #13
    def update_alpha(self, gammat, rho):
        """
        Update parameters for the Dirichlet prior on the per-document
        topic weights `alpha` given the last `gammat`.
        """
        N = float(len(gammat))
        logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N

        self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
        logger.info("optimized alpha %s", list(self.alpha))

        return self.alpha
Example #14
    def update_alpha(self, gammat, rho):
        """
        Update parameters for the Dirichlet prior on the per-document
        topic weights `alpha` given the last `gammat`.
        """
        N = float(len(gammat))
        logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N

        self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
        logger.info("optimized alpha %s", list(self.alpha))

        return self.alpha
Example #15
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if (meanchange < meanchangethresh):
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return (likelihood, gamma)
Example #16
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if (meanchange < meanchangethresh):
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(gammaln(gamma) - gammaln(alpha))
    likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma))

    return (likelihood, gamma)
Example #17
    def update_lambda(self, rhot):
        self.m_lambda[self.effe_list] -= rhot * (self.m_lambda[self.effe_list])

        for i in range(self.chunk_doc_no):
            ids = self.chunk_doc_word_ids_list[i]
            cts = self.chunk_doc_word_counts_list[i]
            self.m_lambda[np.ix_(
                self.effe_list,
                ids)] += rhot * (self.m_D / self.chunksize) * (np.tile(
                    cts,
                    (self.m_K + 1, 1)) * self.mat_z_avrg[i][self.effe_list])

        self.m_dir_exp_lambda[self.effe_list] = np.exp(
            dirichlet_expectation(self.m_lambda[self.effe_list] + self.m_beta))
Example #18
    def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
        """
        Estimate the variational bound of documents from `corpus`:
        E_q[log p(corpus)] - E_q[log q(corpus)]

        There are basically two use cases of this method:
        1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
        indicating the indexes of the documents in the training corpus.
        2. `chunk` is a test set (held-out data), and author2doc and doc2author
        corresponding to this test set are provided. There must not be any new authors
        passed to this method. `chunk_doc_idx` is not needed in this case.

        To obtain the per-word bound, compute:

        >>> corpus_words = sum(cnt for document in corpus for _, cnt in document)
        >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words

        """

        # TODO: enable evaluation of documents with new authors. One could, for example, make it
        # possible to pass a list of documents to self.inference with no author dictionaries,
        # assuming all the documents correspond to one (unseen) author, learn the author's
        # gamma, and return gamma (without adding it to self.state.gamma). Of course,
        # collect_sstats should be set to false, so that the model is not updated w.r.t. these
        # new documents.

        _lambda = self.state.get_lambda()
        Elogbeta = dirichlet_expectation(_lambda)
        expElogbeta = np.exp(Elogbeta)

        gamma = self.state.gamma

        if author2doc is None and doc2author is None:
            # Evaluating on training documents (chunk of self.corpus).
            author2doc = self.author2doc
            doc2author = self.doc2author

            if not chunk_doc_idx:
                # If author2doc and doc2author are not provided, chunk is assumed to be a subset of
                # self.corpus, and chunk_doc_idx is thus required.
                raise ValueError('Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.')
        elif author2doc is not None and doc2author is not None:
            # Training on held-out documents (documents not seen during training).
            # All authors in dictionaries must still be seen during training.
            for a in author2doc.keys():
                if not self.author2doc.get(a):
                    raise ValueError('bound cannot be called with authors not seen during training.')

            if chunk_doc_idx:
                raise ValueError('Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.')
        else:
            raise ValueError('Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.')

        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)

        word_score = 0.0
        theta_score = 0.0
        for d, doc in enumerate(chunk):
            if chunk_doc_idx:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
            ids = np.array([id for id, _ in doc])  # Word IDs in doc.
            cts = np.array([cnt for _, cnt in doc])  # Word counts.

            if d % self.chunksize == 0:
                logger.debug("bound: at document #%i in chunk", d)

            # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
            # is the same computation as in normalizing phi.
            phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
            word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))

        # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
        # that the likelihood is always roughly on the same scale.
        word_score *= subsample_ratio

        # E[log p(theta | alpha) - log q(theta | gamma)]
        for a in author2doc.keys():
            a = self.author2id[a]
            theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
            theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
            theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))

        # theta_score is rescaled in a similar fashion.
        # TODO: treat this in a more general way, similar to how it is done with word_score.
        theta_score *= self.num_authors / len(author2doc)

        # E[log p(beta | eta) - log q (beta | lambda)]
        beta_score = 0.0
        beta_score += np.sum((self.eta - _lambda) * Elogbeta)
        beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
        sum_eta = np.sum(self.eta)
        beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

        total_score = word_score + theta_score + beta_score

        return total_score
Example #19
    def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
        """
        Given a chunk of sparse document vectors, update gamma (parameters
        controlling the topic weights) for each author corresponding to the
        documents in the chunk.

        The whole input chunk of documents is assumed to fit in RAM; chunking of
        a large corpus must be done earlier in the pipeline.

        If `collect_sstats` is True, also collect sufficient statistics needed
        to update the model's topic-word distributions, and return a 2-tuple
        `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`.
        `gamma_chunk` is of shape `len(chunk_authors) x self.num_topics`, where
        `chunk_authors` is the number of authors in the documents in the
        current chunk.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

        """
        try:
            len(chunk)
        except TypeError:
            # convert iterators/generators to plain list, so we have len() etc.
            chunk = list(chunk)
        if len(chunk) > 1:
            logger.debug("performing inference on a chunk of %i documents", len(chunk))

        # Initialize the variational distribution q(theta|gamma) for the chunk
        if collect_sstats:
            sstats = np.zeros_like(self.expElogbeta)
        else:
            sstats = None
        converged = 0

        # Stack all the computed gammas into this output array.
        gamma_chunk = np.zeros((0, self.num_topics))

        # Now, for each document d update gamma and phi w.r.t. all authors in those documents.
        for d, doc in enumerate(chunk):
            if chunk_doc_idx is not None:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get the IDs and counts of all the words in the current document.
            # TODO: this is duplication of code in LdaModel. Refactor.
            if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
                # make sure the term IDs are ints, otherwise np will get upset
                ids = [int(idx) for idx, _ in doc]
            else:
                ids = [idx for idx, _ in doc]
            cts = np.array([cnt for _, cnt in doc])

            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]

            gammad = self.state.gamma[authors_d, :]  # gamma of document d before update.
            tilde_gamma = gammad.copy()  # gamma that will be updated.

            # Compute the expectation of the log of the Dirichlet parameters theta and beta.
            Elogthetad = dirichlet_expectation(tilde_gamma)
            expElogthetad = np.exp(Elogthetad)
            expElogbetad = self.expElogbeta[:, ids]

            # Compute the normalizing constant of phi for the current document.
            phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

            # Iterate between gamma and phi until convergence
            for _ in xrange(self.iterations):
                lastgamma = tilde_gamma.copy()

                # Update gamma.
                # phi is computed implicitly below,
                for ai, a in enumerate(authors_d):
                    tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]]) * expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)

                # Update gamma.
                # Interpolation between document d's "local" gamma (tilde_gamma),
                # and "global" gamma (gammad).
                tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma

                # Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
                Elogthetad = dirichlet_expectation(tilde_gamma)
                expElogthetad = np.exp(Elogthetad)

                # Update the normalizing constant in phi.
                phinorm = self.compute_phinorm(expElogthetad, expElogbetad)

                # Check for convergence.
                # Criterion is mean change in "local" gamma.
                meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
                gamma_condition = meanchange_gamma < self.gamma_threshold
                if gamma_condition:
                    converged += 1
                    break
            # End of iterations loop.

            # Store the updated gammas in the model state.
            self.state.gamma[authors_d, :] = tilde_gamma

            # Stack the new gammas into the output array.
            gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])

            if collect_sstats:
                # Contribution of document d to the expected sufficient
                # statistics for the M step.
                expElogtheta_sum_a = expElogthetad.sum(axis=0)
                sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)

        if len(chunk) > 1:
            logger.debug(
                "%i/%i documents converged within %i iterations",
                converged, len(chunk), self.iterations
            )

        if collect_sstats:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
            # = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
            sstats *= self.expElogbeta
        return gamma_chunk, sstats
Example #20
    def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                 chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                 alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
                 gamma_threshold=0.001, serialized=False, serialization_path=None,
                 minimum_probability=0.01, random_state=None):
        """
        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
        start training straight away. If not given, the model is left untrained
        (presumably because you want to call the `update` method manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `author2doc` is a dictionary where the keys are the names of authors, and the
        values are lists of documents that the author contributes to.

        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
        and the values are lists of author names. I.e. this is the reverse mapping of
        `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be
        supplied.

        `passes` is the number of times the model makes a pass over the entire training
        data.

        `iterations` is the maximum number of times the model loops over each document
        during inference (the E-step). The iterations stop early when convergence is reached.

        `chunksize` controls the size of the mini-batches.

        `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates. Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively. `decay` controls how quickly old documents are
        forgotten, while `offset` down-weights early iterations.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be an integer or a numpy.random.RandomState object. Set the
        state of the random number generator inside the author-topic model, to ensure
        reproducibility of your experiments, for example.

        `serialized` indicates whether the input corpora to the model are simple
        in-memory lists (`serialized = False`) or saved to the hard-drive
        (`serialized = True`). Note that this behaviour is quite different from
        other Gensim models. If your data is too large to fit in to memory, use
        this functionality. Note that calling `AuthorTopicModel.update` with new
        data may be cumbersome as it requires all the existing data to be
        re-serialized.

        `serialization_path` must be set to a filepath, if `serialized = True` is
        used. Use, for example, `serialization_path = /tmp/serialized_model.mm` or use your
        working directory by setting `serialization_path = serialized_model.mm`. An existing
        file *cannot* be overwritten; either delete the old file or choose a different
        name.

        Example:

        >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
        >>> model.update(corpus2)  # update the author-topic model with additional documents

        >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
        # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
        distributed = False
        self.dispatcher = None
        self.numworkers = 1

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.author2doc = {}
        self.doc2author = {}

        self.distributed = distributed
        self.num_topics = num_topics
        self.num_authors = 0
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0
        self.total_docs = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.author2id = {}
        self.id2author = {}

        self.serialized = serialized
        if serialized and not serialization_path:
            raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).")
        if serialized and serialization_path:
            assert not isfile(serialization_path), \
                "A file already exists at the serialization_path path; " \
                "choose a different serialization_path, or delete the file."
        self.serialization_path = serialization_path

        # Initialize an empty self.corpus.
        self.init_empty_corpus()

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (self.num_topics,), \
            "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError("The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
                "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
                (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
        )

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
        self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None and (author2doc is not None or doc2author is not None):
            use_numpy = self.dispatcher is not None
            self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
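A small sketch of what the `author2doc` / `doc2author` mappings described in the docstring look like for a three-document corpus (hypothetical author names; only one of the two mappings has to be supplied):

author2doc = {
    'alice': [0, 1],  # alice contributed to documents 0 and 1
    'bob': [1, 2],    # bob contributed to documents 1 and 2
}
doc2author = {0: ['alice'], 1: ['alice', 'bob'], 2: ['bob']}  # the reverse mapping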
Example #21
    def update_doc(self, i, max_iter=500):
        self.mat_z[i] = np.zeros(((self.max_K + 1), self.chunk_doc_word_no[i]))
        self.mat_z_avrg[i] = np.copy(self.mat_z[i])
        self.mat_z_sum[i] = np.zeros((self.max_K + 1))

        ids = self.chunk_doc_word_ids_list[i]
        cts = self.chunk_doc_word_counts_list[i]
        words_no = self.chunk_doc_word_no[i]
        expElogbetad = self.m_dir_exp_lambda[np.ix_(self.effe_list, ids)]

        self.vi(i, ids, cts, words_no, expElogbetad, no_iter=1000)
        self.gibbs_samplings(i, ids, cts, words_no, expElogbetad, max_iter=10)

        iter = 2
        aver_sum = np.copy(self.mat_z_sum[i])
        aver_phi = digamma(self.G_0.G_0 * self.m_gamma +
                           self.mat_z_sum[i][self.effe_list])

        while iter < max_iter:
            last_aver_sum = np.copy(aver_sum)

            self.gibbs_samplings(i,
                                 ids,
                                 cts,
                                 words_no,
                                 expElogbetad,
                                 max_iter=1)
            self.mat_z_avrg[i] -= 1 / iter * (self.mat_z_avrg[i] -
                                              self.mat_z[i])
            aver_sum -= 1 / iter * (last_aver_sum - self.mat_z_sum[i])
            aver_phi -= 1 / iter * (aver_phi -
                                    digamma(self.G_0.G_0 * self.m_gamma +
                                            self.mat_z_sum[i][self.effe_list]))

            iter += 1

            meanchange = mean_absolute_difference(
                aver_sum[self.effe_list],
                last_aver_sum[self.effe_list]) / np.sum(cts)
            if meanchange < meanchangethresh:
                break

        self.mat_phi[self.effe_list,
                     i] = aver_phi - digamma(self.G_0.G_0 * self.m_gamma)

        if np.sum(self.mat_z_avrg[i][0]) > 0:
            add_vector = self.mat_z_sum[i][0]
            add_no = 1
            add_list = ids

            self.m_K += add_no
            new_k = find_gap_in_np_array(self.effe_list, add_no)

            self.effe_list = np.sort(self.effe_list.tolist() + new_k)

            self.mat_z_avrg[i][new_k] = self.mat_z_avrg[i][0]
            self.mat_z_avrg[i][0] = np.zeros_like(self.mat_z_avrg[i][0])

            self.mat_z[i][new_k] = self.mat_z[i][0]
            self.mat_z[i][0] = np.zeros_like(self.mat_z[i][0])

            self.mat_phi[new_k, i] = self.mat_phi[0, i]
            self.mat_phi[0, i] = np.zeros_like(self.mat_phi[0, i])

            self.G_0.add_new(add_no)

            self.m_lambda[np.ix_(new_k, add_list)] += self.rhot * self.m_D / self.chunksize * np.array(cts) * \
                                                      self.mat_z_avrg[i][new_k]
            self.m_dir_exp_lambda[new_k] = np.exp(
                dirichlet_expectation(self.m_lambda[new_k] + self.m_beta))
Example #22
    def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
                 chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
                 gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
                 outputdir=None, random_state=None):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            Corpus in BoW format.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`
            Dictionary for the input corpus.
        max_chunks : int, optional
            Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass,
            if there are not enough chunks in the corpus.
        max_time : int, optional
            Upper bound on time (in seconds) for which model will be trained.
        chunksize : int, optional
            Number of documents in one chunk.
        kappa : float, optional
            Learning parameter which acts as exponential decay factor to influence extent of learning from each batch.
        tau : float, optional
            Learning parameter which down-weights early iterations of documents.
        K : int, optional
            Second level truncation level
        T : int, optional
            Top level truncation level
        alpha : int, optional
            Second level concentration
        gamma : int, optional
            First level concentration
        eta : float, optional
            The topic Dirichlet
        scale : float, optional
            Weights information from the mini-chunk of corpus to calculate rhot.
        var_converge : float, optional
            Lower bound on the right side of convergence. Used when updating variational parameters for a
            single document.
        outputdir : str, optional
            Stores topic and options information in the specified directory.
        random_state : {None, int, array_like, :class:`~np.random.RandomState`, optional}
            Adds a little random jitter to randomize results around same alpha when trying to fetch a closest
            corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`

        """
        self.corpus = corpus
        self.id2word = id2word
        self.chunksize = chunksize
        self.max_chunks = max_chunks
        self.max_time = max_time
        self.outputdir = outputdir

        self.random_state = utils.get_random_state(random_state)

        self.lda_alpha = None
        self.lda_beta = None

        self.m_W = len(id2word)
        self.m_D = 0
        if corpus:
            self.m_D = len(corpus)

        self.m_T = T
        self.m_K = K
        self.m_alpha = alpha
        self.m_gamma = gamma

        self.m_var_sticks = np.zeros((2, T - 1))
        self.m_var_sticks[0] = 1.0
        self.m_var_sticks[1] = range(T - 1, 0, -1)
        self.m_varphi_ss = np.zeros(T)

        self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
        self.m_eta = eta
        self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

        self.m_tau = tau + 1
        self.m_kappa = kappa
        self.m_scale = scale
        self.m_updatect = 0
        self.m_status_up_to_date = True
        self.m_num_docs_processed = 0

        self.m_timestamp = np.zeros(self.m_W, dtype=int)
        self.m_r = [0]
        self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

        self.m_var_converge = var_converge

        if self.outputdir:
            self.save_options()

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.update(corpus)
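A minimal usage sketch for this constructor, assuming gensim's bundled test fixtures (`common_corpus`, `common_dictionary`) are available:

from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel

hdp = HdpModel(common_corpus, common_dictionary, K=15, T=150, chunksize=256)
print(hdp.print_topics(num_topics=5, num_words=5))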
Example #23
    def get_Elogbeta(self):
        return dirichlet_expectation(self.get_lambda())
Example #24
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 distributed=None,
                 chunk_size=2000,
                 passes=1,
                 update_every=1,
                 alpha='symmetric',
                 eta=None,
                 decay=0.5,
                 offset=1.0,
                 evaluate_every=10,
                 iterations=200,
                 gamma_threshold=0.001,
                 min_prob=0.01,
                 random_state=None,
                 ns_conf={},
                 min_phi_val=0.01,
                 per_word_topics=False):
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'At least one of corpus/id2word must be specified.')

        if self.id2word is None:
            logger.warning(
                'No word-id mapping provided; initializing from corpus, assuming identity'
            )
            self.id2word = corpus_handle.dict_from_corpus(corpus)
            self.num_items = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_items = 1 + max(self.id2word.keys())
        else:
            self.num_items = 0

        if self.num_items == 0:
            raise ValueError(
                " Cannot compute LDA over an empty collection(no items)")

        self.distributed = distributed
        self.num_topics = num_topics
        self.chunk_size = chunk_size
        self.decay = decay
        self.offset = offset
        self.min_prob = min_prob
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.evaluate_every = evaluate_every
        self.min_phi_val = min_phi_val

        self.alpha, self.optimize_alpha = self.init_dirichlet_prior(
            alpha, 'alpha')

        assert self.alpha.shape == (
            self.num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError(
                    "The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dirichlet_prior(eta, 'eta')

        self.random_state = corpus_handle.get_random_state(random_state)

        assert (
            self.eta.shape == (self.num_items, )
            or self.eta.shape == (self.num_topics, self.num_items)
        ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)"
            % (str(self.eta.shape), self.num_items, self.num_topics,
               self.num_items))

        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        if not distributed:
            logger.info("Using serial LDA version on this node.")
            self.dispatcher = None
            self.num_workers = 1
        else:
            pass

        self.state = LDAState(self.eta, (self.num_topics, self.num_items))
        self.state.s_stats = self.random_state.gamma(
            100., 1. / 100., (self.num_topics, self.num_items))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.s_stats))

        # if a training corpus was provided, start training estimating right away.
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunk_as_numpy=use_numpy)
Example #25
    def inference(self, chunk, collect_sstats=False):
        """
        Given a chunk of sparse document vectors, estimate gamma (parameters
        controlling the topic weights) for each document in the chunk.

        This function does not modify the model (=is read-only aka const). The
        whole input chunk of documents is assumed to fit in RAM; chunking of a
        large corpus must be done earlier in the pipeline.

        If `collect_sstats` is True, also collect sufficient statistics needed
        to update the model's topic-word distributions, and return a 2-tuple
        `(gamma, sstats)`. Otherwise, return `(gamma, None)`. `gamma` is of shape
        `len(chunk) x self.num_topics`.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

        """
        try:
            _ = len(chunk)
        except TypeError:
            # convert iterators/generators to plain list, so we have len() etc.
            chunk = list(chunk)
        if len(chunk) > 1:
            logger.debug("performing inference on a chunk of %i documents", len(chunk))

        # Initialize the variational distribution q(theta|gamma) for the chunk
        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        if collect_sstats:
            sstats = np.zeros_like(self.expElogbeta)
        else:
            sstats = None
        converged = 0

        # Now, for each document d update that document's gamma and phi
        # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
        # Lee&Seung trick which speeds things up by an order of magnitude, compared
        # to Blei's original LDA-C code, cool!).
        for d, doc in enumerate(chunk):
            if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
                # make sure the term IDs are ints, otherwise np will get upset
                ids = [int(id) for id, _ in doc]
            else:
                ids = [id for id, _ in doc]
            cts = np.array([cnt for _, cnt in doc])
            gammad = gamma[d, :]
            Elogthetad = Elogtheta[d, :]
            expElogthetad = expElogtheta[d, :]
            expElogbetad = self.expElogbeta[:, ids]

            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
            # phinorm is the normalizer.
            # TODO treat zeros explicitly, instead of adding 1e-100?
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

            # Iterate between gamma and phi until convergence
            for _ in xrange(self.iterations):
                lastgamma = gammad
                # We represent phi implicitly to save memory and time.
                # Substituting the value of the optimal phi back into
                # the update for gamma gives this update. Cf. Lee&Seung 2001.
                gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
                Elogthetad = dirichlet_expectation(gammad)
                expElogthetad = np.exp(Elogthetad)
                phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
                # If gamma hasn't changed much, we're done.
                meanchange = np.mean(abs(gammad - lastgamma))
                if (meanchange < self.gamma_threshold):
                    converged += 1
                    break
            gamma[d, :] = gammad
            if collect_sstats:
                # Contribution of document d to the expected sufficient
                # statistics for the M step.
                sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

        if len(chunk) > 1:
            logger.debug("%i/%i documents converged within %i iterations",
                         converged, len(chunk), self.iterations)

        if collect_sstats:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
            # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
            sstats *= self.expElogbeta
        return gamma, sstats
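The "represent phi implicitly" comment above refers to the Lee & Seung style identity that lets the gamma update avoid materializing phi. A small self-contained sketch of that identity, with toy shapes and names mirroring the code above:

import numpy as np

K, W = 4, 7                              # topics, distinct words in one document
expElogthetad = np.random.rand(K)
expElogbetad = np.random.rand(K, W)
cts = np.random.randint(1, 5, size=W).astype(float)

phinorm = expElogthetad.dot(expElogbetad)              # per-word normalizer of phi
phi = expElogthetad[:, None] * expElogbetad / phinorm  # explicit phi_{wk}, shape (K, W)

explicit = (phi * cts).sum(axis=1)                                # sum_w n_w * phi_{wk}
implicit = expElogthetad * np.dot(cts / phinorm, expElogbetad.T)  # the form used in the update above
assert np.allclose(explicit, implicit)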
Example #26
print change_index_result.shape
''' '''
import torch
from torch.autograd import Variable
gamma_test = -1*Variable(torch.from_numpy(np.random.rand(3, 6)))
para_test = Variable(torch.from_numpy(np.random.rand(6, 4)))
print gamma_test
print gamma_test.abs()
print (gamma_test.abs().sum(dim=1).view(-1, 1))
print (gamma_test.abs()) / (gamma_test.abs().sum(dim=1).view(-1, 1))


print para_test
mum = gamma_test.abs().mm(para_test)
print mum
print 'max: '
max = mum.max(dim=1)[0]
print max
sum = max.sum()
print sum
re = Variable(torch.DoubleTensor([1.0]))/sum
print re
'''
a = Variable(torch.FloatTensor([1.1]))
print a
print psi([1.0])
# print psi(a)
print matutils.dirichlet_expectation(np.array([1.0]))
print matutils.dirichlet_expectation(a.data)

print str('"yang"').replace('"', '')
Example #27
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 distributed=False,
                 chunksize=2000,
                 passes=1,
                 update_every=1,
                 alpha='symmetric',
                 eta=None,
                 decay=0.5,
                 offset=1.0,
                 eval_every=10,
                 iterations=50,
                 gamma_threshold=0.001,
                 minimum_probability=0.01,
                 random_state=None,
                 ns_conf={},
                 minimum_phi_value=0.01,
                 per_word_topics=False):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        support special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be a np.random.RandomState object or the seed for one

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            self.num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError(
                    "The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (
            self.eta.shape == (self.num_terms, )
            or self.eta.shape == (self.num_topics, self.num_terms)
        ), ("Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)"
            % (str(self.eta.shape), self.num_terms, self.num_topics,
               self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError(
                    "auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                with utils.getNS(**ns_conf) as ns:
                    from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                    self.dispatcher = Pyro4.Proxy(
                        ns.list(prefix=LDA_DISPATCHER_PREFIX)
                        [LDA_DISPATCHER_PREFIX])
                    logger.debug("looking for dispatcher at %s" %
                                 str(self.dispatcher._pyroUri))
                    self.dispatcher.initialize(id2word=self.id2word,
                                               num_topics=self.num_topics,
                                               chunksize=chunksize,
                                               alpha=alpha,
                                               eta=eta,
                                               distributed=False)
                    self.numworkers = len(self.dispatcher.getworkers())
                    logger.info("using distributed version with %i workers" %
                                self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError(
                    "failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = self.random_state.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
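Before the next example, a small hedged usage sketch of the `alpha`/`eta` options described in the docstring above. The toy corpus and every variable name below are illustrative only, not part of the original example:

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# tiny throw-away corpus, purely for illustration
texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

custom_alpha = np.full(3, 1.0 / 3)   # explicit prior, one entry per topic (could be made asymmetric)
lda = LdaModel(corpus, id2word=dictionary, num_topics=3,
               alpha=custom_alpha,   # user-supplied document-topic prior
               eta='auto')           # learn an asymmetric word prior from the data
print(lda.alpha, lda.optimize_eta)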
Example #28
    def get_Elogbeta(self):
        return dirichlet_expectation(self.get_lambda())
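For reference, a minimal standalone sketch (not gensim's own code) of what `dirichlet_expectation` computes: for theta ~ Dirichlet(alpha), E[log theta_k] = psi(alpha_k) - psi(sum_j alpha_j), where psi is the digamma function.

import numpy as np
from scipy.special import psi  # digamma function

def dirichlet_expectation_ref(alpha):
    """Reference implementation for a 1-d vector or a 2-d matrix of Dirichlet parameters."""
    if alpha.ndim == 1:
        return psi(alpha) - psi(np.sum(alpha))
    # row-wise, e.g. for a lambda matrix of shape (num_topics, num_terms) as in get_Elogbeta above
    return psi(alpha) - psi(np.sum(alpha, axis=1))[:, np.newaxis]

print(dirichlet_expectation_ref(np.array([0.5, 1.0, 2.5])))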
Example #29
    def inference(self,
                  chunk,
                  author2doc,
                  doc2author,
                  rhot,
                  collect_sstats=False,
                  chunk_doc_idx=None):
        """
        Given a chunk of sparse document vectors, update gamma (parameters
        controlling the topic weights) for each author corresponding to the
        documents in the chunk.

        The whole input chunk of documents is assumed to fit in RAM; chunking of
        a large corpus must be done earlier in the pipeline.

        If `collect_sstats` is True, also collect sufficient statistics needed
        to update the model's topic-word distributions, and return a 2-tuple
        `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`.
        `gamma_chunk` is of shape `num_chunk_authors x self.num_topics`, where
        `num_chunk_authors` is the number of authors in the documents of the
        current chunk.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

        """
        try:
            _ = len(chunk)
        except TypeError:
            # convert iterators/generators to plain list, so we have len() etc.
            chunk = list(chunk)
        if len(chunk) > 1:
            logger.debug("performing inference on a chunk of %i documents",
                         len(chunk))

        # Initialize the variational distribution q(theta|gamma) for the chunk
        if collect_sstats:
            sstats = np.zeros_like(self.expElogbeta)
        else:
            sstats = None
        converged = 0

        # Stack all the computed gammas into this output array.
        gamma_chunk = np.zeros((0, self.num_topics))

        # Now, for each document d update gamma and phi w.r.t. all authors in those documents.
        for d, doc in enumerate(chunk):
            if chunk_doc_idx is not None:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get the IDs and counts of all the words in the current document.
            # TODO: this is duplication of code in LdaModel. Refactor.
            if doc and not isinstance(doc[0][0], six.integer_types):
                # make sure the term IDs are ints, otherwise np will get upset
                ids = [int(id) for id, _ in doc]
            else:
                ids = [id for id, _ in doc]
            cts = np.array([cnt for _, cnt in doc])

            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]

            gammad = self.state.gamma[
                authors_d, :]  # gamma of document d before update.
            tilde_gamma = gammad.copy()  # gamma that will be updated.

            # Compute the expectation of the log of the Dirichlet parameters theta and beta.
            Elogthetad = dirichlet_expectation(tilde_gamma)
            expElogthetad = np.exp(Elogthetad)
            expElogbetad = self.expElogbeta[:, ids]

            # Compute the normalizing constant of phi for the current document.
            phinorm = self.compute_phinorm(ids, authors_d, expElogthetad,
                                           expElogbetad)

            # Iterate between gamma and phi until convergence
            for iteration in xrange(self.iterations):

                lastgamma = tilde_gamma.copy()

                # Update gamma.
                # phi is computed implicitly below,
                for ai, a in enumerate(authors_d):
                    tilde_gamma[ai, :] = self.alpha + len(self.author2doc[
                        self.id2author[a]]) * expElogthetad[ai, :] * np.dot(
                            cts / phinorm, expElogbetad.T)

                # Update gamma.
                # Interpolation between document d's "local" gamma (tilde_gamma),
                # and "global" gamma (gammad).
                tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma

                # Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
                Elogthetad = dirichlet_expectation(tilde_gamma)
                expElogthetad = np.exp(Elogthetad)

                # Update the normalizing constant in phi.
                phinorm = self.compute_phinorm(ids, authors_d, expElogthetad,
                                               expElogbetad)

                # Check for convergence.
                # Criterion is mean change in "local" gamma.
                meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
                gamma_condition = meanchange_gamma < self.gamma_threshold
                if gamma_condition:
                    converged += 1
                    break
            # End of iterations loop.

            # Store the updated gammas in the model state.
            self.state.gamma[authors_d, :] = tilde_gamma

            # Stack the new gammas into the output array.
            gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])

            if collect_sstats:
                # Contribution of document d to the expected sufficient
                # statistics for the M step.
                expElogtheta_sum_a = expElogthetad.sum(axis=0)
                sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)

        if len(chunk) > 1:
            logger.debug("%i/%i documents converged within %i iterations",
                         converged, len(chunk), self.iterations)

        if collect_sstats:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
            # = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
            sstats *= self.expElogbeta
        return gamma_chunk, sstats
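The inference loop above never materializes phi explicitly. As a standalone illustration (plain numpy, single document, not the gensim classes above), the implicit update obtained by substituting the optimal phi back into the gamma update looks roughly like this:

import numpy as np
from scipy.special import psi

def infer_gamma(counts, expElogbeta_doc, alpha, iterations=50, threshold=0.001):
    """E-step for one document without materializing phi.
    counts: word counts of the document's terms, shape (V_d,)
    expElogbeta_doc: exp(E[log beta]) restricted to those terms, shape (K, V_d)
    alpha: Dirichlet prior over topics, shape (K,)"""
    K = expElogbeta_doc.shape[0]
    gamma = np.ones(K)
    expElogtheta = np.exp(psi(gamma) - psi(gamma.sum()))
    phinorm = expElogtheta.dot(expElogbeta_doc) + 1e-100
    for _ in range(iterations):
        lastgamma = gamma
        # optimal phi_{wk} is proportional to expElogtheta_k * expElogbeta_{kw};
        # substituting it into the gamma update gives this multiplicative form.
        gamma = alpha + expElogtheta * (counts / phinorm).dot(expElogbeta_doc.T)
        expElogtheta = np.exp(psi(gamma) - psi(gamma.sum()))
        phinorm = expElogtheta.dot(expElogbeta_doc) + 1e-100
        if np.mean(np.abs(gamma - lastgamma)) < threshold:
            break
    return gamma

rng = np.random.default_rng(0)
print(infer_gamma(np.array([3.0, 1.0, 2.0]), rng.random((4, 3)) + 0.1, np.full(4, 0.1)))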
    def __init__(self,
                 corpus,
                 id2word,
                 max_chunks=None,
                 max_time=None,
                 chunksize=256,
                 kappa=1.0,
                 tau=64.0,
                 K=15,
                 T=150,
                 alpha=1,
                 gamma=1,
                 eta=0.01,
                 scale=1.0,
                 var_converge=0.0001,
                 outputdir=None,
                 random_state=None):
        """
        `gamma`: first level concentration
        `alpha`: second level concentration
        `eta`: the topic Dirichlet
        `T`: top level truncation level
        `K`: second level truncation level
        `kappa`: learning rate
        `tau`: slow down parameter
        `max_time`: stop training after this many seconds
        `max_chunks`: stop after having processed this many chunks (wrap around
        corpus beginning in another corpus pass, if there are not enough chunks
        in the corpus)
        """
        self.corpus = corpus
        self.id2word = id2word
        self.chunksize = chunksize
        self.max_chunks = max_chunks
        self.max_time = max_time
        self.outputdir = outputdir

        self.random_state = utils.get_random_state(random_state)

        self.lda_alpha = None
        self.lda_beta = None

        self.m_W = len(id2word)
        self.m_D = 0
        if corpus:
            self.m_D = len(corpus)

        self.m_T = T
        self.m_K = K
        self.m_alpha = alpha
        self.m_gamma = gamma

        self.m_var_sticks = np.zeros((2, T - 1))
        self.m_var_sticks[0] = 1.0
        self.m_var_sticks[1] = range(T - 1, 0, -1)
        self.m_varphi_ss = np.zeros(T)

        self.m_lambda = self.random_state.gamma(
            1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
        self.m_eta = eta
        self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

        self.m_tau = tau + 1
        self.m_kappa = kappa
        self.m_scale = scale
        self.m_updatect = 0
        self.m_status_up_to_date = True
        self.m_num_docs_processed = 0

        self.m_timestamp = np.zeros(self.m_W, dtype=int)
        self.m_r = [0]
        self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

        self.m_var_converge = var_converge

        if self.outputdir:
            self.save_options()

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.update(corpus)
def get_inference_penalty(net, hidden_size, docs_path, topic_num):
    # train the lda model
    selected_docs = pd.read_csv(docs_path, header=None, index_col=[0]).values
    print('number of docs:', selected_docs.shape)
    # print selected_docs[:5]
    texts = [[word for word in doc[0].split(' ')] for doc in selected_docs]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save_as_text(Path+'/data-repository/available_word_in_literature.csv')
    print(dictionary)
    # print dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(corpus[:5])
    print(len(corpus))
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_num, update_every=1, chunksize=1000, passes=1)

    # to inference the new doc
    # initialize the variational distribution q(theta|gamma) for the chunk
    init_gamma = utils.get_random_state(None).gamma(100., 1. / 100., (hidden_size, topic_num))
    Elogtheta = matutils.dirichlet_expectation(init_gamma)
    expElogtheta = np.exp(Elogtheta)

    converged = 0
    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    for para_iter, para in enumerate(net.parameters()):
        if para_iter == 0:
            para_data = para.abs()
            for d, doc in enumerate(corpus):  # NOTE: assumed to loop over `corpus`; the original snippet referenced an undefined `chunk`
                if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
                    # make sure the term IDs are ints, otherwise np will get upset
                    ids = [int(idx) for idx, _ in doc]
                else:
                    ids = [idx for idx, _ in doc]
                cts = np.array([cnt for _, cnt in doc])
                gammad = init_gamma[d, :]
                Elogthetad = Elogtheta[d, :]
                expElogthetad = expElogtheta[d, :]
                expElogbetad = lda_model.expElogbeta[:, ids]

                # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
                # phinorm is the normalizer.
                # TODO treat zeros explicitly, instead of adding 1e-100?
                phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

                # Iterate between gamma and phi until convergence
                for _ in range(lda_model.iterations):
                    lastgamma = gammad
                    # We represent phi implicitly to save memory and time.
                    # Substituting the value of the optimal phi back into
                    # the update for gamma gives this update. Cf. Lee&Seung 2001.
                    gammad = lda_model.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
                    Elogthetad = matutils.dirichlet_expectation(gammad)
                    expElogthetad = np.exp(Elogthetad)
                    phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
                    # If gamma hasn't changed much, we're done.
                    meanchange = np.mean(abs(gammad - lastgamma))
                    if meanchange < lda_model.gamma_threshold:
                        converged += 1
                        break
                init_gamma[d, :] = gammad
    pass
Example #32
    def inference(self, chunk, collect_sstats=False):
        """
        Given a chunk of sparse document vectors, estimate gamma (parameters
        controlling the topic weights) for each document in the chunk.

        This function does not modify the model (=is read-only aka const). The
        whole input chunk of documents is assumed to fit in RAM; chunking of a
        large corpus must be done earlier in the pipeline.

        If `collect_sstats` is True, also collect sufficient statistics needed
        to update the model's topic-word distributions, and return a 2-tuple
        `(gamma, sstats)`. Otherwise, return `(gamma, None)`. `gamma` is of shape
        `len(chunk) x self.num_topics`.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

        """
        try:
            _ = len(chunk)
        except TypeError:
            # convert iterators/generators to plain list, so we have len() etc.
            chunk = list(chunk)
        if len(chunk) > 1:
            logger.debug("performing inference on a chunk of %i documents",
                         len(chunk))

        # Initialize the variational distribution q(theta|gamma) for the chunk
        gamma = self.random_state.gamma(100., 1. / 100.,
                                        (len(chunk), self.num_topics))
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        if collect_sstats:
            sstats = np.zeros_like(self.expElogbeta)
        else:
            sstats = None
        converged = 0

        # Now, for each document d update that document's gamma and phi
        # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
        # Lee&Seung trick which speeds things up by an order of magnitude, compared
        # to Blei's original LDA-C code, cool!).
        for d, doc in enumerate(chunk):
            if doc and not isinstance(doc[0][0], six.integer_types):
                # make sure the term IDs are ints, otherwise np will get upset
                ids = [int(id) for id, _ in doc]
            else:
                ids = [id for id, _ in doc]
            cts = np.array([cnt for _, cnt in doc])
            gammad = gamma[d, :]
            Elogthetad = Elogtheta[d, :]
            expElogthetad = expElogtheta[d, :]
            expElogbetad = self.expElogbeta[:, ids]

            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
            # phinorm is the normalizer.
            # TODO treat zeros explicitly, instead of adding 1e-100?
            phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100

            # Iterate between gamma and phi until convergence
            for _ in xrange(self.iterations):
                lastgamma = gammad
                # We represent phi implicitly to save memory and time.
                # Substituting the value of the optimal phi back into
                # the update for gamma gives this update. Cf. Lee&Seung 2001.
                gammad = self.alpha + expElogthetad * np.dot(
                    cts / phinorm, expElogbetad.T)
                Elogthetad = dirichlet_expectation(gammad)
                expElogthetad = np.exp(Elogthetad)
                phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
                # If gamma hasn't changed much, we're done.
                meanchange = np.mean(abs(gammad - lastgamma))
                if (meanchange < self.gamma_threshold):
                    converged += 1
                    break
            gamma[d, :] = gammad
            if collect_sstats:
                # Contribution of document d to the expected sufficient
                # statistics for the M step.
                sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

        if len(chunk) > 1:
            logger.debug("%i/%i documents converged within %i iterations",
                         converged, len(chunk), self.iterations)

        if collect_sstats:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
            # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
            sstats *= self.expElogbeta
        return gamma, sstats
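Once `sstats` has been collected as above, the online M-step blends it into lambda. A hedged sketch of the standard update from Hoffman et al. (2010); gensim's `LdaState` performs the equivalent bookkeeping internally, so this is only an illustration of the formula, not the library's exact code path:

import numpy as np

def online_lambda_update(lam, sstats, eta, num_docs_total, chunk_size, update_count,
                         decay=0.5, offset=1.0):
    """One online variational Bayes M-step.
    lam, sstats: arrays of shape (num_topics, num_terms); eta: scalar or array prior."""
    rho = pow(offset + update_count, -decay)                     # step size rho_t = (tau_0 + t)^(-kappa)
    lambda_hat = eta + (num_docs_total / chunk_size) * sstats    # estimate from this mini-batch alone
    return (1.0 - rho) * lam + rho * lambda_hat

lam = np.ones((2, 5))
sstats = np.zeros((2, 5))
sstats[0, 1] = 3.0
print(online_lambda_update(lam, sstats, eta=0.01, num_docs_total=1000, chunk_size=10, update_count=1))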
Example #33
    def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
                 chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
                 gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
                 outputdir=None, random_state=None):
        """
        `gamma`: first level concentration
        `alpha`: second level concentration
        `eta`: the topic Dirichlet
        `T`: top level truncation level
        `K`: second level truncation level
        `kappa`: learning rate
        `tau`: slow down parameter
        `max_time`: stop training after this many seconds
        `max_chunks`: stop after having processed this many chunks (wrap around
        corpus beginning in another corpus pass, if there are not enough chunks
        in the corpus)
        """
        self.corpus = corpus
        self.id2word = id2word
        self.chunksize = chunksize
        self.max_chunks = max_chunks
        self.max_time = max_time
        self.outputdir = outputdir

        self.random_state = utils.get_random_state(random_state)

        self.lda_alpha = None
        self.lda_beta = None

        self.m_W = len(id2word)
        self.m_D = 0
        if corpus:
            self.m_D = len(corpus)

        self.m_T = T
        self.m_K = K
        self.m_alpha = alpha
        self.m_gamma = gamma

        self.m_var_sticks = np.zeros((2, T - 1))
        self.m_var_sticks[0] = 1.0
        self.m_var_sticks[1] = range(T - 1, 0, -1)
        self.m_varphi_ss = np.zeros(T)

        self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
        self.m_eta = eta
        self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

        self.m_tau = tau + 1
        self.m_kappa = kappa
        self.m_scale = scale
        self.m_updatect = 0
        self.m_status_up_to_date = True
        self.m_num_docs_processed = 0

        self.m_timestamp = np.zeros(self.m_W, dtype=int)
        self.m_r = [0]
        self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

        self.m_var_converge = var_converge

        if self.outputdir:
            self.save_options()

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.update(corpus)
Example #34
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 author2doc=None,
                 doc2author=None,
                 chunksize=2000,
                 passes=1,
                 iterations=50,
                 decay=0.5,
                 offset=1.0,
                 alpha='symmetric',
                 eta='symmetric',
                 update_every=1,
                 eval_every=10,
                 gamma_threshold=0.001,
                 serialized=False,
                 serialization_path=None,
                 minimum_probability=0.01,
                 random_state=None):
        """
        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
        start training straight away. If not given, the model is left untrained
        (presumably because you want to call the `update` method manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `author2doc` is a dictionary where the keys are the names of authors, and the
        values are lists of documents that the author contributes to.

        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
        and the values are lists of author names. I.e. this is the reverse mapping of
        `author2doc`. Only one of the two, `author2doc` and `doc2author`, has to be
        supplied.

        `passes` is the number of times the model makes a pass over the entire training
        data.

        `iterations` is the maximum number of times the model loops over each document
        during inference (the E-step); the iterations stop early once convergence is reached.

        `chunksize` controls the size of the mini-batches.

        `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports the special values 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (cannot be learned from data).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates. Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively. `decay` controls how quickly old documents are
        forgotten, while `offset` down-weights early iterations.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be an integer or a numpy.random.RandomState object. Set the
        state of the random number generator inside the author-topic model, to ensure
        reproducibility of your experiments, for example.

        `serialized` indicates whether the input corpora to the model are simple
        in-memory lists (`serialized = False`) or saved to the hard-drive
        (`serialized = True`). Note that this behaviour is quite different from
        other Gensim models. If your data is too large to fit in to memory, use
        this functionality. Note that calling `AuthorTopicModel.update` with new
        data may be cumbersome as it requires all the existing data to be
        re-serialized.

        `serialization_path` must be set to a filepath, if `serialized = True` is
        used. Use, for example, `serialization_path = '/tmp/serialized_model.mm'`, or use your
        working directory by setting `serialization_path = 'serialized_model.mm'`. An existing
        file *cannot* be overwritten; either delete the old file or choose a different
        name.

        Example:

        >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
        >>> model.update(corpus2)  # update the author-topic model with additional documents

        >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
        # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
        distributed = False
        self.dispatcher = None
        self.numworkers = 1

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute the author-topic model over an empty collection (no terms)"
            )

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.author2doc = {}
        self.doc2author = {}

        self.distributed = distributed
        self.num_topics = num_topics
        self.num_authors = 0
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0
        self.total_docs = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.author2id = {}
        self.id2author = {}

        self.serialized = serialized
        if serialized and not serialization_path:
            raise ValueError(
                "If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path)."
            )
        if serialized and serialization_path:
            assert not isfile(
                serialization_path
            ), "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file."
        self.serialization_path = serialization_path

        # Initialize an empty self.corpus.
        self.init_empty_corpus()

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            self.num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError(
                    "The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (
            self.eta.shape == (self.num_terms, )
            or self.eta.shape == (self.num_topics, self.num_terms)
        ), ("Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)"
            % (str(self.eta.shape), self.num_terms, self.num_topics,
               self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
        self.state = AuthorTopicState(self.eta,
                                      (self.num_topics, self.num_terms),
                                      (self.num_authors, self.num_topics))
        self.state.sstats = self.random_state.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None and (author2doc is not None
                                   or doc2author is not None):
            use_numpy = self.dispatcher is not None
            self.update(corpus,
                        author2doc,
                        doc2author,
                        chunks_as_numpy=use_numpy)
Example #35
    def __init__(self, corpus=None, num_topics=100, id2word=None,
                 distributed=False, chunksize=2000, passes=1, update_every=1,
                 alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                 eval_every=10, iterations=50, gamma_threshold=0.001,
                 minimum_probability=0.01, random_state=None, ns_conf={},
                 minimum_phi_value=0.01, per_word_topics=False):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports the special values 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (cannot be learned from data).

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al., respectively.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be a `np.random.RandomState` object or a seed for one.

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError("The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
            "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
            (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                with utils.getNS(**ns_conf) as ns:
                    from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                    self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                    logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri))
                    self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics,
                                               chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                    self.numworkers = len(self.dispatcher.getworkers())
                    logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
Example #36
    def bound(self,
              chunk,
              chunk_doc_idx=None,
              subsample_ratio=1.0,
              author2doc=None,
              doc2author=None):
        """
        Estimate the variational bound of documents from `chunk`:
        E_q[log p(corpus)] - E_q[log q(corpus)]

        There are basically two use cases of this method:
        1. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
        indicating the indexes of the documents in the training corpus.
        2. `chunk` is a test set (held-out data), and `author2doc` and `doc2author`
        corresponding to this test set are provided. There must not be any new authors
        passed to this method. `chunk_doc_idx` is not needed in this case.

        To obtain the per-word bound, compute:

        >>> corpus_words = sum(cnt for document in corpus for _, cnt in document)
        >>> model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words

        """

        # TODO: enable evaluation of documents with new authors. One could, for example, make it
        # possible to pass a list of documents to self.inference with no author dictionaries,
        # assuming all the documents correspond to one (unseen) author, learn the author's
        # gamma, and return gamma (without adding it to self.state.gamma). Of course,
        # collect_sstats should be set to false, so that the model is not updated w.r.t. these
        # new documents.

        _lambda = self.state.get_lambda()
        Elogbeta = dirichlet_expectation(_lambda)
        expElogbeta = np.exp(Elogbeta)

        gamma = self.state.gamma

        if author2doc is None and doc2author is None:
            # Evaluating on training documents (chunk of self.corpus).
            author2doc = self.author2doc
            doc2author = self.doc2author

            if not chunk_doc_idx:
                # If author2doc and doc2author are not provided, chunk is assumed to be a subset of
                # self.corpus, and chunk_doc_idx is thus required.
                raise ValueError(
                    'Either author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.'
                )
        elif author2doc is not None and doc2author is not None:
            # Training on held-out documents (documents not seen during training).
            # All authors in dictionaries must still be seen during training.
            for a in author2doc.keys():
                if not self.author2doc.get(a):
                    raise ValueError(
                        'bound cannot be called with authors not seen during training.'
                    )

            if chunk_doc_idx:
                raise ValueError(
                    'Either author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.'
                )
        else:
            raise ValueError(
                'Either both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.'
            )

        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)

        word_score = 0.0
        theta_score = 0.0
        for d, doc in enumerate(chunk):
            if chunk_doc_idx:
                doc_no = chunk_doc_idx[d]
            else:
                doc_no = d
            # Get all authors in current document, and convert the author names to integer IDs.
            authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
            ids = np.array([id for id, _ in doc])  # Word IDs in doc.
            cts = np.array([cnt for _, cnt in doc])  # Word counts.

            if d % self.chunksize == 0:
                logger.debug("bound: at document #%i in chunk", d)

            # Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
            # is the same computation as in normalizing phi.
            phinorm = self.compute_phinorm(ids, authors_d,
                                           expElogtheta[authors_d, :],
                                           expElogbeta[:, ids])
            word_score += np.log(1.0 / len(authors_d)) + cts.dot(
                np.log(phinorm))

        # Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
        # that the likelihood is always roughly on the same scale.
        word_score *= subsample_ratio

        # E[log p(theta | alpha) - log q(theta | gamma)]
        for a in author2doc.keys():
            a = self.author2id[a]
            theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
            theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
            theta_score += gammaln(np.sum(self.alpha)) - gammaln(
                np.sum(gamma[a, :]))

        # theta_score is rescaled in a similar fashion.
        # TODO: treat this in a more general way, similar to how it is done with word_score.
        theta_score *= self.num_authors / len(author2doc)

        # E[log p(beta | eta) - log q (beta | lambda)]
        beta_score = 0.0
        beta_score += np.sum((self.eta - _lambda) * Elogbeta)
        beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
        sum_eta = np.sum(self.eta)
        beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

        total_score = word_score + theta_score + beta_score

        return total_score
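As a follow-up to the per-word bound shown in the docstring above, turning it into a perplexity estimate is a one-liner; the numeric value below is made up purely for illustration:

import numpy as np

# per_word_bound would come from e.g.
#   model.bound(corpus, author2doc=author2doc, doc2author=doc2author) / corpus_words
per_word_bound = -7.5                 # illustrative value only
print(np.exp(-per_word_bound))        # perplexity if the bound is measured in nats
print(np.exp2(-per_word_bound))       # perplexity if the bound is measured in bits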
Example #37
    def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
                 chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
                 gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
                 outputdir=None, random_state=None):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            Corpus in BoW format.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`
            Dictionary for the input corpus.
        max_chunks : int, optional
            Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass,
            if there are not enough chunks in the corpus.
        max_time : int, optional
            Upper bound on time (in seconds) for which model will be trained.
        chunksize : int, optional
            Number of documents in one chunk.
        kappa : float, optional
            Learning parameter which acts as exponential decay factor to influence extent of learning from each batch.
        tau : float, optional
            Learning parameter which down-weights early iterations of documents.
        K : int, optional
            Second level truncation level.
        T : int, optional
            Top level truncation level.
        alpha : int, optional
            Second level concentration.
        gamma : int, optional
            First level concentration.
        eta : float, optional
            The topic Dirichlet.
        scale : float, optional
            Weights information from the mini-chunk of corpus to calculate rhot.
        var_converge : float, optional
            Lower bound on the right side of convergence. Used when updating variational parameters for a
            single document.
        outputdir : str, optional
            Stores topic and options information in the specified directory.
        random_state : {None, int, array_like, :class:`~np.random.RandomState`}, optional
            Adds a little random jitter to randomize results around the same alpha when trying to fetch the closest
            corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`.

        """
        self.corpus = corpus
        self.id2word = id2word
        self.chunksize = chunksize
        self.max_chunks = max_chunks
        self.max_time = max_time
        self.outputdir = outputdir

        self.random_state = utils.get_random_state(random_state)

        self.lda_alpha = None
        self.lda_beta = None

        self.m_W = len(id2word)
        self.m_D = 0
        if corpus:
            self.m_D = len(corpus)

        self.m_T = T
        self.m_K = K
        self.m_alpha = alpha
        self.m_gamma = gamma

        self.m_var_sticks = np.zeros((2, T - 1))
        self.m_var_sticks[0] = 1.0
        self.m_var_sticks[1] = range(T - 1, 0, -1)
        self.m_varphi_ss = np.zeros(T)

        self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
        self.m_eta = eta
        self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

        self.m_tau = tau + 1
        self.m_kappa = kappa
        self.m_scale = scale
        self.m_updatect = 0
        self.m_status_up_to_date = True
        self.m_num_docs_processed = 0

        self.m_timestamp = np.zeros(self.m_W, dtype=int)
        self.m_r = [0]
        self.m_lambda_sum = np.sum(self.m_lambda, axis=1)

        self.m_var_converge = var_converge

        if self.outputdir:
            self.save_options()

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.update(corpus)
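Finally, a small standalone sketch of why `m_var_sticks` is initialized with ones in the first row and `T-1, ..., 1` in the second: under those Beta parameters the expected stick-breaking weights come out uniform over the T top-level topics. Plain numpy, illustration only:

import numpy as np

def expected_stick_weights(var_sticks):
    """Mean-field expectation of stick-breaking weights.
    var_sticks has shape (2, T-1): row 0 holds the Beta 'a' parameters, row 1 the 'b' parameters."""
    a, b = var_sticks[0], var_sticks[1]
    Ev = a / (a + b)                                   # E[v_i] under Beta(a_i, b_i)
    weights = np.empty(len(Ev) + 1)
    weights[:-1] = Ev * np.concatenate(([1.0], np.cumprod(1.0 - Ev)[:-1]))
    weights[-1] = np.prod(1.0 - Ev)                    # leftover mass assigned to the last stick
    return weights

T = 5
var_sticks = np.zeros((2, T - 1))
var_sticks[0] = 1.0
var_sticks[1] = np.arange(T - 1, 0, -1)
print(expected_stick_weights(var_sticks))              # uniform: [0.2 0.2 0.2 0.2 0.2]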