Example No. 1
    def __init__(self,
                 K,
                 alpha,
                 eta,
                 tau0,
                 kappa,
                 sanity_check=False,
                 parse=parse):
        """
        Arguments:
        K: Number of topics
        alpha: Hyperparameter for prior on weight vectors theta
        eta: Hyperparameter for prior on topics beta
        tau0: A (positive) learning parameter that downweights early iterations
        kappa: Learning rate; the exponential decay rate. Should lie in
             (0.5, 1.0] to guarantee asymptotic convergence.

        Note that if you pass in the same set of D documents every time and
        set kappa=0, this class can also be used to do batch VB.
        """

        if not isinstance(K, int):
            raise ParameterError

        # set the model-level parameters
        self._K = K
        self._alpha = alpha
        self._eta = eta
        self._tau0 = tau0 + 1
        self._kappa = kappa
        self.sanity_check = sanity_check
        # number of documents seen *so far*. Updated each time a new batch is
        # submitted.
        self._D = 0

        # number of batches processed so far.
        self._batches_to_date = 0

        # cache the wordids and wordcts for the most recent batch so they don't
        # have to be recalculated when computing perplexity
        self.recentbatch = {'wordids': None, 'wordcts': None}

        # Initialize lambda as a DirichletWords object which has a non-zero
        # probability for any character sequence, even those unseen.
        self._lambda = DirichletWords(self._K,
                                      sanity_check=self.sanity_check,
                                      initialize=True)
        self._lambda_mat = self._lambda.as_matrix()

        # set the variational distribution q(beta|lambda).
        self._Elogbeta = self._lambda_mat  # num_topics x num_words
        self._expElogbeta = n.exp(self._Elogbeta)  # num_topics x num_words

        # normalize and parse string function.
        self.parse = parse
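
For context, a minimal usage sketch of this constructor. The class name OnlineLDA and the hyperparameter values are assumptions for illustration; they are not given in this snippet.

    # Hypothetical usage (class name and values assumed, not from this snippet).
    # K=100 topics, symmetric priors alpha=eta=1/K, tau0=1024, kappa=0.7.
    olda = OnlineLDA(K=100, alpha=1. / 100, eta=1. / 100,
                     tau0=1024, kappa=0.7, sanity_check=False)
    # Per the docstring, kappa in (0.5, 1.0] guarantees asymptotic convergence,
    # while passing the same D documents every time with kappa=0 gives batch VB.
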
Example No. 2
    def do_e_step(self, docs):
        """
        Given a mini-batch of documents, estimates the parameters
        gamma controlling the variational distribution over the topic
        weights for each document in the mini-batch.

        Arguments:
        docs:  List of D documents. Each document must be represented
               as a string. (Word order is unimportant.) Any
               words not in the vocabulary will be ignored.

        Returns a tuple containing the estimated values of gamma,
        as well as sufficient statistics needed to update lambda.
        """
        # This is to handle the case where someone just passes in a single
        # document, not in a list.
        if type(docs) == str:
            docs = [docs]

        (wordids, wordcts) = self.parse_new_docs(docs)
        # Don't use len(docs) here: any empty documents are skipped in the
        # parse step above, so len(docs) can be larger than the number of
        # documents actually represented in the wordids list.
        batchD = len(wordids)

        # Initialize the variational distribution q(theta|gamma) for
        # the mini-batch
        gamma = n.random.gamma(100., 1. / 100., (batchD, self._K))  # batchD x K
        Elogtheta = dirichlet_expectation(gamma)  # batchD x K
        expElogtheta = n.exp(Elogtheta)

        # create a new_lambda to store the stats for this batch
        new_lambda = DirichletWords(self._K, sanity_check=self.sanity_check)

        # Now, for each document d update that document's gamma and phi
        it = 0
        meanchange = 0
        for d in range(0, batchD):
            if d % 10 == 0:
                print 'Updating gamma and phi for document %d in batch' % d
            # These are mostly just shorthand (but might help cache locality)
            ids = wordids[d]
            cts = wordcts[d]
            gammad = gamma[d, :]
            Elogthetad = Elogtheta[d, :]  # K x 1
            expElogthetad = expElogtheta[d, :]  # K x 1 for this document.
            # make sure exp/Elogbeta is initialized for all the needed indices.
            self.Elogbeta_sizecheck(ids)
            # dims(expElogbetad) = K x len(doc_vocab)
            expElogbetad = self._expElogbeta[:, ids]
            # The optimal phi_{dwk} is proportional to
            # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
            phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100

            # Iterate between gamma and phi until convergence
            for it in range(0, 100):
                lastgamma = gammad
                # In these steps, phi is represented implicitly to save memory
                # and time.  Substituting the value of the optimal phi back
                # into the update for gamma gives this update. Cf. Lee&Seung
                # 2001.
                gammad = self._alpha + expElogthetad * \
                    n.dot(cts / phinorm, expElogbetad.T)
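                # Written out, this is the standard LDA variational update
                #   gamma_dk = alpha + sum_w n_dw * phi_dwk,
                # where phi_dwk is proportional to
                #   exp(Elogtheta_dk) * exp(Elogbeta_kw);
                # the dot product with cts / phinorm folds the normalized phi
                # and the word counts into a single length-K vector.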
                Elogthetad = dirichlet_expectation(gammad)
                expElogthetad = n.exp(Elogthetad)
                phinorm = n.dot(expElogthetad, expElogbetad) + 1e-100
                # If gamma hasn't changed much, we're done.
                meanchange = n.mean(abs(gammad - lastgamma))
                # meanchangethresh is assumed to be a module-level constant.
                if meanchange < meanchangethresh:
                    break
            gamma[d, :] = gammad
            # Contribution of document d to the expected sufficient
            # statistics for the M step. Only the words in the ids list are
            # updated, with their respective counts in cts (also a list),
            # using the multiplying factor taken from self._expElogbeta.
            # lambda_stats is essentially phi multiplied by the word counts,
            # i.e. lambda_stats_wk = n_dw * phi_dwk. The sum over documents
            # shown in equation (5) accumulates as each document is iterated
            # over.

            # lambda_stats is K x len(ids), while the actual word ids can be
            # any integer, so we need a way to map word ids to their
            # lambda_stats (i.e. we can't just index into the lambda_stats
            # array using the wordid, because it may be out of range). So we
            # create lambda_data, a list of len(ids) 2-tuples: the first item
            # of each tuple is the wordid, and the second is a numpy array
            # holding the per-topic statistics for that word.

            lambda_stats = n.outer(expElogthetad.T,
                                   cts / phinorm) * expElogbetad
            lambda_data = zip(ids, lambda_stats.T)
            for wordid, stats in lambda_data:
                word = self._lambda.dictionary(wordid)
                for topic in xrange(self._K):
                    stats_wk = stats[topic]
                    new_lambda.update_count(word, topic, stats_wk)

        return (gamma, new_lambda)
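
The dirichlet_expectation helper called above is not shown in this excerpt. A minimal sketch of the standard computation it presumably performs, E[log theta] for theta ~ Dirichlet(alpha), assuming the same numpy-as-n convention used above:

    import numpy as n
    from scipy.special import psi

    def dirichlet_expectation(alpha):
        # For theta ~ Dir(alpha), E[log theta_k] = psi(alpha_k) - psi(sum_j alpha_j).
        # 1-D input returns a vector; 2-D input is handled row-wise.
        if len(alpha.shape) == 1:
            return psi(alpha) - psi(n.sum(alpha))
        return psi(alpha) - psi(n.sum(alpha, 1))[:, n.newaxis]

A typical call site would then be (gamma, new_lambda) = model.do_e_step(docs), where gamma holds the per-document topic weights for the mini-batch and new_lambda carries the sufficient statistics consumed by the M step.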