Example No. 1
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 eta=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 conv_infer=0.0001,
                 iter_infer=50,
                 lda_model=None):
        super(OnlineVB, self).__init__(data, num_topics, lda_model)

        self.num_docs = 0
        self._alpha = alpha
        self._eta = eta
        self._tau0 = tau0
        self._kappa = kappa
        self._updatect = 1
        self._conv_infer = conv_infer
        self._iter_infer = iter_infer

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize the variational distribution q(beta|lambda)
                self.lda_model = LdaModel(self.num_terms, num_topics, 1)
            self._Elogbeta = dirichlet_expectation(self.lda_model.model)
            self._expElogbeta = n.exp(self._Elogbeta)
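The helper dirichlet_expectation used on the last line is not shown in this snippet (nor is the alias n, presumably `import numpy as n`). A minimal sketch, assuming the conventional definition E[log beta | lambda] = psi(lambda) - psi(sum(lambda)) used in Hoffman-style online VB code:

import numpy as n
from scipy.special import psi  # digamma function

def dirichlet_expectation(alpha):
    # For a vector of Dirichlet parameters, E[log theta] = psi(alpha) - psi(sum(alpha));
    # for a matrix, the expectation is taken row by row.
    if len(alpha.shape) == 1:
        return psi(alpha) - psi(n.sum(alpha))
    return psi(alpha) - psi(n.sum(alpha, axis=1))[:, n.newaxis]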
Example No. 2
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 eta=0.01,
                 iter_infer=50,
                 lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            alpha: Hyperparameter for prior on topic mixture theta.
            eta: Hyperparameter for prior on topics beta.
            iter_infer: Number of iterations of the OPE algorithm.
        """
        super(StreamingOPE, self).__init__(data, num_topics, lda_model)

        self.num_topics = num_topics
        self.alpha = alpha
        self.eta = eta
        self.INF_MAX_ITER = iter_infer

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize lambda (the variational parameters of the topics beta).
                # beta_norm stores, for each topic, the sum of the elements in the
                # corresponding row of the model.
                self.lda_model = LdaModel(self.num_terms, num_topics)
            self.beta_norm = self.lda_model.model.sum(axis=1)
Example No. 3
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 eta=0.01,
                 tau_phi=1.0,
                 kappa_phi=0.9,
                 s_phi=1.0,
                 tau_theta=10.0,
                 kappa_theta=0.9,
                 s_theta=1.0,
                 burn_in=25,
                 lda_model=None):
        """

        Args:
            num_tokens:
            num_terms:
            num_topics:
            alpha:
            eta:
            tau_phi:
            kappa_phi:
            s_phi:
            tau_theta:
            kappa_theta:
            s_theta:
            burn_in:
            lda_model:
        """
        super(OnlineCVB0, self).__init__(data, num_topics, lda_model)

        self.num_topics = num_topics
        self.alpha = alpha
        self.eta = eta
        self.eta_sum = num_topics * eta
        self.tau_phi = tau_phi
        self.kappa_phi = kappa_phi
        self.s_phi = s_phi
        self.tau_theta = tau_theta
        self.kappa_theta = kappa_theta
        self.s_theta = s_theta
        self.burn_in = burn_in
        self.updatect = 1

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_tokens = data.get_num_tokens()
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # self.N_phi = np.random.rand(num_topics, num_terms)
                # replace N_phi with lda model
                self.lda_model = LdaModel(self.num_terms, self.num_topics)
            self.N_Z = self.lda_model.model.sum(axis=1)
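The step-size parameters above (tau_phi, kappa_phi, s_phi and their theta counterparts) are not used in this fragment; their names match the usual stochastic CVB0 step-size schedule rho_t = s / (tau + t)**kappa. A sketch of that schedule, under the assumption that this class follows the standard formulation:

def step_size(t, s, tau, kappa):
    # Hypothetical helper: decaying step size rho_t = s / (tau + t)**kappa.
    return s / (tau + t) ** kappa

# With the defaults above (s_phi=1.0, tau_phi=1.0, kappa_phi=0.9):
# step_size(1, 1.0, 1.0, 0.9)   -> ~0.536
# step_size(10, 1.0, 1.0, 0.9)  -> ~0.116
# step_size(100, 1.0, 1.0, 0.9) -> ~0.016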
Example No. 4
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 burn_in=25,
                 samples=25,
                 lda_model=None):
        """

        Args:
            num_terms:
            num_topics:
            alpha:
            tau0:
            kappa:
            burn_in:
            samples:
            lda_model:
        """
        super(MLCGS, self).__init__(data, num_topics, lda_model)

        self.num_topics = num_topics
        self._alpha = alpha
        self._tau0 = tau0
        self._kappa = kappa
        self.burn_in = burn_in  # burn-in
        self.samples = samples  # samples
        self._sweeps = burn_in + samples
        self.update_unit = 1. / samples
        self._update_t = 1

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # initialize the variational distribution q(beta|lambda)
                self.lda_model = LdaModel(self.num_terms, num_topics)

            self.lda_model.normalize()
Example No. 5
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 iter_infer=50,
                 lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            alpha: Hyperparameter for prior on topic mixture theta.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate; the exponential decay rate should be in (0.5, 1.0]
                   to guarantee asymptotic convergence.
            iter_infer: Number of iterations of the OPE algorithm.

        Note that if you pass the same set of all documents in the corpus every time and
        set kappa=0 this class can also be used to do batch OPE.
        """
        super(MLOPE, self).__init__(data, num_topics, lda_model)
        self.num_topics = num_topics
        self.alpha = alpha
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize beta (topics)
                self.lda_model = LdaModel(self.num_terms, num_topics)
            self.lda_model.normalize()
Example No. 6
    def __init__(self,
                 data=None,
                 num_topics=100,
                 eta=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 iter_infer=50,
                 lda_model=None):
        """
        Arguments:
            num_docs: Number of documents in the corpus.
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            eta: Hyperparameter for prior on topics beta.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate; the exponential decay rate should be in (0.5, 1.0]
                   to guarantee asymptotic convergence.
            iter_infer: Number of iterations of FW algorithm.
        """
        super(OnlineFW, self).__init__(data, num_topics, lda_model)

        self.num_docs = 0
        self.eta = eta
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        # Generate values used for initialization of the topic mixture of each document
        self.theta_init = [1e-10] * num_topics
        self.theta_vert = 1. - 1e-10 * (num_topics - 1)

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize lambda (the variational parameters of the topics beta).
                # beta_norm stores, for each topic, the sum of the elements in the
                # corresponding row of the model.
                self.lda_model = LdaModel(self.num_terms, num_topics)
            self.beta_norm = self.lda_model.model.sum(axis=1)
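The two lines defining theta_init and theta_vert above encode a vertex of the unit simplex: every coordinate is 1e-10 except one, which gets 1 - 1e-10*(num_topics - 1) so that the entries sum to exactly 1. A small standalone check:

import numpy as np

num_topics = 100
theta = np.array([1e-10] * num_topics)
theta[0] = 1. - 1e-10 * (num_topics - 1)   # place the remaining mass on one vertex
print(np.isclose(theta.sum(), 1.0))        # True: theta lies on the unit simplex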
Example No. 7
        # parse cmd line
        k = int(sys.argv[1])
        datafile = sys.argv[2]

        # load corpus
        if datafile.endswith(".mm"):
            corpus = corpora.MmCorpus(datafile)
            # there is no word id mapping in MM format; use word=wordId
            id2word = dict((wordId, str(wordId)) for wordId in range(corpus.numTerms))
        else:
            corpus = corpora.CorpusLow(datafile)
            id2word = corpus.id2word
        # corpus.saveAsBlei()
        # run parameter estimation; this is the step that takes the most time
        model = LdaModel(id2word=id2word, numTopics=k)
        model.initialize(corpus)

        # store parameters, print topics info (for sanity check)
        model.save(datafile + ".model")
        if PRINT_TOPICS:
            logging.info("printing topics (top %i words)" % PRINT_TOPICS)
            model.printTopics(numWords=PRINT_TOPICS)
            print "=" * 40
    elif "infer" in program:
        # make sure we have enough cmd line parameters
        if len(sys.argv) < 3:
            print(globals()["__doc__"])
            sys.exit(1)

        # parse cmd line
Example No. 8
class MLFW(LdaLearning):
    """
    Implements ML-FW for LDA as described in "Inference in topic models I: sparsity and trade-off". 
    """

    def __init__(self, data=None, num_topics=100, tau0=1.0, kappa=0.9, iter_infer=50, lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate; the exponential decay rate should be in (0.5, 1.0]
                   to guarantee asymptotic convergence.
            iter_infer: Number of iterations of the FW algorithm.

        Note that if you pass the same set of all documents in the corpus every time and
        set kappa=0 this class can also be used to do batch FW.
        """
        super(MLFW, self).__init__(data, num_topics, lda_model)

        self.num_topics = num_topics
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        # Generate values used for initialization of the topic mixture of each document
        self.theta_init = [1e-10] * num_topics
        self.theta_vert = 1. - 1e-10 * (num_topics - 1)

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize beta (topics)
                self.lda_model = LdaModel(self.num_terms, num_topics)

            self.lda_model.normalize()
            self.logbeta = np.log(self.lda_model.model)

    def static_online(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        topics in M step.
		
        Arguments:
        batch_size: Number of documents of the mini-batch.
        wordids: A list whose each element is an array (terms), corresponding to a document.
                 Each element of the array is index of a unique term, which appears in the document,
                 in the vocabulary.
        wordcts: A list whose each element is an array (frequency), corresponding to a document.
                 Each element of the array says how many time the corresponding term in wordids appears
                 in the document.
        Returns time the E and M steps have taken and the list of topic mixtures of all documents in the mini-batch.        		
        """
        # E step
        start1 = time.time()
        (theta, index) = self.e_step(wordids, wordcts)
        end1 = time.time()
        # M step
        start2 = time.time()
        self.sparse_m_step(wordids, wordcts, theta, index)
        end2 = time.time()
        return (end1 - start1, end2 - start2, theta)

    def e_step(self, wordids, wordcts):
        """
        Does e step 
		
        Returns topic mixtures and their nonzero elements' indexes of all documents in the mini-batch.
        
        Note that, FW can provides sparse solution (theta:topic mixture) when doing inference
        for each documents. It means that the theta have few non-zero elements whose indexes
        are stored in list of lists 'index'.		
        """
        # Declare theta (topic mixtures) of mini-batch and list of non-zero indexes
        batch_size = len(wordids)
        theta = np.zeros((batch_size, self.num_topics))
        index = [{} for d in range(batch_size)]
        # Do inference for each document
        for d in range(batch_size):
            (thetad, indexd) = self.infer_doc(wordids[d], wordcts[d])
            theta[d, :] = thetad
            index[d] = indexd
        return (theta, index)

    def infer_doc(self, ids, cts):
        """
        Does inference for a document using Frank Wolfe algorithm.
        
        Arguments:
        ids: an element of wordids, corresponding to a document.
        cts: an element of wordcts, corresponding to a document.

        Returns inferred theta and list of indexes of non-zero elements of the theta.
        """
        # Slice out the columns of beta (and log beta) for the terms in this document
        beta = self.lda_model.model[:, ids]
        logbeta = self.logbeta[:, ids]
        nonzero = set()
        # Initialize theta to be a vertex of unit simplex 
        # with the largest value of the objective function
        theta = np.array(self.theta_init)
        f = np.dot(logbeta, cts)
        index = np.argmax(f)
        nonzero.add(index)
        theta[index] = self.theta_vert
        # x_j = sum_k theta_k * beta_{kj}
        x = np.copy(beta[index, :])
        # Loop
        for l in range(0, self.INF_MAX_ITER):
            # Select the vertex with the largest derivative of the objective function
            df = np.dot(beta, cts / x)
            index = np.argmax(df)
            nonzero.add(index)
            alpha = 2. / (l + 3)
            # Update theta
            theta *= 1 - alpha
            theta[index] += alpha
            # Update x
            beta_x = beta[index, :] - x
            x += alpha * (beta_x)
        return (theta, list(nonzero))

    def sparse_m_step(self, wordids, wordcts, theta, index):
        """
        Does m step: update global variables beta, exploiting sparseness of the 
        solutions returned by Frank-Wolfe algorithm from e step as well as 
        that of wordids and wordcts lists.
        """
        # Compute un-normalized intermediate beta:  
        # \hat{beta}_{kj} = sum(over d in C_t) d_j * theta_{dk}.  
        # For each document, the computation only take nonzero elements of 
        # theta_d into consideration.
        batch_size = len(wordids)
        beta = np.zeros((self.num_topics, self.num_terms)) + 1e-100
        for d in range(batch_size):
            for i in index[d]:
                beta[i, wordids[d]] += theta[d, i] * wordcts[d]
        # Find the nonzero columns of the intermediate beta matrix above. Documents
        # in the mini-batch typically contain far fewer terms than the vocabulary,
        # so the intermediate beta matrix may have many all-zero columns.
        ids = list()
        for j in range(self.num_terms):
            if (sum(beta[:, j]) != 0):
                ids.append(j)
        # Normalize the intermediate beta
        for k in range(self.num_topics):
            if sum(beta[k, ids]) == 0:
                beta[k, ids] = 0.
            else:
                beta[k, ids] /= sum(beta[k, ids])
        # Update beta    
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model[:, ids] += beta[:, ids] * rhot
        self.logbeta = np.log(self.lda_model.model)
        self.updatect += 1

    def m_step(self, batch_size, wordids, wordcts, theta, index):
        """
        Does m step: update global variables beta without considering the sparseness.
        """
        # Compute the intermediate topics
        beta = np.zeros((self.num_topics, self.num_terms))
        for d in range(batch_size):
            beta[:, wordids[d]] += np.outer(theta[d, :], wordcts[d])
        # Normalize the intermediate beta
        beta_norm = beta.sum(axis=1)
        beta /= beta_norm[:, np.newaxis]
        # Update the model based on ML
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model += beta * rhot
        self.updatect += 1

    def infer_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
        theta, index = self.e_step(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        return param_theta
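A hypothetical usage sketch of the MLFW class above; the LdaModel constructor call and its random, strictly positive initialization are assumptions inferred from the __init__ code rather than a documented API:

import numpy as np

# Assumed: LdaModel(num_terms, num_topics) creates a (num_topics x num_terms) matrix.
model = LdaModel(500, 10)                  # 500-term vocabulary, 10 topics
learner = MLFW(num_topics=10, lda_model=model)

# One toy mini-batch of two documents in the wordids/wordcts format described above.
wordids = [np.array([0, 3, 7]), np.array([2, 3, 9])]
wordcts = [np.array([2, 1, 4]), np.array([1, 5, 2])]

time_e, time_m, theta = learner.static_online(wordids, wordcts)
print(theta.shape)                         # (2, 10): one topic mixture per document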
Example No. 9
class MLCGS(LdaLearning):
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 burn_in=25,
                 samples=25,
                 lda_model=None):
        """

        Args:
            num_terms:
            num_topics:
            alpha:
            tau0:
            kappa:
            burn_in:
            samples:
            lda_model:
        """
        super(MLCGS, self).__init__(data, num_topics, lda_model)

        self.num_topics = num_topics
        self._alpha = alpha
        self._tau0 = tau0
        self._kappa = kappa
        self.burn_in = burn_in  # burn-in
        self.samples = samples  # samples
        self._sweeps = burn_in + samples
        self.update_unit = 1. / samples
        self._update_t = 1

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # initialize the variational distribution q(beta|lambda)
                self.lda_model = LdaModel(self.num_terms, num_topics)

            self.lda_model.normalize()

    def static_online(self, wordtks, lengths):
        # E step
        start = time.time()
        (Ndk_mean, z) = self.sample_z(wordtks, lengths)
        end1 = time.time()
        # M step
        self.update_lambda(wordtks, lengths, Ndk_mean)
        end2 = time.time()
        return (end1 - start, end2 - end1, Ndk_mean)

    def sample_z(self, wordtks, lengths):
        batch_size = len(lengths)
        batch_N = sum(lengths)
        uni_rvs = np.random.uniform(size=(batch_N) * (self._sweeps + 1))
        z = [{} for d in range(0, batch_size)]
        Ndk = np.zeros((batch_size, self.num_topics), dtype=np.uint32)
        Nkw_mean = np.zeros((self.num_topics, self.num_terms),
                            dtype=np.float64)
        Ndk_mean = np.zeros((batch_size, self.num_topics), dtype=np.float64)
        util_funcs.sampling(Ndk, Nkw_mean, Ndk_mean, self.lda_model.model,
                            uni_rvs, z, wordtks, lengths, self._alpha,
                            self.update_unit, self.samples, self.burn_in)
        # normalize Ndk_mean
        Ndk_mean_norm = Ndk_mean.sum(axis=1)
        for d in range(len(Ndk_mean_norm)):
            if Ndk_mean_norm[d] == 0:
                Ndk_mean[d, :] = 0
            else:
                Ndk_mean[d, :] /= Ndk_mean_norm[d]
        #Ndk_mean /= Ndk_mean_norm[:, np.newaxis]
        return Ndk_mean, z

    def update_lambda(self, wordtks, lengths, Ndk_mean):
        batch_size = len(lengths)
        _lambda = np.zeros((self.num_topics, self.num_terms))
        # compute unit lambda
        for d in range(batch_size):
            for j in range(lengths[d]):
                _lambda[:, wordtks[d][j]] += Ndk_mean[d]
        # normalize _lambda
        _lambda_norm = _lambda.sum(axis=1)
        _lambda /= _lambda_norm[:, np.newaxis]
        # update the model based on ML
        rhot = pow(self._tau0 + self._update_t, -self._kappa)
        self._rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model += _lambda * rhot
        self._update_t += 1

    def learn_model(self,
                    save_statistic=False,
                    save_model_every=0,
                    compute_sparsity_every=0,
                    save_top_words_every=0,
                    num_top_words=0,
                    model_folder=None,
                    save_topic_proportions=None):
        self.data.set_output_format(DataFormat.TERM_SEQUENCE)
        super(MLCGS, self).learn_model(
            save_statistic=save_statistic,
            save_model_every=save_model_every,
            compute_sparsity_every=compute_sparsity_every,
            save_top_words_every=save_top_words_every,
            num_top_words=num_top_words,
            model_folder=model_folder,
            save_topic_proportions=save_topic_proportions)
        return self.lda_model

    def infer_new_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
        theta, z = self.sample_z(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        param_theta = param_theta + self._alpha
        norm = param_theta.sum(axis=1)
        theta = param_theta / norm[:, np.newaxis]
        return theta
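In update_lambda above, the new topics are a convex combination of the old model and the normalized mini-batch statistics, with weight rho_t = (tau0 + t)**(-kappa); because both terms are row-normalized, the rows of the model remain normalized after every update. A small standalone illustration of that property:

import numpy as np

old = np.array([[0.2, 0.8], [0.5, 0.5]])    # row-normalized topics
unit = np.array([[0.9, 0.1], [0.3, 0.7]])   # row-normalized mini-batch estimate

tau0, kappa, t = 1.0, 0.9, 1
rhot = pow(tau0 + t, -kappa)                # same schedule as in update_lambda
new = (1 - rhot) * old + rhot * unit
print(new.sum(axis=1))                      # [1. 1.] -- rows stay normalized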
Example No. 10
class MLOPE(LdaLearning):
    """
    Implements ML-OPE for LDA as described in "Inference in topic models II: provably guaranteed algorithms". 
    """
    def __init__(self,
                 data=None,
                 num_topics=100,
                 alpha=0.01,
                 tau0=1.0,
                 kappa=0.9,
                 iter_infer=50,
                 lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            alpha: Hyperparameter for prior on topic mixture theta.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate; the exponential decay rate should be in (0.5, 1.0]
                   to guarantee asymptotic convergence.
            iter_infer: Number of iterations of the OPE algorithm.

        Note that if you pass the same set of all documents in the corpus every time and
        set kappa=0 this class can also be used to do batch OPE.
        """
        super(MLOPE, self).__init__(data, num_topics, lda_model)
        self.num_topics = num_topics
        self.alpha = alpha
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()

            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize beta (topics)
                self.lda_model = LdaModel(self.num_terms, num_topics)
            self.lda_model.normalize()

    def static_online(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and
        wordcts, then uses the result of that E step to update the
        topics in M step.
		
        Arguments:
        batch_size: Number of documents of the mini-batch.
        wordids: A list whose each element is an array (terms), corresponding to a document.
                 Each element of the array is index of a unique term, which appears in the document,
                 in the vocabulary.
        wordcts: A list whose each element is an array (frequency), corresponding to a document.
                 Each element of the array says how many time the corresponding term in wordids appears
                 in the document.
        Returns time the E and M steps have taken and the list of topic mixtures of all documents in the mini-batch.        		
        """
        # E step
        start1 = time.time()
        theta = self.e_step(wordids, wordcts)
        end1 = time.time()
        # M step
        start2 = time.time()
        self.m_step(wordids, wordcts, theta)
        end2 = time.time()
        return (end1 - start1, end2 - start2, theta)

    def e_step(self, wordids, wordcts):
        """
        Does e step 
		
        Returns topic mixtures theta.
        """
        # Declare theta of minibatch
        batch_size = len(wordids)
        theta = np.zeros((batch_size, self.num_topics))
        # Inference
        for d in range(batch_size):
            thetad = self.infer_doc(wordids[d], wordcts[d])
            theta[d, :] = thetad
        return (theta)

    def infer_doc(self, ids, cts):
        """
        Does inference for a document using Online MAP Estimation algorithm.
        
        Arguments:
        ids: an element of wordids, corresponding to a document.
        cts: an element of wordcts, corresponding to a document.

        Returns inferred theta.
        """
        # Slice out the columns of beta for the terms in this document
        beta = self.lda_model.model[:, ids]
        # Initialize theta randomly
        theta = np.random.rand(self.num_topics) + 1.
        theta /= sum(theta)
        # x_j = sum_k theta_k * beta_{kj}
        x = np.dot(theta, beta)
        # Loop
        T = [1, 0]
        for l in range(1, self.INF_MAX_ITER):
            # Pick one of the two component functions f_i uniformly at random
            T[np.random.randint(2)] += 1
            # Select the vertex with the largest derivative of the function F
            df = T[0] * np.dot(beta, cts / x) + T[1] * (self.alpha - 1) / theta
            index = np.argmax(df)
            alpha = 1.0 / (l + 1)
            # Update theta
            theta *= 1 - alpha
            theta[index] += alpha
            # Update x
            x = x + alpha * (beta[index, :] - x)
        return (theta)

    def m_step(self, wordids, wordcts, theta):
        """
        Does m step: update global variables beta.
        """
        # Compute intermediate beta which is denoted as "unit beta"
        batch_size = len(wordids)
        beta = np.zeros((self.num_topics, self.num_terms), dtype=float)
        for d in range(batch_size):
            beta[:, wordids[d]] += np.outer(theta[d], wordcts[d])
        # Find the columns (term indices) with nonzero sums
        beta_sum = beta.sum(axis=0)
        ids = np.where(beta_sum != 0)[0]
        unit_beta = beta[:, ids]
        # Normalize the intermediate beta
        unit_beta_norm = unit_beta.sum(axis=1)
        unit_beta /= unit_beta_norm[:, np.newaxis]
        # Update beta
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model[:, ids] += unit_beta * rhot
        self.updatect += 1

    def infer_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
        theta = self.e_step(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        return param_theta
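The derivative computed in infer_doc above is the gradient of the per-document MAP objective F(theta) = sum_j d_j * log(sum_k theta_k * beta_kj) + (alpha - 1) * sum_k log(theta_k); the counters T[0] and T[1] weight those two terms. A small helper for evaluating that objective (illustrative only):

import numpy as np

def map_objective(theta, beta, cts, alpha):
    # F(theta) = sum_j d_j * log(theta . beta[:, j]) + (alpha - 1) * sum_k log(theta_k)
    x = np.dot(theta, beta)                 # probability of each observed term under theta
    return np.dot(cts, np.log(x)) + (alpha - 1) * np.sum(np.log(theta))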
Example No. 11
    def learn_model(self,
                    save_statistic=False,
                    save_model_every=0,
                    compute_sparsity_every=0,
                    save_top_words_every=0,
                    num_top_words=10,
                    model_folder=None,
                    save_topic_proportions=None):
        """

        Args:
            data:
            save_model_every:
            compute_sparsity_every:
            save_statistic:
            save_top_words_every:
            num_top_words:
            model_folder:

        Returns:

        """
        mini_batch_no = 0
        # create model_folder
        if model_folder is not None:
            if not os.path.exists(model_folder):
                os.mkdir(model_folder)
        if save_topic_proportions is not None:
            self.data.init_database(save_topic_proportions)

        logger.info("Start learning Lda model, passes over")

        # Iterating
        while not self.data.check_end_of_data():
            mini_batch = self.data.load_mini_batch()

            # Used by the streaming setting: grow the model if new terms have appeared
            if self.num_terms != self.data.get_num_terms():
                self.num_terms = self.data.get_num_terms()
                new_model = LdaModel(self.num_terms,
                                     self.num_topics,
                                     random_type=1)
                new_model.model[:, :self.lda_model.model.shape[1]] = \
                    self.lda_model.model
                self.lda_model = new_model

            # run the expectation and maximization steps
            time_e, time_m, param_theta = self.static_online(
                mini_batch.word_ids_tks, mini_batch.cts_lens)
            theta = self.estimate_topic_proportions(param_theta)
            if save_topic_proportions is not None:
                self.data.store_topic_proportions(theta)
            self.lda_model.presence_score += theta.sum(axis=0)
            del theta
            self.statistics.record_time(time_e, time_m)

            # compute documents sparsity
            if compute_sparsity_every > 0 and (self.data.mini_batch_no %
                                               compute_sparsity_every) == 0:
                sparsity = utilizies.compute_sparsity(param_theta,
                                                      param_theta.shape[0],
                                                      param_theta.shape[1],
                                                      't')
                self.statistics.record_sparsity(sparsity)

            # save model : lambda, beta, N_phi
            if save_model_every > 0 and (self.data.mini_batch_no %
                                         save_model_every) == 0:
                model_file = model_folder + '/model_batch' + str(
                    mini_batch_no) + '.txt'
                self.lda_model.save(model_file)

            # save top words
            if save_top_words_every > 0 and (self.data.mini_batch_no %
                                             save_top_words_every) == 0:
                top_words_file = model_folder + '/top_words_batch_' + str(
                    mini_batch_no) + '.txt'
                self.lda_model.print_top_words(num_top_words,
                                               vocab_file=self.data.vocab_file,
                                               display_result=top_words_file)

            if self.data.end_of_file and not self.data.check_end_of_data():
                self.lda_model.presence_score *= 0
            mini_batch_no += 1

        # save learning statistic
        if save_statistic:
            time_file = model_folder + '/time' + str(
                self.data.mini_batch_no) + '.csv'
            self.statistics.save_time(time_file)
            if compute_sparsity_every > 0:
                sparsity_file = model_folder + '/sparsity' + str(
                    self.data.mini_batch_no) + '.csv'
                self.statistics.save_sparsity(sparsity_file)
        # Finish
        logger.info('Finish training!!!')
        return self.lda_model