def __init__(self, data=None, num_topics=100, alpha=0.01, eta=0.01, tau0=1.0,
             kappa=0.9, conv_infer=0.0001, iter_infer=50, lda_model=None):
    """
    Arguments:
        num_terms: Number of unique terms in the corpus (length of the vocabulary).
        num_topics: Number of topics shared by the whole corpus.
        alpha: Hyperparameter for prior on topic mixture theta.
        eta: Hyperparameter for prior on topics beta.
        tau0: A (positive) learning parameter that downweights early iterations.
        kappa: Learning rate: exponential decay rate should be between (0.5, 1.0] to
            guarantee asymptotic convergence.
        conv_infer: Convergence threshold for the local (E step) inference.
        iter_infer: Maximum number of iterations of the local inference.
    """
    super(OnlineVB, self).__init__(data, num_topics, lda_model)
    self.num_docs = 0
    self._alpha = alpha
    self._eta = eta
    self._tau0 = tau0
    self._kappa = kappa
    self._updatect = 1
    self._conv_infer = conv_infer
    self._iter_infer = iter_infer

    if self.data is not None or self.lda_model is not None:
        if self.data is not None:
            self.num_terms = data.get_num_terms()
        if self.lda_model is not None:
            self.num_topics, self.num_terms = self.lda_model.model.shape
        else:
            # Initialize the variational distribution q(beta|lambda)
            self.lda_model = LdaModel(self.num_terms, num_topics, 1)
        self._Elogbeta = dirichlet_expectation(self.lda_model.model)
        self._expElogbeta = n.exp(self._Elogbeta)
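# For reference, dirichlet_expectation(lambda_) above computes E[log beta | lambda]
# row-wise. A minimal numpy sketch (an illustration, not this module's actual
# implementation; it assumes numpy as np and scipy.special.psi are available):
#
#   from scipy.special import psi
#   def dirichlet_expectation(alpha):
#       if len(alpha.shape) == 1:
#           return psi(alpha) - psi(np.sum(alpha))
#       return psi(alpha) - psi(np.sum(alpha, axis=1))[:, np.newaxis]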
def __init__(self, data=None, num_topics=100, alpha=0.01, eta=0.01, iter_infer=50,
             lda_model=None):
    """
    Arguments:
        num_terms: Number of unique terms in the corpus (length of the vocabulary).
        num_topics: Number of topics shared by the whole corpus.
        alpha: Hyperparameter for prior on topic mixture theta.
        eta: Hyperparameter for prior on topics beta.
        iter_infer: Number of iterations of the OPE inference algorithm.
    """
    super(StreamingOPE, self).__init__(data, num_topics, lda_model)
    self.num_topics = num_topics
    self.alpha = alpha
    self.eta = eta
    self.INF_MAX_ITER = iter_infer

    if self.data is not None or self.lda_model is not None:
        if self.data is not None:
            self.num_terms = data.get_num_terms()
        if self.lda_model is not None:
            self.num_topics, self.num_terms = self.lda_model.model.shape
        else:
            # Initialize lambda (variational parameters of topics beta).
            # beta_norm stores the sum of the elements in each row of _lambda.
            self.lda_model = LdaModel(self.num_terms, num_topics)
        self.beta_norm = self.lda_model.model.sum(axis=1)
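# Illustrative construction (a sketch, not part of the library): StreamingOPE can
# be built either from a data object that provides get_num_terms(), or from an
# existing LdaModel whose .model array has shape (num_topics, num_terms). The
# names corpus_data, num_terms and num_topics below are placeholders:
#
#   learner = StreamingOPE(data=corpus_data, num_topics=100, alpha=0.01, eta=0.01)
#   # or, starting from a previously built model:
#   learner = StreamingOPE(lda_model=LdaModel(num_terms, num_topics), alpha=0.01)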
def __init__(self, data=None, num_topics=100, alpha=0.01, eta=0.01, tau_phi=1.0,
             kappa_phi=0.9, s_phi=1.0, tau_theta=10.0, kappa_theta=0.9, s_theta=1.0,
             burn_in=25, lda_model=None):
    """
    Args:
        num_tokens: Total number of tokens in the corpus.
        num_terms: Number of unique terms in the corpus (length of the vocabulary).
        num_topics: Number of topics shared by the whole corpus.
        alpha: Hyperparameter for prior on topic mixture theta.
        eta: Hyperparameter for prior on topics beta.
        tau_phi, kappa_phi, s_phi: Step-size parameters for the topic (word-topic) statistics.
        tau_theta, kappa_theta, s_theta: Step-size parameters for the document (topic mixture) statistics.
        burn_in: Number of burn-in sweeps per document.
        lda_model: An existing LdaModel to continue learning from (optional).
    """
    super(OnlineCVB0, self).__init__(data, num_topics, lda_model)
    self.num_topics = num_topics
    self.alpha = alpha
    self.eta = eta
    self.eta_sum = num_topics * eta
    self.tau_phi = tau_phi
    self.kappa_phi = kappa_phi
    self.s_phi = s_phi
    self.tau_theta = tau_theta
    self.kappa_theta = kappa_theta
    self.s_theta = s_theta
    self.burn_in = burn_in
    self.updatect = 1

    if self.data is not None or self.lda_model is not None:
        if self.data is not None:
            self.num_tokens = data.get_num_tokens()
            self.num_terms = data.get_num_terms()
        if self.lda_model is not None:
            self.num_topics, self.num_terms = self.lda_model.model.shape
        else:
            # self.N_phi = np.random.rand(num_topics, num_terms)
            # N_phi is replaced by the lda model
            self.lda_model = LdaModel(self.num_terms, self.num_topics)
        self.N_Z = self.lda_model.model.sum(axis=1)
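# A note on the step-size parameters above (an assumption about how they are used,
# based on standard stochastic CVB0, not taken from this class): at update t the
# two step sizes are typically annealed as
#
#   rho_phi   = s_phi   / (tau_phi   + t) ** kappa_phi
#   rho_theta = s_theta / (tau_theta + t) ** kappa_theta
#
# which mirrors the rhot = (tau0 + t) ** -kappa schedule used by the other learners.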
def __init__(self, data=None, num_topics=100, eta=0.01, tau0=1.0, kappa=0.9,
             iter_infer=50, lda_model=None):
    """
    Arguments:
        num_docs: Number of documents in the corpus.
        num_terms: Number of unique terms in the corpus (length of the vocabulary).
        num_topics: Number of topics shared by the whole corpus.
        eta: Hyperparameter for prior on topics beta.
        tau0: A (positive) learning parameter that downweights early iterations.
        kappa: Learning rate: exponential decay rate should be between (0.5, 1.0] to
            guarantee asymptotic convergence.
        iter_infer: Number of iterations of the FW algorithm.
    """
    super(OnlineFW, self).__init__(data, num_topics, lda_model)
    self.num_docs = 0
    self.eta = eta
    self.tau0 = tau0
    self.kappa = kappa
    self.updatect = 1
    self.INF_MAX_ITER = iter_infer

    # Generate values used for initialization of the topic mixture of each document
    self.theta_init = [1e-10] * num_topics
    self.theta_vert = 1. - 1e-10 * (num_topics - 1)

    if self.data is not None or self.lda_model is not None:
        if self.data is not None:
            self.num_terms = data.get_num_terms()
        if self.lda_model is not None:
            self.num_topics, self.num_terms = self.lda_model.model.shape
        else:
            # Initialize lambda (variational parameters of topics beta).
            # beta_norm stores the sum of the elements in each row of _lambda.
            self.lda_model = LdaModel(self.num_terms, num_topics)
        self.beta_norm = self.lda_model.model.sum(axis=1)
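# Note on theta_init / theta_vert above: together they describe a point that is
# numerically a vertex of the unit simplex. During FW inference one coordinate is
# set to theta_vert while the others stay at 1e-10, so the mixture sums to one:
#
#   1e-10 * (num_topics - 1) + theta_vert == 1.0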
    # parse cmd line
    k = int(sys.argv[1])
    datafile = sys.argv[2]

    # load corpus
    if datafile.endswith(".mm"):
        corpus = corpora.MmCorpus(datafile)
        # there is no word id mapping in MM format; use word=wordId
        id2word = dict((wordId, str(wordId)) for wordId in xrange(corpus.numTerms))
    else:
        corpus = corpora.CorpusLow(datafile)
        id2word = corpus.id2word
        # corpus.saveAsBlei()

    # run parameter estimation; this is the step that takes the most time
    model = LdaModel(id2word=id2word, numTopics=k)
    model.initialize(corpus)

    # store parameters, print topics info (for sanity check)
    model.save(datafile + ".model")
    if PRINT_TOPICS:
        logging.info("printing topics (top %i words)" % PRINT_TOPICS)
        model.printTopics(numWords=PRINT_TOPICS)
        print "=" * 40

elif "infer" in program:
    # make sure we have enough cmd line parameters
    if len(sys.argv) < 3:
        print globals()["__doc__"]
        sys.exit(1)

    # parse cmd line
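# Illustrative invocation of this fragment's estimation branch (the script name is
# not shown in this excerpt, so it is left as a placeholder):
#
#   python <script>.py 100 corpus.mm
#
# i.e. sys.argv[1] is the number of topics k and sys.argv[2] is the corpus file; a
# file ending in ".mm" is read as MmCorpus, anything else via corpora.CorpusLow.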
class MLFW(LdaLearning):
    """
    Implements ML-FW for LDA as described in "Inference in topic models I:
    sparsity and trade-off".
    """

    def __init__(self, data=None, num_topics=100, tau0=1.0, kappa=0.9, iter_infer=50,
                 lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate: exponential decay rate should be between (0.5, 1.0] to
                guarantee asymptotic convergence.
            iter_infer: Number of iterations of the FW algorithm.

        Note that if you pass the same set of all documents in the corpus every time
        and set kappa=0, this class can also be used to do batch FW.
        """
        super(MLFW, self).__init__(data, num_topics, lda_model)
        self.num_topics = num_topics
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        # Generate values used for initialization of the topic mixture of each document
        self.theta_init = [1e-10] * num_topics
        self.theta_vert = 1. - 1e-10 * (num_topics - 1)

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()
            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize beta (topics)
                self.lda_model = LdaModel(self.num_terms, num_topics)
                self.lda_model.normalize()
            self.logbeta = np.log(self.lda_model.model)

    def static_online(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and wordcts, then
        uses the result of that E step to update the topics in the M step.

        Arguments:
            batch_size: Number of documents in the mini-batch.
            wordids: A list whose elements are arrays (terms), one per document.
                Each element of an array is the index in the vocabulary of a unique
                term appearing in the document.
            wordcts: A list whose elements are arrays (frequencies), one per document.
                Each element of an array says how many times the corresponding term
                in wordids appears in the document.

        Returns the time the E and M steps have taken and the list of topic mixtures
        of all documents in the mini-batch.
        """
        # E step
        start1 = time.time()
        (theta, index) = self.e_step(wordids, wordcts)
        end1 = time.time()
        # M step
        start2 = time.time()
        self.sparse_m_step(wordids, wordcts, theta, index)
        end2 = time.time()
        return (end1 - start1, end2 - start2, theta)

    def e_step(self, wordids, wordcts):
        """
        Does the E step.

        Returns the topic mixtures and the indexes of their nonzero elements for all
        documents in the mini-batch.

        Note that FW can provide a sparse solution (theta: topic mixture) when doing
        inference for each document. This means theta has few non-zero elements,
        whose indexes are stored in the list of lists 'index'.
        """
        # Declare theta (topic mixtures) of the mini-batch and the list of non-zero indexes
        batch_size = len(wordids)
        theta = np.zeros((batch_size, self.num_topics))
        index = [{} for d in range(batch_size)]

        # Do inference for each document
        for d in range(batch_size):
            (thetad, indexd) = self.infer_doc(wordids[d], wordcts[d])
            theta[d, :] = thetad
            index[d] = indexd
        return (theta, index)

    def infer_doc(self, ids, cts):
        """
        Does inference for a document using the Frank-Wolfe algorithm.

        Arguments:
            ids: an element of wordids, corresponding to a document.
            cts: an element of wordcts, corresponding to a document.

        Returns the inferred theta and the list of indexes of its non-zero elements.
        """
        # Locate cache memory
        beta = self.lda_model.model[:, ids]
        logbeta = self.logbeta[:, ids]
        nonzero = set()

        # Initialize theta to be the vertex of the unit simplex
        # with the largest value of the objective function
        theta = np.array(self.theta_init)
        f = np.dot(logbeta, cts)
        index = np.argmax(f)
        nonzero.add(index)
        theta[index] = self.theta_vert

        # x_j = sum_{k=1}^K theta_k * beta_{kj}
        x = np.copy(beta[index, :])

        # Loop
        for l in range(0, self.INF_MAX_ITER):
            # Select a vertex with the largest value of
            # the derivative of the objective function
            df = np.dot(beta, cts / x)
            index = np.argmax(df)
            nonzero.add(index)
            alpha = 2. / (l + 3)
            # Update theta
            theta *= 1 - alpha
            theta[index] += alpha
            # Update x
            beta_x = beta[index, :] - x
            x += alpha * beta_x
        return (theta, list(nonzero))

    def sparse_m_step(self, wordids, wordcts, theta, index):
        """
        Does the M step: updates the global variable beta, exploiting the sparseness
        of the solutions returned by the Frank-Wolfe algorithm in the E step as well
        as that of the wordids and wordcts lists.
        """
        # Compute the un-normalized intermediate beta:
        # \hat{beta}_{kj} = sum_{d in C_t} d_j * theta_{dk}.
        # For each document, the computation only takes the nonzero elements of
        # theta_d into consideration.
        batch_size = len(wordids)
        beta = np.zeros((self.num_topics, self.num_terms)) + 1e-100
        for d in range(batch_size):
            for i in index[d]:
                beta[i, wordids[d]] += theta[d, i] * wordcts[d]

        # Check for nonzero columns in the intermediate beta matrix above. The
        # documents in the mini-batch usually contain only a small fraction of the
        # vocabulary, so the intermediate beta matrix may have many zero columns.
        ids = list()
        for j in range(self.num_terms):
            if (sum(beta[:, j]) != 0):
                ids.append(j)

        # Normalize the intermediate beta
        for k in range(self.num_topics):
            if sum(beta[k, ids]) == 0:
                beta[k, ids] = 0.
            else:
                beta[k, ids] /= sum(beta[k, ids])

        # Update beta
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model[:, ids] += beta[:, ids] * rhot
        self.logbeta = np.log(self.lda_model.model)
        self.updatect += 1

    def m_step(self, batch_size, wordids, wordcts, theta, index):
        """
        Does the M step: updates the global variable beta without exploiting sparseness.
        """
        # Compute the intermediate topics
        beta = np.zeros((self.num_topics, self.num_terms))
        for d in range(batch_size):
            beta[:, wordids[d]] += np.outer(theta[d, :], wordcts[d])
        # Normalize the intermediate beta ("unit lambda")
        beta_norm = beta.sum(axis=1)
        beta /= beta_norm[:, np.newaxis]
        # Update _lambda based on ML
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model += beta * rhot
        self.updatect += 1

    def infer_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
        theta, index = self.e_step(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        return param_theta
class MLCGS(LdaLearning):
    def __init__(self, data=None, num_topics=100, alpha=0.01, tau0=1.0, kappa=0.9,
                 burn_in=25, samples=25, lda_model=None):
        """
        Args:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            alpha: Hyperparameter for prior on topic mixture theta.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate: exponential decay rate should be between (0.5, 1.0] to
                guarantee asymptotic convergence.
            burn_in: Number of burn-in sweeps of the Gibbs sampler per mini-batch.
            samples: Number of sweeps used as samples after burn-in.
            lda_model: An existing LdaModel to continue learning from (optional).
        """
        super(MLCGS, self).__init__(data, num_topics, lda_model)
        self.num_topics = num_topics
        self._alpha = alpha
        self._tau0 = tau0
        self._kappa = kappa
        self.burn_in = burn_in  # burn-in
        self.samples = samples  # samples
        self._sweeps = burn_in + samples
        self.update_unit = 1. / samples
        self._update_t = 1

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()
            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize the variational distribution q(beta|lambda)
                self.lda_model = LdaModel(self.num_terms, num_topics)
                self.lda_model.normalize()

    def static_online(self, wordtks, lengths):
        # E step
        start = time.time()
        (Ndk_mean, z) = self.sample_z(wordtks, lengths)
        end1 = time.time()
        # M step
        self.update_lambda(wordtks, lengths, Ndk_mean)
        end2 = time.time()
        return (end1 - start, end2 - end1, Ndk_mean)

    def sample_z(self, wordtks, lengths):
        batch_size = len(lengths)
        batch_N = sum(lengths)
        uni_rvs = np.random.uniform(size=(batch_N) * (self._sweeps + 1))
        z = [{} for d in range(0, batch_size)]
        Ndk = np.zeros((batch_size, self.num_topics), dtype=np.uint32)
        Nkw_mean = np.zeros((self.num_topics, self.num_terms), dtype=np.float64)
        Ndk_mean = np.zeros((batch_size, self.num_topics), dtype=np.float64)
        util_funcs.sampling(Ndk, Nkw_mean, Ndk_mean, self.lda_model.model, uni_rvs, z,
                            wordtks, lengths, self._alpha, self.update_unit,
                            self.samples, self.burn_in)
        # Normalize Ndk_mean
        Ndk_mean_norm = Ndk_mean.sum(axis=1)
        for d in range(len(Ndk_mean_norm)):
            if Ndk_mean_norm[d] == 0:
                Ndk_mean[d, :] = 0
            else:
                Ndk_mean[d, :] /= Ndk_mean_norm[d]
        # Ndk_mean /= Ndk_mean_norm[:, np.newaxis]
        return Ndk_mean, z

    def update_lambda(self, wordtks, lengths, Ndk_mean):
        batch_size = len(lengths)
        _lambda = np.zeros((self.num_topics, self.num_terms))
        # Compute the unit lambda
        for d in range(batch_size):
            for j in range(lengths[d]):
                _lambda[:, wordtks[d][j]] += Ndk_mean[d]
        # Normalize _lambda
        _lambda_norm = _lambda.sum(axis=1)
        _lambda /= _lambda_norm[:, np.newaxis]
        # Update _lambda based on ML
        rhot = pow(self._tau0 + self._update_t, -self._kappa)
        self._rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model += _lambda * rhot
        self._update_t += 1

    def learn_model(self, save_statistic=False, save_model_every=0, compute_sparsity_every=0,
                    save_top_words_every=0, num_top_words=0, model_folder=None,
                    save_topic_proportions=None):
        self.data.set_output_format(DataFormat.TERM_SEQUENCE)
        super(MLCGS, self).learn_model(save_statistic=save_statistic,
                                       save_model_every=save_model_every,
                                       compute_sparsity_every=compute_sparsity_every,
                                       save_top_words_every=save_top_words_every,
                                       num_top_words=num_top_words,
                                       model_folder=model_folder,
                                       save_topic_proportions=save_topic_proportions)
        return self.lda_model

    def infer_new_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
        theta, z = self.sample_z(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        param_theta = param_theta + self._alpha
        norm = param_theta.sum(axis=1)
        theta = param_theta / norm[:, np.newaxis]
        return theta
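
if __name__ == "__main__":
    # Illustrative sketch only (not part of the library). It shows how
    # estimate_topic_proportions turns the averaged per-document topic counts
    # (Ndk_mean) produced by sample_z into smoothed topic proportions: add alpha,
    # then normalize each row. The counts below are made up, and the learner is
    # built from a toy LdaModel so that no corpus object is required; numpy as np
    # and LdaModel are assumed to be imported at the top of this module.
    num_terms, num_topics = 50, 4
    toy_model = LdaModel(num_terms, num_topics)
    toy_model.normalize()
    learner = MLCGS(num_topics=num_topics, alpha=0.1, lda_model=toy_model)

    fake_Ndk_mean = np.array([[3., 0., 1., 0.],
                              [0., 0., 0., 0.]])  # second doc: no topic assignments
    theta = learner.estimate_topic_proportions(fake_Ndk_mean)
    print("theta:\n%s" % theta)                 # alpha keeps the zero-count row valid
    print("row sums: %s" % theta.sum(axis=1))   # each row sums to 1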
class MLOPE(LdaLearning):
    """
    Implements ML-OPE for LDA as described in "Inference in topic models II:
    provably guaranteed algorithms".
    """

    def __init__(self, data=None, num_topics=100, alpha=0.01, tau0=1.0, kappa=0.9,
                 iter_infer=50, lda_model=None):
        """
        Arguments:
            num_terms: Number of unique terms in the corpus (length of the vocabulary).
            num_topics: Number of topics shared by the whole corpus.
            alpha: Hyperparameter for prior on topic mixture theta.
            tau0: A (positive) learning parameter that downweights early iterations.
            kappa: Learning rate: exponential decay rate should be between (0.5, 1.0] to
                guarantee asymptotic convergence.
            iter_infer: Number of iterations of the OPE inference algorithm.

        Note that if you pass the same set of all documents in the corpus every time
        and set kappa=0, this class can also be used to do batch OPE.
        """
        super(MLOPE, self).__init__(data, num_topics, lda_model)
        self.num_topics = num_topics
        self.alpha = alpha
        self.tau0 = tau0
        self.kappa = kappa
        self.updatect = 1
        self.INF_MAX_ITER = iter_infer

        if self.data is not None or self.lda_model is not None:
            if self.data is not None:
                self.num_terms = data.get_num_terms()
            if self.lda_model is not None:
                self.num_topics, self.num_terms = self.lda_model.model.shape
            else:
                # Initialize beta (topics)
                self.lda_model = LdaModel(self.num_terms, num_topics)
                self.lda_model.normalize()

    def static_online(self, wordids, wordcts):
        """
        First does an E step on the mini-batch given in wordids and wordcts, then
        uses the result of that E step to update the topics in the M step.

        Arguments:
            batch_size: Number of documents in the mini-batch.
            wordids: A list whose elements are arrays (terms), one per document.
                Each element of an array is the index in the vocabulary of a unique
                term appearing in the document.
            wordcts: A list whose elements are arrays (frequencies), one per document.
                Each element of an array says how many times the corresponding term
                in wordids appears in the document.

        Returns the time the E and M steps have taken and the list of topic mixtures
        of all documents in the mini-batch.
        """
        # E step
        start1 = time.time()
        theta = self.e_step(wordids, wordcts)
        end1 = time.time()
        # M step
        start2 = time.time()
        self.m_step(wordids, wordcts, theta)
        end2 = time.time()
        return (end1 - start1, end2 - start2, theta)

    def e_step(self, wordids, wordcts):
        """
        Does the E step.

        Returns the topic mixtures theta.
        """
        # Declare theta of the mini-batch
        batch_size = len(wordids)
        theta = np.zeros((batch_size, self.num_topics))
        # Inference
        for d in range(batch_size):
            thetad = self.infer_doc(wordids[d], wordcts[d])
            theta[d, :] = thetad
        return (theta)

    def infer_doc(self, ids, cts):
        """
        Does inference for a document using the Online MAP Estimation (OPE) algorithm.

        Arguments:
            ids: an element of wordids, corresponding to a document.
            cts: an element of wordcts, corresponding to a document.

        Returns the inferred theta.
        """
        # Locate cache memory
        beta = self.lda_model.model[:, ids]
        # Initialize theta randomly
        theta = np.random.rand(self.num_topics) + 1.
        theta /= sum(theta)
        # x_j = sum_{k=1}^K theta_k * beta_{kj}
        x = np.dot(theta, beta)
        # Loop
        T = [1, 0]
        for l in range(1, self.INF_MAX_ITER):
            # Pick a component function f_i uniformly
            T[np.random.randint(2)] += 1
            # Select a vertex with the largest value of
            # the derivative of the function F
            df = T[0] * np.dot(beta, cts / x) + T[1] * (self.alpha - 1) / theta
            index = np.argmax(df)
            alpha = 1.0 / (l + 1)
            # Update theta
            theta *= 1 - alpha
            theta[index] += alpha
            # Update x
            x = x + alpha * (beta[index, :] - x)
        return (theta)

    def m_step(self, wordids, wordcts, theta):
        """
        Does the M step: updates the global variable beta.
        """
        # Compute the intermediate beta, denoted as the "unit beta"
        batch_size = len(wordids)
        beta = np.zeros((self.num_topics, self.num_terms), dtype=float)
        for d in range(batch_size):
            beta[:, wordids[d]] += np.outer(theta[d], wordcts[d])
        # Check for zero columns
        beta_sum = beta.sum(axis=0)
        ids = np.where(beta_sum != 0)[0]
        unit_beta = beta[:, ids]
        # Normalize the intermediate beta
        unit_beta_norm = unit_beta.sum(axis=1)
        unit_beta /= unit_beta_norm[:, np.newaxis]
        # Update beta
        rhot = pow(self.tau0 + self.updatect, -self.kappa)
        self.rhot = rhot
        self.lda_model.model *= (1 - rhot)
        self.lda_model.model[:, ids] += unit_beta * rhot
        self.updatect += 1

    def infer_docs(self, new_corpus):
        docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
        theta = self.e_step(docs.word_ids_tks, docs.cts_lens)
        return theta

    def estimate_topic_proportions(self, param_theta):
        return param_theta
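
if __name__ == "__main__":
    # Illustrative sketch only (not part of the library): infer the topic mixture of
    # a single toy document with OPE and apply one online M step. It assumes this
    # module's imports (numpy as np, LdaModel) as used by the class above; the
    # vocabulary size and the document below are made up.
    num_terms, num_topics = 500, 8
    init_model = LdaModel(num_terms, num_topics)
    init_model.normalize()  # mirror the fresh initialization above
    learner = MLOPE(num_topics=num_topics, alpha=0.01, tau0=1.0, kappa=0.9,
                    iter_infer=50, lda_model=init_model)

    # One document in TERM_FREQUENCY format: distinct term indices and their counts.
    wordids = [np.array([1, 42, 80, 81, 400])]
    wordcts = [np.array([3, 1, 2, 2, 1])]

    e_time, m_time, theta = learner.static_online(wordids, wordcts)
    print("theta: %s (sum=%.6f)" % (theta[0], theta[0].sum()))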
def learn_model(self, save_statistic=False, save_model_every=0, compute_sparsity_every=0,
                save_top_words_every=0, num_top_words=10, model_folder=None,
                save_topic_proportions=None):
    """
    Args:
        save_statistic: If True, save timing (and sparsity) statistics at the end.
        save_model_every: Save the model every so many mini-batches (0 disables it).
        compute_sparsity_every: Compute document sparsity every so many mini-batches (0 disables it).
        save_top_words_every: Save the top words of every topic every so many mini-batches (0 disables it).
        num_top_words: Number of top words to save per topic.
        model_folder: Folder in which models, top words and statistics are stored.
        save_topic_proportions: If given, store per-document topic proportions in this database.

    Returns:
        The learned LdaModel.
    """
    mini_batch_no = 0
    # Create model_folder
    if model_folder is not None:
        if not os.path.exists(model_folder):
            os.mkdir(model_folder)
    if save_topic_proportions is not None:
        self.data.init_database(save_topic_proportions)

    logger.info("Start learning LDA model")
    # Iterate over mini-batches
    while not self.data.check_end_of_data():
        mini_batch = self.data.load_mini_batch()

        # This is used for the streaming methods: grow the model when new terms appear
        if self.num_terms != self.data.get_num_terms():
            self.num_terms = self.data.get_num_terms()
            new_model = LdaModel(self.num_terms, self.num_topics, random_type=1)
            new_model.model[:, :self.lda_model.model.shape[1]] = self.lda_model.model
            self.lda_model = new_model

        # Run the expectation-maximization algorithm
        time_e, time_m, param_theta = self.static_online(mini_batch.word_ids_tks,
                                                         mini_batch.cts_lens)
        theta = self.estimate_topic_proportions(param_theta)
        if save_topic_proportions is not None:
            self.data.store_topic_proportions(theta)
        self.lda_model.presence_score += theta.sum(axis=0)
        del theta
        self.statistics.record_time(time_e, time_m)

        # Compute document sparsity
        if compute_sparsity_every > 0 and (self.data.mini_batch_no % compute_sparsity_every) == 0:
            sparsity = utilizies.compute_sparsity(param_theta, param_theta.shape[0],
                                                  param_theta.shape[1], 't')
            self.statistics.record_sparsity(sparsity)

        # Save model: lambda, beta, N_phi
        if save_model_every > 0 and (self.data.mini_batch_no % save_model_every) == 0:
            model_file = model_folder + '/model_batch' + str(mini_batch_no) + '.txt'
            self.lda_model.save(model_file)

        # Save top words
        if save_top_words_every > 0 and (self.data.mini_batch_no % save_top_words_every) == 0:
            top_words_file = model_folder + '/top_words_batch_' + str(mini_batch_no) + '.txt'
            self.lda_model.print_top_words(num_top_words, vocab_file=self.data.vocab_file,
                                           display_result=top_words_file)

        if self.data.end_of_file and not self.data.check_end_of_data():
            self.lda_model.presence_score *= 0

        mini_batch_no += 1

    # Save learning statistics
    if save_statistic:
        time_file = model_folder + '/time' + str(self.data.mini_batch_no) + '.csv'
        self.statistics.save_time(time_file)
        if compute_sparsity_every > 0:
            sparsity_file = model_folder + '/sparsity' + str(self.data.mini_batch_no) + '.csv'
            self.statistics.save_sparsity(sparsity_file)

    # Finish
    logger.info('Finish training!!!')
    return self.lda_model
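# Illustrative call pattern for this training loop (the data-loading step is
# library-specific and only sketched here; 'corpus_data' is a placeholder for an
# object providing load_mini_batch(), get_num_terms(), check_end_of_data(), etc.
# as used above):
#
#   learner = MLOPE(data=corpus_data, num_topics=100, alpha=0.01)
#   model = learner.learn_model(save_model_every=10,
#                               save_top_words_every=10, num_top_words=10,
#                               model_folder='models')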