def fit(self, dtm):
    '''
    Train the model on dtm, one mini-batch at a time.

    For every mini-batch the documents are split across
    self.num_threads threads, each running self._worker_estep with its
    own slice of the temporary topics buffer. The per-thread buffers
    are then summed and handed to ovi_cython.m_step, which updates the
    topics.

    Parameters
    ----------
    dtm : 2-D array-like exposing .shape and row indexing
        Matrix with one row per document and one column per word
        (presumably a document-term count matrix; confirm with callers).

    Side effects
    ------------
    Sets self.topics (num_topics x num_words), self.gamma
    (num_docs x num_topics) and self.perplexity_train (the value
    returned by Evaluation._log_likelihood on the training data).
    '''
    # Initialisation
    num_docs, num_words = dtm.shape
    topics = np.random.gamma(100., 1./100., (self.num_topics, num_words))
    gamma = np.ones((num_docs, self.num_topics))
    ExpELogBeta = np.zeros((self.num_topics, num_words))
    # One temporary topics buffer per thread so that no two workers are
    # handed the same slice.
    topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

    # Floor division: plain "/" yields a float on Python 3, which both
    # np.array_split and range() reject. Clamp to 1 so that a corpus
    # smaller than batch_size is processed as a single batch instead of
    # failing on a zero-section split.
    num_batch = max(1, int(num_docs // self.batch_size))
    batches = np.array_split(
        np.arange(num_docs, dtype=np.int32), num_batch)

    for it_batch, batch in enumerate(batches):
        # Refresh ExpELogBeta from the current topics before the E-step.
        ovi_cython.exp_digamma2d(topics, ExpELogBeta)

        docs_thread = np.array_split(batch, self.num_threads)

        # E-step: one thread per chunk of the mini-batch
        threads = [
            threading.Thread(
                target=self._worker_estep,
                args=(docs_thread[tid], dtm, topics_int[tid, :, :],
                      gamma, ExpELogBeta))
            for tid in range(self.num_threads)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        # Synchronizing the topics_int: sum over the thread axis
        topics_int_tot = np.sum(topics_int, axis=0)
        # Reset the per-thread buffers in place for the next batch
        topics_int[:, :, :] = 0

        # M-step: indices flags (1/0) the words whose column sum over the
        # current batch is positive.
        indices = (np.sum(dtm[batch, :], axis=0) > 0).astype(np.int32)
        ovi_cython.m_step(topics, topics_int_tot, indices, num_docs,
                          self.batch_size, self.tau, self.kappa, it_batch)

    self.topics = topics
    self.gamma = gamma

    # Evaluate the trained model on the train data
    self.perplexity_train = Evaluation._log_likelihood(self, gamma, dtm)
def transform(self, dtm):
    '''
    Transform dtm into gamma according to the previously trained model.

    The documents are processed in mini-batches; within each batch the
    documents are split across self.num_threads threads that run
    self._worker_estep against the stored self.topics. The topics are
    not updated here.

    Parameters
    ----------
    dtm : 2-D array-like exposing .shape
        Matrix with one row per document and one column per word; it is
        passed unchanged to self._worker_estep.

    Returns
    -------
    numpy.ndarray of shape (num_docs, self.num_topics)
        The gamma array, initialised to ones and handed to the workers
        (presumably filled in place by them; confirm in _worker_estep).

    Raises
    ------
    NameError
        If self.topics is None, i.e. the model has not been trained.
    '''
    if self.topics is None:
        raise NameError('The model has not been trained yet')

    # Initialisation
    num_docs, num_words = dtm.shape
    # NOTE: this reseeds NumPy's *global* random generator, which also
    # affects any caller relying on np.random afterwards.
    np.random.seed(0)
    gamma = np.ones((num_docs, self.num_topics))
    ExpELogBeta = np.zeros((self.num_topics, num_words))
    # Scratch buffers required by the worker signature; their content is
    # not read back in this method.
    topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

    # Floor division: plain "/" yields a float on Python 3, which both
    # np.array_split and range() reject. Clamp to 1 so that an input
    # smaller than batch_size is processed as a single batch instead of
    # failing on a zero-section split.
    num_batch = max(1, int(num_docs // self.batch_size))
    batches = np.array_split(
        np.arange(num_docs, dtype=np.int32), num_batch)

    for batch in batches:
        ovi_cython.exp_digamma2d(self.topics, ExpELogBeta)

        docs_thread = np.array_split(batch, self.num_threads)

        # One thread per chunk of the mini-batch
        threads = [
            threading.Thread(
                target=self._worker_estep,
                args=(docs_thread[tid], dtm, topics_int[tid, :, :],
                      gamma, ExpELogBeta))
            for tid in range(self.num_threads)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    return gamma