Example #1
    def fit(self, dtm, batch_size, tau=512, kappa=0.7):
        '''
        Parallel version of the LDA fit: the temporary topic statistics are
        computed in parallel for the documents inside each mini-batch.
        '''
        # Initialisation
        num_docs, num_words = dtm.shape
        np.random.seed(0)
        topics = np.random.gamma(100., 1. / 100., (self.num_topics, num_words))
        gamma = np.ones((num_docs, self.num_topics))
        ExpELogBeta = np.zeros((self.num_topics, num_words))
        topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

        num_batch = num_docs // batch_size
        batches = np.array_split(np.arange(num_docs, dtype=np.int32),
                                 num_batch)

        for it_batch in range(num_batch):
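            # Recompute exp(E[log beta]) from the current topic parameters
            # before dispatching the E-step workers for this mini-batch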
            lda_vi_cython.exp_digamma_arr(topics, ExpELogBeta)

            docs_thread = np.array_split(batches[it_batch], self.num_threads)

            # one E-step worker thread per slice of the mini-batch
            threads = [None] * self.num_threads

            for tid in range(self.num_threads):
                threads[tid] = threading.Thread(target=self.worker_estep,
                                                args=(docs_thread[tid], dtm,
                                                      topics_int[tid, :, :],
                                                      gamma, ExpELogBeta))
                threads[tid].start()

            for thread in threads:
                thread.join()

            # Combine the per-thread intermediate topic statistics
            topics_int_tot = np.sum(topics_int, axis=0)
            # Reset the per-thread statistics for the next mini-batch
            topics_int[:, :, :] = 0
            # M-step
            indices = (np.sum(dtm[batches[it_batch], :], axis=0) > 0).astype(
                np.int32)
            lda_vi_cython.m_step(topics, topics_int_tot, indices, num_docs,
                                 batch_size, tau, kappa, it_batch)

        self.topics = topics
        self.gamma = gamma
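A minimal usage sketch for the fit method above. The class name LdaParallel, its constructor arguments and the toy document-term matrix are illustrative assumptions; the method itself only expects an instance exposing num_topics, num_threads and a worker_estep method, plus numpy, threading and the compiled lda_vi_cython extension.

    # Illustrative usage only: LdaParallel and its constructor are assumed
    # names, not part of the original example.
    import numpy as np

    dtm = np.random.randint(0, 5, size=(1000, 2000)).astype(np.float64)

    model = LdaParallel(num_topics=10, num_threads=4)   # hypothetical class
    model.fit(dtm, batch_size=100, tau=512, kappa=0.7)

    print(model.topics.shape)   # (num_topics, num_words): variational topics
    print(model.gamma.shape)    # (num_docs, num_topics): document-topic weights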
Example #2
    def transform(self, dtm, batch_size, tau=512, kappa=0.7):
        '''
        Transform dtm into gamma according to the previously trained model.

        '''
        if self.topics is None:
            raise RuntimeError('The model has not been trained yet')
        # Initialisation
        num_docs, num_words = dtm.shape
        np.random.seed(0)
        gamma = np.ones((num_docs, self.num_topics))
        ExpELogBeta = np.zeros((self.num_topics, num_words))
        topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

        num_batch = num_docs // batch_size
        batches = np.array_split(np.arange(num_docs, dtype=np.int32),
                                 num_batch)

        for it_batch in range(num_batch):
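            # Recompute exp(E[log beta]) from the trained topics for the
            # E-step workers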
            lda_vi_cython.exp_digamma_arr(self.topics, ExpELogBeta)

            docs_thread = np.array_split(batches[it_batch], self.num_threads)

            # one E-step worker thread per slice of the mini-batch
            threads = [None] * self.num_threads

            for tid in range(self.num_threads):
                threads[tid] = threading.Thread(target=self.worker_estep,
                                                args=(docs_thread[tid], dtm,
                                                      topics_int[tid, :, :],
                                                      gamma, ExpELogBeta))
                threads[tid].start()

            for thread in threads:
                thread.join()

        return gamma
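A matching sketch for transform, continuing the assumptions above: once fit has run, the same (hypothetical) model object infers document-topic proportions for unseen documents without updating the topics.

    # Continues the illustrative sketch above (same assumed LdaParallel object).
    new_dtm = np.random.randint(0, 5, size=(200, 2000)).astype(np.float64)

    gamma_new = model.transform(new_dtm, batch_size=50, tau=512, kappa=0.7)
    print(gamma_new.shape)      # (200, num_topics)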