def load_corpus(self, corpus_dir):
        """Load corpus from a given directory, then initialize the documents
        and model.
        Line format: token1 \t token2 \t token3 \t ...
        """
        self.documents = []
        rand = random.Random()

        logging.info('Load corpus from %s.' % corpus_dir)
        for root, dirs, files in os.walk(corpus_dir):
            for f in files:
                filename = os.path.join(root, f)
                logging.info('Load filename %s.' % filename)
                with open(filename, 'r') as fp:
                    for doc_str in fp:
                        doc_str = doc_str.decode('gbk')
                        doc_tokens = doc_str.strip().split('\t')
                        # Skip documents that are too short to be useful.
                        if len(doc_tokens) < 2:
                            continue
                        document = Document(self.model.num_topics)
                        document.parse_from_tokens(doc_tokens, rand,
                                                   self.vocabulary)
                        if document.num_words() < 2:
                            continue
                        self.documents.append(document)

        logging.info('The document number is %d.' % len(self.documents))
        self._initialize_model()

        self._compute_smoothing_only_bucket()
        self._initialize_topic_word_coefficient()
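A minimal sketch of producing a corpus file in the line format load_corpus
expects: one document per line, tokens separated by tabs, GBK-encoded. The
directory layout, file name, and tokens here are illustrative assumptions,
not taken from the project.

    import os

    def write_sample_corpus(corpus_dir):
        """Write one GBK-encoded corpus file: token1 \t token2 \t ..."""
        if not os.path.exists(corpus_dir):
            os.makedirs(corpus_dir)
        docs = [['macbook', 'ipad', 'chrome'],
                ['nokia', 'ipad', 'mac os x']]
        with open(os.path.join(corpus_dir, 'part-00000'), 'w') as fp:
            for doc_tokens in docs:
                fp.write('\t'.join(doc_tokens).encode('gbk') + '\n')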
Example #2
    def _inference_one_chain(self, doc_tokens, rand):
        """Inference topics with one markov chain.

        Returns the sparse topics p(z|d).
        """
        document = Document(self.model.num_topics)
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        if document.num_words() == 0:
            return dict()

        accumulated_topic_hist = {}
        for i in xrange(self.total_iterations):
            # one iteration
            for word in document.words:
                # Remove the word's current topic assignment before resampling.
                document.decrease_topic(word.topic, 1)

                new_topic = self._sample_word_topic(document, word.id, rand)
                assert new_topic is not None
                word.topic = new_topic
                # Count the newly sampled topic assignment.
                document.increase_topic(new_topic, 1)

            # Accumulate the document's topic histogram only after burn-in.
            if i >= self.burn_in_iterations:
                for non_zero in document.doc_topic_hist.non_zeros:
                    accumulated_topic_hist[non_zero.topic] = (
                        accumulated_topic_hist.get(non_zero.topic, 0)
                        + non_zero.count)

        topic_dist = self._l1normalize_distribution(accumulated_topic_hist)
        return topic_dist
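The helper _l1normalize_distribution is called above but not shown. A minimal
sketch of what it presumably does (an assumption, not project code) is to
scale the accumulated counts so the sparse histogram sums to one, giving the
returned p(z|d):

    def _l1normalize_distribution(self, topic_hist):
        """L1-normalize a {topic: count} dict into {topic: probability}."""
        total = sum(topic_hist.itervalues())
        if total == 0:
            return {}
        return dict((topic, count / float(total))
                    for topic, count in topic_hist.iteritems())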
Example #3
    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',  # exist in both vocabulary and model
                      'mac os x', 'chrome',  # exist only in vocabulary
                      'nokia', 'null']  # nonexistent
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        self.assertEqual(-14.113955684239654,
                         self.model_evaluator.compute_loglikelihood(documents))
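Because _inference_one_chain runs a single Markov chain, a natural companion
(a hedged sketch under the same assumptions, not shown in these snippets) is
a wrapper that averages p(z|d) over several independent chains to reduce
sampling noise:

    def inference(self, doc_tokens, num_chains=5):
        """Average the sparse topic distribution over independent chains."""
        rand = random.Random()
        accumulated = {}
        for _ in xrange(num_chains):
            chain_dist = self._inference_one_chain(doc_tokens, rand)
            for topic, prob in chain_dist.iteritems():
                accumulated[topic] = accumulated.get(topic, 0.0) + prob
        return dict((topic, prob / num_chains)
                    for topic, prob in accumulated.iteritems())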