Пример #1
0
    def test_topic_coherence(self):
        """Test LdaVowpalWabbit topic coherence.

        Trains a model on the corpus from ``get_corpus()`` with one topic per
        entry of ``TOPIC_WORDS``, then counts how many learned topics are
        dominated by words of a single known topic. Returns without
        asserting anything when ``self.vw_path`` is not set.
        """
        if not self.vw_path:  # for python 2.6
            return
        corpus, dictionary = get_corpus()
        lda = LdaVowpalWabbit(self.vw_path,
                              corpus=corpus,
                              passes=10,
                              chunksize=256,
                              id2word=dictionary,
                              cleanup_files=True,
                              alpha=0.1,
                              eta=0.1,
                              num_topics=len(TOPIC_WORDS),
                              random_seed=1)
        lda.print_topics(5, 10)

        # thresholds used by the checks below
        topn = 20          # words requested per learned topic
        min_matches = 6    # matches to one known topic => "coherent"
        min_coherent = 3   # coherent topics required for the test to pass

        # map the word set of each known topic to its ID
        topic_map = {}
        for i, words in enumerate(TOPIC_WORDS):
            topic_map[frozenset(words)] = i

        n_coherent = 0
        for topic_id in range(lda.num_topics):
            topic = lda.show_topic(topic_id, topn=topn)

            # the second element of each entry is used as the word
            topic_words = [w[1] for w in topic]

            # count, per known topic, how many of this topic's words it contains
            counts = defaultdict(int)
            for word in topic_words:
                for src_topic_words, src_topic_id in six.iteritems(topic_map):
                    if word in src_topic_words:
                        counts[src_topic_id] += 1

            # no word matched any known topic => best count is zero
            max_count = max(counts.values()) if counts else 0

            if max_count >= min_matches:
                n_coherent += 1

        # not 100% deterministic, but should always get 3+ coherent topics;
        # report the actual count so a failure is diagnosable
        self.assertTrue(
            n_coherent >= min_coherent,
            "expected at least %d coherent topics, got %d"
            % (min_coherent, n_coherent))
    def test_topic_coherence(self):
        """Test LdaVowpalWabbit topic coherence.

        A learned topic counts as coherent when at least 6 of the words
        returned by ``show_topic(..., topn=20)`` fall into the same known
        topic from ``TOPIC_WORDS``. The test requires 3 or more coherent
        topics and does nothing when ``self.vw_path`` is not set.
        """
        if not self.vw_path:  # for python 2.6
            return
        corpus, dictionary = get_corpus()
        lda = LdaVowpalWabbit(self.vw_path,
                              corpus=corpus,
                              passes=10,
                              chunksize=256,
                              id2word=dictionary,
                              cleanup_files=True,
                              alpha=0.1,
                              eta=0.1,
                              num_topics=len(TOPIC_WORDS),
                              random_seed=1)
        lda.print_topics(5, 10)

        # known topic word set -> known topic ID
        known_topics = dict(
            (frozenset(words), i) for i, words in enumerate(TOPIC_WORDS))

        coherent = 0
        for topic_id in range(lda.num_topics):
            # tally how often each known topic is hit by this topic's words
            hits = defaultdict(int)
            for entry in lda.show_topic(topic_id, topn=20):
                word = entry[1]  # second element of each entry is the word
                for known_words, known_id in six.iteritems(known_topics):
                    if word in known_words:
                        hits[known_id] += 1

            # an empty tally means no word matched any known topic
            best = max(six.itervalues(hits)) if hits else 0
            if best >= 6:
                coherent += 1

        # not 100% deterministic, but should always get 3+ coherent topics;
        # the message exposes the observed count when the check fails
        self.assertTrue(
            coherent >= 3,
            "only %d of %d topics were coherent (need 3+)"
            % (coherent, lda.num_topics))