def test_topic_coherence(self):
    """Check that LdaVowpalWabbit produces coherent topics.

    Trains a model with one topic per entry of ``TOPIC_WORDS`` and, for
    each learned topic, counts how many of its top-20 words fall into each
    known word group. A learned topic is treated as coherent when at least
    6 of those hits land in the same known group; the test requires at
    least 3 coherent topics.
    """
    if not self.vw_path:
        # for python 2.6: return quietly when no vw_path is configured
        return

    corpus, dictionary = get_corpus()
    model = LdaVowpalWabbit(
        self.vw_path,
        corpus=corpus,
        passes=10,
        chunksize=256,
        id2word=dictionary,
        cleanup_files=True,
        alpha=0.1,
        eta=0.1,
        num_topics=len(TOPIC_WORDS),
        random_seed=1)
    model.print_topics(5, 10)

    # known word group -> index of that group in TOPIC_WORDS
    known_groups = dict(
        (frozenset(group), index) for index, group in enumerate(TOPIC_WORDS))

    coherent_total = 0
    for learned_id in range(model.num_topics):
        entries = model.show_topic(learned_id, topn=20)

        # second element of each entry is taken to be the word
        learned_words = [entry[1] for entry in entries]

        # tally, per known group, how many learned words belong to it
        hits = defaultdict(int)
        for term in learned_words:
            for group_words, group_id in six.iteritems(known_groups):
                if term in group_words:
                    hits[group_id] += 1

        # largest tally for any single known group (0 when nothing matched)
        best = max(six.itervalues(hits)) if hits else 0

        # 6 or more hits in one known group marks the topic as coherent
        if best >= 6:
            coherent_total += 1

    # not 100% deterministic, but should always get 3+ coherent topics
    self.assertTrue(coherent_total >= 3)
def test_topic_coherence(self):
    """Check that LdaVowpalWabbit produces coherent topics.

    Trains a model with one topic per entry of ``TOPIC_WORDS`` and, for
    each learned topic, counts how many of its top-20 words fall into each
    known word group. A learned topic is treated as coherent when at least
    6 of those hits land in the same known group; the test requires at
    least 3 coherent topics.
    """
    if not self.vw_path:
        # for python 2.6: return quietly when no vw_path is configured
        return

    corpus, dictionary = get_corpus()
    model = LdaVowpalWabbit(
        self.vw_path,
        corpus=corpus,
        passes=10,
        chunksize=256,
        id2word=dictionary,
        cleanup_files=True,
        alpha=0.1,
        eta=0.1,
        num_topics=len(TOPIC_WORDS),
        random_seed=1)
    model.print_topics(5, 10)

    # known word group -> index of that group in TOPIC_WORDS
    known_groups = dict(
        (frozenset(group), index) for index, group in enumerate(TOPIC_WORDS))

    coherent_total = 0
    for learned_id in range(model.num_topics):
        entries = model.show_topic(learned_id, topn=20)

        # second element of each entry is taken to be the word
        learned_words = [entry[1] for entry in entries]

        # tally, per known group, how many learned words belong to it
        hits = defaultdict(int)
        for term in learned_words:
            for group_words, group_id in six.iteritems(known_groups):
                if term in group_words:
                    hits[group_id] += 1

        # largest tally for any single known group (0 when nothing matched)
        best = max(six.itervalues(hits)) if hits else 0

        # 6 or more hits in one known group marks the topic as coherent
        if best >= 6:
            coherent_total += 1

    # not 100% deterministic, but should always get 3+ coherent topics
    self.assertTrue(coherent_total >= 3)