# Module-level imports required by this method:
#   import os
#   import numpy as np
#   from gensim.models import LdaModel
def train_lda(self, cache_path):
    print(cache_path)
    num_topics = 50

    # Build the training corpus: iterate one document at a time and convert
    # each dense bag-of-words vector to gensim's sparse format.
    trainBatchIter = BatchIterBert(self.trainDataIter, filling_last_batch=False,
                                   postProcessor=batchPostProcessor, batch_size=1)
    bow_list = []
    for item in trainBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        bow_list.append(self.bow_2_gensim(bow))
    print(len(bow_list))

    # Train LDA on the whole corpus in a single chunk. Note: gensim expects a
    # corpus of (token_id, count) documents, not a numpy array, so bow_list is
    # passed directly rather than wrapped in np.array().
    lda = LdaModel(bow_list, num_topics=num_topics, passes=200,
                   chunksize=len(bow_list),
                   id2word=self.dictProcess.common_dictionary)

    # Write the top 10 words of every topic to cache_path/ldatopic.txt,
    # one topic per line.
    output_topic_lines = []
    for topic_id in range(num_topics):
        top_words = [word for word, _ in lda.show_topic(topic_id, topn=10)]
        output_topic_lines.append(' '.join(top_words))
    topic_file = os.path.join(cache_path, 'ldatopic.txt')
    with open(topic_file, 'w') as fo:
        fo.write('\n'.join(output_topic_lines) + '\n')

    # Build the test corpus the same way, tracking the total token count so
    # the variational bound can be normalised per word.
    testBatchIter = BatchIterBert(self.testDataIter, filling_last_batch=False,
                                  postProcessor=batchPostProcessor, batch_size=1)
    test_bow_list = []
    word_count = 0
    for item in testBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        word_count += sum(bow)
        test_bow_list.append(self.bow_2_gensim(bow))
    print(word_count)

    # Held-out evaluation: gensim's per-word log-perplexity bound, plus the
    # per-word bound and the derived perplexity 2^(-bound / word_count).
    ppl = lda.log_perplexity(test_bow_list, total_docs=len(test_bow_list))
    print(ppl)
    bound = lda.bound(test_bow_list)
    print(bound / word_count)
    print(np.exp2(-bound / word_count))
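
# The method above relies on the project-local helper self.bow_2_gensim to
# turn a dense count vector into gensim's sparse (token_id, count) document
# format. A minimal sketch of what such a conversion typically looks like,
# assuming the dense vector is indexed by dictionary token id (the project's
# actual implementation may differ):
def bow_2_gensim(self, bow):
    # Keep only non-zero entries; a gensim document is a list of
    # (token_id, count) pairs for the tokens that actually occur.
    return [(token_id, count) for token_id, count in enumerate(bow) if count > 0]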