def lda_pred(models, vocab, doc):
    """Predict the most likely class label for a document using per-class LDA models.

    Each candidate label's model scores the document; the label with the
    highest total log-likelihood wins.

    Parameters
    ----------
    models : dict
        Maps class label -> fitted LDA model exposing ``components_``
        (topic-word matrix) and ``transform`` (document -> topic distribution).
    vocab : dict
        Maps token -> column index in the count matrix / ``components_``.
    doc : str
        Raw document text.

    Returns
    -------
    The label whose model assigns the highest log-likelihood to the document.
    """
    tokenized = word_tokenize_doc(doc)
    vectorizer = CountVectorizer(min_df=1, vocabulary=vocab, stop_words=None)
    X = vectorizer.fit_transform([' '.join(tokenized)])

    label_score = []
    # .items() works on Python 2 and 3; the original .iteritems() is Py2-only
    # and raises AttributeError on Python 3.
    for label, model in models.items():
        topic_dist = model.transform(X)  # document-topic distribution, shape (1, n_topics)
        # Hoisted out of the token loop: log topic weights are loop-invariant.
        log_topic_dist = np.log(topic_dist[0])
        log_likelihood = 0.0
        for token in tokenized:
            if token in vocab:
                # Per-topic joint log-score of this token, vectorized over topics.
                # np.max replaces the original -1e8 sentinel running max, which
                # could leak the sentinel into the sum if every score was < -1e8.
                token_scores = np.log(model.components_[:, vocab[token]]) + log_topic_dist
                log_likelihood += np.max(token_scores)
        label_score.append((label, log_likelihood))
    return max(label_score, key=lambda x: x[1])[0]
def hlda_pred(models, dictionary, doc):
    """Predict the most likely class label for a document using per-class HDP models.

    Parameters
    ----------
    models : dict
        Maps class label -> fitted HDP model exposing ``evaluate_test_corpus``.
    dictionary : mapping with a ``doc2bow`` method (gensim-style Dictionary)
        Converts the tokenized document into a bag-of-words corpus entry.
    doc : str
        Raw document text.

    Returns
    -------
    The label whose model gives the document the highest evaluation score.
    """
    corpus = [dictionary.doc2bow(word_tokenize_doc(doc))]
    # .items() works on Python 2 and 3; the original .iteritems() is Py2-only
    # and raises AttributeError on Python 3.
    label_score = [(label, hdp.evaluate_test_corpus(corpus))
                   for label, hdp in models.items()]
    return max(label_score, key=lambda x: x[1])[0]