# NOTE: this fragment begins mid-file; its imports and path config are not
# shown. The enclosing "def" below is reconstructed from the free variables
# used in the body.
def query(query_terms, good_doc_ids, bad_doc_ids, good_terms, bad_terms, sim_method, N):
    # Bias the query's topic distribution with relevance feedback:
    # liked/disliked documents and terms re-weight the topic mixture.
    dynamic_lda = DynamicLda(lda_model, lda_model.bows)
    biased_topic_distribution = dynamic_lda.get_biased_topic_distribution(
        query_terms, good_doc_ids, bad_doc_ids, good_terms, bad_terms
    )
    # Rank all documents against the biased distribution and keep the top N.
    sims = lda_model.calc_sims_for_topic_distribution(biased_topic_distribution, sim_method)
    topn = sims[:N]
    json_list = build_json_list(topn)
    # topn_json = json.dumps(topn)
    # print(topn_json)
    return json_list


if __name__ == "__main__":
    print("Loading BoW Vectors from cache...\n")
    vocab = Vocabulary(vocab_file)
    vocab.load_from_cache(reuters_cache_path)
    text_corpus = Text(text_file, vocab)
    text_corpus.load_from_cache(reuters_cache_path)

    print("Loading LDA model...\n")
    lda_model = LdaCalc(
        bows=text_corpus.bow_vectors,
        sims_cache_dir=sims_cache_path,
        lda_cache_dir=reuters_cache_path,
        num_topics=num_topics,
    )
    lda_model.load()
    lda_model.print_topics()
    # print("Indexing Model for similarities...")
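
# Illustrative sketch (not part of the original project): a minimal
# equivalent of the similarity step above using plain gensim. The helper
# and its arguments (lda, corpus_bow, query_bow, n) are hypothetical.
def _topic_sims_sketch(lda, corpus_bow, query_bow, n):
    from gensim.similarities import MatrixSimilarity

    # Index every document's topic distribution (num_topics features each).
    index = MatrixSimilarity(lda[corpus_bow], num_features=lda.num_topics)
    # Cosine similarity of the query's topic mixture against all documents.
    sims = index[lda[query_bow]]
    # Return the n best (doc_id, score) pairs, highest score first.
    return sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)[:n]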
__author__ = 'thomas'

import os

import gensim
from gensim.similarities import SparseMatrixSimilarity

from reuters.text import Text
from reuters.vectors import Vectors
from reuters.vocabulary import Vocabulary

reuters_cache_path = "/home/thomas/projects/clms/internship/lda/cache/reuters"
reuters_path = "/home/thomas/projects/clms/internship/corpora/reuters/"
vocab_file = os.path.join(reuters_path, "stem.termid.idf.map.txt")
text_file = os.path.join(reuters_path, "lyrl2004_tokens_train.dat")
vector_file = os.path.join(reuters_path, "lyrl2004_vectors_train.dat")

# Parse the stem/termid/idf map once, then cache it for faster reloads.
print("Loading Vocabulary...")
vocab = Vocabulary(vocab_file)
vocab.load_from_text()
print("Saving Vocabulary to Cache...")
vocab.save_to_cache(reuters_cache_path)

# vectors = Vectors(vector_file, 100)
# vectors.load_from_text()

# Tokenized Reuters training documents -> BoW vectors, also cached.
print("Loading Corpus...")
text = Text(text_file, vocab)
text.load_from_text()
print("Saving Corpus to Cache...")
text.save_to_cache(reuters_cache_path)

num_topics = 100
print("Making Dictionary from Corpus (%d documents)..." % len(text.bow_vectors))
# get an lda-compatible dictionary
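# Illustrative continuation (a sketch, not from the original file): gensim's
# LdaModel needs an id2word mapping, and Dictionary.from_corpus() can rebuild
# one directly from the cached (token_id, count) BoW vectors. The variable
# names `dictionary` and `lda` are assumptions.
dictionary = gensim.corpora.Dictionary.from_corpus(text.bow_vectors)
lda = gensim.models.LdaModel(
    corpus=text.bow_vectors,
    id2word=dictionary,
    num_topics=num_topics,
)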