Example #1
    dynamicLda = DynamicLda(lda_model, lda_model.bows)
    biased_topic_distribution = dynamicLda.get_biased_topic_distribution(
        query_terms, good_doc_ids, bad_doc_ids, good_terms, bad_terms
    )

    sims = lda_model.calc_sims_for_topic_distribution(biased_topic_distribution, sim_method)
    topn = sims[:N]
    json_list = build_json_list(topn)
    # topn_json = json.dumps(topn)
    # print str(topn_json)
    return json_list


if __name__ == "__main__":
    print("Loading BoW Vectors from cache...\n")
    vocab = Vocabulary(vocab_file)
    vocab.load_from_cache(reuters_cache_path)
    text_corpus = Text(text_file, vocab)
    text_corpus.load_from_cache(reuters_cache_path)

    print("Loading LDA model...\n")
    lda_model = LdaCalc(
        bows=text_corpus.bow_vectors,
        sims_cache_dir=sims_cache_path,
        lda_cache_dir=reuters_cache_path,
        num_topics=num_topics,
    )
    lda_model.load()
    lda_model.print_topics()

    # print("Indexing Model for similarities...")
Example #2
__author__ = 'thomas'
import os
import gensim
from reuters.vocabulary import Vocabulary
from reuters.text import Text
from reuters.vectors import Vectors
from gensim.similarities import SparseMatrixSimilarity

reuters_cache_path = "/home/thomas/projects/clms/internship/lda/cache/reuters"
reuters_path = "/home/thomas/projects/clms/internship/corpora/reuters/"
vocab_file = os.path.join(reuters_path, "stem.termid.idf.map.txt")
text_file = os.path.join(reuters_path, "lyrl2004_tokens_train.dat")
vector_file = os.path.join(reuters_path, "lyrl2004_vectors_train.dat")

print("Loading Vocabulary...")
vocab = Vocabulary(vocab_file)
vocab.load_from_text()
print("Saving Vocabulary to Cache...")
vocab.save_to_cache(reuters_cache_path)
#vectors = Vectors(vector_file, 100)
#vectors.load_from_text()
print("Loading Corpus...")
text = Text(text_file, vocab)
text.load_from_text()
print("Saving Corpus to Cache...")
text.save_to_cache(reuters_cache_path)

num_topics = 100

print("Making Dictionary from Corpus (" + str(len(text.bow_vectors)) + " documents)...")
# get an lda-compatible dictionary
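# Hypothetical continuation of this step: the original listing is truncated here,
# so the lines below are only a sketch of the usual gensim calls, assuming
# text.bow_vectors is a list of gensim-style BoW vectors (lists of (term_id, count) pairs).
dictionary = gensim.corpora.Dictionary.from_corpus(text.bow_vectors)  # infer a Dictionary from the BoW corpus

print("Training LDA model with " + str(num_topics) + " topics...")
lda = gensim.models.LdaModel(corpus=text.bow_vectors, id2word=dictionary, num_topics=num_topics)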