wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Build the TF-IDF corpus once and cache it on disk in Matrix Market format;
# later runs reuse the cached file instead of recomputing it.
tfidf_path = outp + '_tfidf.mm'
if not os.path.exists(tfidf_path):
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    # Write the TF-IDF vectors out so they never have to be recomputed.
    MmCorpus.serialize(tfidf_path, tfidf[wiki], progress_cnt=10000)

# Always train from the serialized file. Keeping the lazy ``tfidf[wiki]``
# view around would re-iterate ``wiki`` and recompute every vector each time
# the corpus is traversed, and would make the first run behave differently
# from all later (cached) runs.
mm = MmCorpus(tfidf_path)

logger.info("finished pre-processing, starting LDA %s", program)

# Train the topic model on the TF-IDF vectors and persist it to model_name.
lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)
topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

# Rank the topics by coherence, scored against the ``wiki`` corpus (not the
# TF-IDF vectors the model was trained on).
coherence = 'u_mass'
toptopics = lda.top_topics(corpus=wiki,
                           dictionary=lda.id2word,
                           coherence=coherence)
logger.info("top topics: %s", coherence)
print(toptopics)
예제 #2
0
    # Set training parameters.
    num_topics = 100
    chunksize = 2000
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    # The lookup below is only there to "load" the dictionary so that
    # ``id2token`` is populated before it is read; the value is discarded.
    _ = dictionary[0]
    id2word = dictionary.id2token

    # Pass every training parameter declared above; previously chunksize and
    # eval_every were defined but never handed to the model, so perplexity
    # evaluation was still running at the library's default interval.
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        chunksize=chunksize,
        eval_every=eval_every,
    )

    top_topics = model.top_topics(corpus)  #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # Each entry of top_topics is indexed so that element [1] is its coherence score.
    avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    from pprint import pprint
    pprint(top_topics)

    import pyLDAvis.gensim
    import pyLDAvis
    #  pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary=dictionary)
    # Use a context manager so the HTML file is flushed and closed even if
    # rendering fails; the bare open() call leaked the file handle.
    with open("lda_vis.html", "w") as html_file:
        pyLDAvis.save_html(vis, html_file)