Example #1
import logging
import os

from numpy import concatenate
from gensim.models import LdaModel
from gensim.utils import grouper


# `_ARGS` and `output` are module-level helpers defined elsewhere in the project.
def save_and_inference(model: LdaModel, corpus, num_topics, chunksize=0):
    path = f"./dev/model/{_ARGS.name}_{num_topics}.pkl"
    try:
        model.save(path)
        output(f"model saved at <{path}>")
        if chunksize > 0:
            # Run inference chunk by chunk to bound memory, then stitch the results.
            gammas = [model.inference(chunk)[0] for chunk in grouper(corpus, chunksize)]
            gamma = concatenate(gammas)
        else:
            gamma, _ = model.inference(corpus)
    except RuntimeError as e:
        logging.error(f"PID: {os.getpid()}, num_topics: {num_topics} error")
        print(e)
        raise  # without re-raising, `gamma` would be undefined below
    output(f"num_topics {num_topics} inference complete.")
    # Return the most probable topic id for each document.
    return gamma.argmax(axis=1)
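# The chunked branch above is the memory-saving pattern; a minimal standalone
# sketch of it (toy corpus and parameters of my choosing, nothing project-specific):
from numpy import concatenate
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.utils import grouper

texts = [["human", "interface"], ["graph", "trees"], ["system", "survey"]] * 4
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = LdaModel(corpus, id2word=dictionary, num_topics=3)

# Inference over 5-document chunks concatenates to the same shape as one full pass.
gamma = concatenate([model.inference(chunk)[0] for chunk in grouper(corpus, 5)])
print(gamma.shape)           # (12, 3)
print(gamma.argmax(axis=1))  # most probable topic per document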
Example #2
from gensim.models import LdaModel


# `unpickle` and `get_corpus_ids` are helper functions defined elsewhere in the project.
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """
    Train an LDA model and return per-document topic distributions, optionally
    augmenting the training corpus with Wikipedia documents first.

    :param dictionary: gensim Dictionary mapping token ids to tokens
    :param corpus: bag-of-words corpus (one list of (token_id, count) tuples per document)
    :param num_topics: number of latent topics to fit
    :param wiki_path: optional path to a pickled list of tokenized wiki documents
    :param passes: number of training passes over the corpus
    :param iterations: maximum inference iterations per document
    :param chunksize: number of documents per training chunk
    :return: (lda_model, doc_vectors, doc_vector_ids)
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)
    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]
    # `inference` returns unnormalized variational parameters (gamma); keep only the
    # rows for the original (non-wiki) documents, then normalize each row so every
    # document's topic weights sum to 1.
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)

    return lda_model, doc_vectors, doc_vector_ids
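# A rough usage sketch of perform_lda. `corpus_id2orig_id` and `get_corpus_ids`
# are project-specific, so the id bookkeeping is stubbed here with an identity
# mapping purely for illustration:
from gensim.corpora.dictionary import Dictionary

texts = [["human", "computer", "interface"], ["graph", "trees", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

dictionary.corpus_id2orig_id = list(range(len(corpus)))   # stub: position i -> id i
get_corpus_ids = lambda id_map: list(range(len(id_map)))  # stub: keep every document

lda_model, doc_vectors, doc_vector_ids = perform_lda(dictionary, corpus, num_topics=2)
print(doc_vectors.shape)  # (2, 2): one normalized topic distribution per document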
Example #3
import numpy as np
import pandas as pd

# `tpl` holds (topic, top-words) tuples produced earlier, e.g. by lda.show_topics();
# `indx` is the matching index of topic labels.
topic, contrib = zip(*tpl)
DTdist = pd.DataFrame(
    contrib,
    columns=[
        "Top 5 words that contribute to each topic with associated probability"
    ],
    index=indx)

distLatex = DTdist.to_latex(index=True, index_names="Topics")
# Per-document topic probabilities for the training corpus.
doc_distribution = np.array([
    tup[1]
    for tup in lda.get_document_topics(bow=corpus, per_word_topics=False)
])
obj = lda.get_topics()     # topic-word matrix, shape (num_topics, vocab_size)
a = lda.inference(corpus)  # (gamma, None): unnormalized document-topic weights
print(doc_distribution[:853])
# Training corpus document-by-topic matrix.
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst]
                                  for lst in lda[corpus]])
save_obj(lda, 'LDA_MODEL_APPLICATION')  # `save_obj`/`load_obj`: project pickling helpers
#%%
lda = load_obj('LDA_MODEL_APPLICATION')
import matplotlib.pyplot as plt
from wordcloud import WordCloud

fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)

# One word cloud per topic, built from the top-200 word probabilities.
for i, ax in enumerate(axes.flatten()):
    ax.imshow(
        WordCloud(background_color="white").fit_words(
            dict(lda.show_topic(i, 200))))
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
plt.show()
#https://www.jianshu.com/p/74ec7d5f6821
'''Usage examples'''
'''Train an LDA model using a Gensim corpus'''
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
#print(common_dictionary)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)
print(lda)
print(lda.inference(common_corpus))
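# `inference` returns the unnormalized variational parameters (gamma) plus a
# second element that is None unless sufficient statistics are requested; to
# read gamma as per-document topic distributions, normalize each row, as
# Example #2 does:
import numpy as np

gamma, _ = lda.inference(common_corpus)
doc_topics = gamma / gamma.sum(axis=1, keepdims=True)  # rows now sum to 1
print(doc_topics.shape)  # (len(common_corpus), 10)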
'''Save a model to disk, or reload a pre-trained model'''
'''
from gensim.test.utils import datapath

#Save model to disk
temp_file = datapath('model')
lda.save(temp_file)

#Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)
'''
'''Query the model using new, unseen documents'''
# Create a new corpus, made of previously unseen documents.
other_texts = [['computer', 'time', 'graph'], ['survey', 'response', 'eps'],
               ['human', 'system', 'computer']]
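# The snippet ends here; the natural next step, following the standard Gensim
# pattern (a sketch, not part of the original), is to convert an unseen
# document with the training dictionary and index the model with it:
unseen_doc = common_dictionary.doc2bow(other_texts[0])
vector = lda[unseen_doc]  # topic distribution for the unseen document
print(vector)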