import logging
import os

from gensim.models import LdaModel
from gensim.utils import grouper
from numpy import concatenate


def save_and_inference(model: LdaModel, corpus, num_topics, chunksize=0):
    # `_ARGS` and `output` are module-level helpers defined elsewhere.
    path = f"./dev/model/{_ARGS.name}_{num_topics}.pkl"
    try:
        model.save(path)
        output(f"model saved at <{path}>")
        if chunksize > 0:
            # Run inference chunk by chunk to bound peak memory usage.
            gammas = [model.inference(chunk)[0] for chunk in grouper(corpus, chunksize)]
            gamma = concatenate(gammas)
        else:
            gamma, _ = model.inference(corpus)
    except RuntimeError as e:
        logging.error(f"PID: {os.getpid()}, num_topics: {num_topics} error")
        print(e)
        raise  # re-raise: `gamma` is undefined if inference failed
    output(f"num_topics {num_topics} inference complete.")
    # Assign each document to its highest-weight topic.
    return gamma.argmax(axis=1)
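# --- Illustrative sketch (not from the original module): the same chunked-inference
# pattern as `save_and_inference`, standalone and runnable with gensim's bundled
# test texts. All names below are placeholders for this example.
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts
from gensim.utils import grouper
from numpy import concatenate

example_dictionary = Dictionary(common_texts)
example_corpus = [example_dictionary.doc2bow(text) for text in common_texts]
example_model = LdaModel(corpus=example_corpus, id2word=example_dictionary, num_topics=3)

# Each inference call returns (gamma, sstats); [0] keeps only the per-document gamma rows.
gammas = [example_model.inference(chunk)[0] for chunk in grouper(example_corpus, 4)]
gamma = concatenate(gammas)
print(gamma.argmax(axis=1))  # dominant topic id per document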
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1,
                iterations=50, chunksize=200):
    """Train an LDA model and return per-document topic vectors.

    :param dictionary: gensim Dictionary mapping tokens to ids
    :param corpus: bag-of-words corpus built from the original documents
    :param num_topics: number of latent topics to fit
    :param wiki_path: optional pickle of tokenized wiki articles used to augment the corpus
    :param passes: number of passes over the corpus during training
    :param iterations: maximum number of iterations when inferring topic distributions
    :param chunksize: number of documents per training chunk
    :return: (lda_model, doc_vectors, doc_vector_ids)
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]
        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is appended after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                         passes=passes, iterations=iterations, alpha='auto',
                         chunksize=chunksize)

    # Keep only the original (non-wiki) documents when extracting topic vectors.
    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    # Normalize each gamma row into a probability distribution over topics.
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)
    return lda_model, doc_vectors, doc_vector_ids
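# --- Note (addition, not original code): `LdaModel.inference` returns unnormalized
# variational Dirichlet parameters (gamma), one row per document, which is why
# `perform_lda` divides each row by its sum. A self-contained sketch of that step:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts

toy_dictionary = Dictionary(common_texts)
toy_corpus = [toy_dictionary.doc2bow(text) for text in common_texts]
toy_model = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=5, alpha='auto')

gamma, _ = toy_model.inference(toy_corpus)          # shape: (num_docs, num_topics)
theta = gamma / gamma.sum(axis=1, keepdims=True)    # each row now sums to 1
print(theta.shape, theta[0])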
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud

# `tpl`, `indx`, `lda`, and `corpus` come from earlier in the script;
# `save_obj` / `load_obj` are project-level pickle helpers.
topic, contrib = zip(*tpl)
DTdist = pd.DataFrame(
    contrib,
    columns=["Top 5 words that contribute to each topic with associated probability"],
    index=indx)
distLatex = DTdist.to_latex(index=True, index_names="Topics")

# Document-topic distribution over the training corpus.
doc_distribution = np.array([
    tup[1] for tup in lda.get_document_topics(bow=corpus, per_word_topics=False)
])
obj = lda.get_topics()
a = lda.inference(corpus)
print(doc_distribution[:853])

# Training corpus document-by-topic matrix.
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])
save_obj(lda, 'LDA_MODEL_APPLICATION')

#%%
# Word clouds of the top 200 words for each of the first six topics.
lda = load_obj('LDA_MODEL_APPLICATION')
fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    ax.imshow(WordCloud(background_color="white").fit_words(dict(lda.show_topic(i, 200))))
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
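# --- Aside (illustrative, not part of the original script): `show_topic(i, n)`
# returns [(word, probability), ...] pairs, which is why the loop above wraps it
# in dict(...) before WordCloud.fit_words. A minimal single-topic version:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

weights = dict(lda.show_topic(0, 200))              # {word: probability}
cloud = WordCloud(background_color="white").fit_words(weights)
plt.imshow(cloud)
plt.title('Topic 0')
plt.show()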
# https://www.jianshu.com/p/74ec7d5f6821

'''Usage examples'''

'''Train an LDA model using a Gensim corpus'''
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# Create a corpus from a list of texts.
common_dictionary = Dictionary(common_texts)
# print(common_dictionary)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)
print(lda)
print(lda.inference(common_corpus))

'''Save a model to disk, or reload a pre-trained model'''
'''
from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath('model')
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)
'''

'''Query the model using new, unseen documents'''
# Create a new corpus, made of previously unseen documents.
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer'],
]
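# Continuing as in gensim's documented example: convert the unseen texts to
# bag-of-words with the *training* dictionary, then query the model.
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[0]
vector = lda[unseen_doc]  # topic probability distribution for the unseen document
print(vector)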