from tmtoolkit.utils import pickle_data, unpickle_file


def test_pickle_unpickle():
    pfile = 'tests/data/test_pickle_unpickle.pickle'
    input_data = ('foo', 123, [])
    pickle_data(input_data, pfile)
    output_data = unpickle_file(pfile)
    for i, o in zip(input_data, output_data):
        assert i == o
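# Hedged example (not part of the test suite): the same pickle_data/unpickle_file
# round trip as above, but with a dict payload like the DTM pickles used by the
# scripts below. The file path is hypothetical.
def example_roundtrip_dict():
    payload = {'docnames': ['doc1', 'doc2'], 'vocab': ['a', 'b', 'c']}
    pickle_data(payload, '/tmp/example_dtm.pickle')      # serialize to disk
    restored = unpickle_file('/tmp/example_dtm.pickle')  # load it back
    assert restored == payload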
def load_ldamodel_from_pickle(picklefile, **kwargs):
    """Load an LDA model from a pickle file."""
    return unpickle_file(picklefile, **kwargs)
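# Usage sketch: load_ldamodel_from_pickle() returns whatever the matching
# save_ldamodel_to_pickle() call stored. The path below is hypothetical and the
# dict keys ('model', 'vocab', 'doc_labels') follow tmtoolkit's model_io
# conventions, assumed here rather than taken from this file.
stored = load_ldamodel_from_pickle('data/model.pickle')
model, vocab, doc_labels = stored['model'], stored['vocab'], stored['doc_labels']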
if len(sys.argv) != 5:
    print('call script as: %s <tokens preprocessing pipeline> <eta> <alpha factor> <num. iterations>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 0, 1 or 2')
    exit(1)

preproc_mode = int(sys.argv[1])
assert 0 <= preproc_mode <= 2
eta = float(sys.argv[2])
assert 0 < eta < 1
alpha_mod = float(sys.argv[3])
assert alpha_mod > 0
n_iter = int(sys.argv[4])
assert n_iter > 0

dtm_pickle = DATA_PICKLE_DTM % preproc_mode
print('loading DTM from file `%s`...' % dtm_pickle)
doc_labels, vocab, dtm, doc_tokens = unpickle_file(dtm_pickle)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]

tokens = list(doc_tokens.values())
del doc_tokens
assert len(tokens) == len(doc_labels)

print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

print('evaluating topic models...')

constant_params = dict(n_iter=n_iter,
#                       random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)

varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
#varying_num_topics = list(range(5, 11))
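# Sketch of the evaluation call these parameters feed into, assuming tmtoolkit's
# lda-based evaluation backend. Scaling alpha as alpha_mod/k per number of topics
# k matches the 'alpha=%.2f/k' label used in the plotting script below.
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models

varying_params = [dict(n_topics=k, alpha=alpha_mod / k) for k in varying_num_topics]
eval_results = evaluate_topic_models(dtm, varying_params, constant_parameters=constant_params)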
ep-00-02-16.de""".split('\n')

FILEIDS = ['german/' + f for f in FILES]

DTM_PICKLE = 'data/read_preproc_lda_de_dtm.pickle'
LDA_PICKLE = 'data/read_preproc_lda_de_lda.pickle'

logging.basicConfig(level=logging.DEBUG)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.DEBUG)
tmtoolkit_log.propagate = True

if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    if os.path.exists(DTM_PICKLE):
        print("loading DTM data from pickle file '%s'..." % DTM_PICKLE)
        pickled_data = unpickle_file(DTM_PICKLE)
        assert pickled_data['dtm'].shape[0] == len(pickled_data['docnames'])
        assert pickled_data['dtm'].shape[1] == len(pickled_data['vocab'])
        dtm, vocab, doc_labels = pickled_data['dtm'], pickled_data['vocab'], pickled_data['docnames']
    else:
        europarl = nltk.corpus.util.LazyCorpusLoader('europarl_raw', nltk.corpus.EuroparlCorpusReader,
                                                     fileids=FILEIDS)
        corpus = Corpus({f: europarl.raw(f_id) for f, f_id in zip(FILES, FILEIDS)})

        print("all loaded documents:")
        for dl, text in corpus.docs.items():
if len(sys.argv) != 4:
    print('run script as: %s <tokens preprocessing pipeline> <eta> <alpha factor>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 0, 1 or 2')
    exit(1)

toks = int(sys.argv[1])
eta = float(sys.argv[2])
alpha_mod = float(sys.argv[3])

#%%

picklefile = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (toks, eta, alpha_mod)
print('loading pickle file with evaluation results from `%s`' % picklefile)
eval_results = unpickle_file(picklefile)
eval_results_by_n_topics = results_by_parameter(eval_results, 'n_topics')
n_metrics = len(eval_results_by_n_topics[0][1])

#%%

fig, axes = plot_eval_results(eval_results_by_n_topics,
                              title='Evaluation results for alpha=%.2f/k, beta=%.2f' % (alpha_mod, eta),
                              xaxislabel='num. topics (k)')

plot_file_eval_res = 'fig/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.png' % (toks, eta, alpha_mod)
print('saving plot to file `%s`' % plot_file_eval_res)
plt.savefig(plot_file_eval_res)
plt.show()
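# Quick inspection sketch: after results_by_parameter(), each element of
# eval_results_by_n_topics is a (n_topics, metric_results) pair (the script above
# relies on this when computing n_metrics), so the available metrics can be
# listed like this (illustrative only):
for k, metric_results in eval_results_by_n_topics[:3]:
    print('k=%d -> metrics: %s' % (k, sorted(metric_results.keys())))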
# other parameters
BURNIN = 5   # with a default of refresh=10 this means 50 burnin iterations

# paths to data files
DATA_PICKLE_DTM = 'data/speeches_tokens_%d.pickle' % toks
LDA_MODEL_PICKLE = 'data/model%d.pickle' % toks
LDA_MODEL_LL_PLOT = 'data/model%d_logliks.png' % toks
LDA_MODEL_EXCEL_OUTPUT = 'data/model%d_results.xlsx' % toks

#%% load

print('input tokens from preprocessing pipeline %d' % toks)
print('loading DTM from `%s`...' % DATA_PICKLE_DTM)

doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output
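# Sketch of the output steps that the paths defined above suggest: store the
# fitted model alongside its data (the same tuple layout is unpickled in the
# results script) and plot the training log-likelihood. Assumes the lda
# package's loglikelihoods_ attribute, which records one value per `refresh`
# iterations, so slicing off BURNIN entries drops the burn-in phase noted above.
import matplotlib.pyplot as plt           # may already be imported at the top
from tmtoolkit.utils import pickle_data   # may already be imported at the top

pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

plt.plot(model.loglikelihoods_[BURNIN:])
plt.savefig(LDA_MODEL_LL_PLOT)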
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tmtoolkit.utils import unpickle_file
from tmtoolkit.topicmod.model_stats import get_most_relevant_words_for_topic, get_topic_word_relevance, \
    get_doc_lengths, get_marginal_topic_distrib, exclude_topics

pd.set_option('display.width', 180)

#%% load data

# model and DTM
doc_labels, vocab, dtm, model = unpickle_file('data/model2.pickle')

n_docs, n_topics = model.doc_topic_.shape
_, n_vocab = model.topic_word_.shape

assert n_docs == len(doc_labels) == dtm.shape[0]
assert n_topics == model.topic_word_.shape[0]
assert n_vocab == len(vocab) == dtm.shape[1]

print('loaded model with %d documents, vocab size %d, %d tokens and %d topics'
      % (n_docs, n_vocab, dtm.sum(), n_topics))

# raw speeches
speeches_merged = unpickle_file('data/speeches_merged.pickle')

# TOPs data
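# (Usage sketch for the model_stats helpers imported above, separate from the
# data loading in this script; lambda_=0.6 is an assumed relevance weighting,
# not a value taken from this project.)
doc_lengths = get_doc_lengths(dtm)
marginal_topic_distrib = get_marginal_topic_distrib(model.doc_topic_, doc_lengths)
topic_word_rel = get_topic_word_relevance(model.topic_word_, model.doc_topic_, doc_lengths, lambda_=0.6)
# top 10 most relevant words for topic index 0:
print(get_most_relevant_words_for_topic(np.array(vocab), topic_word_rel, 0, 10))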