def test_pickle_unpickle():
    pfile = 'tests/data/test_pickle_unpickle.pickle'
    input_data = ('foo', 123, [])
    pickle_data(input_data, pfile)
    output_data = unpickle_file(pfile)

    for i, o in zip(input_data, output_data):
        assert i == o
def save_ldamodel_to_pickle(picklefile, model, vocab, doc_labels, dtm=None, **kwargs):
    """Save an LDA model as a pickle file."""
    pickle_data({
        'model': model,
        'vocab': vocab,
        'doc_labels': doc_labels,
        'dtm': dtm
    }, picklefile)
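# A possible counterpart for loading such a file again. This is only a sketch:
# the helper name `load_ldamodel_from_pickle` is an assumption, not taken from
# the code above. It relies on `unpickle_file`, which is used in the test at
# the top of this file, and simply returns the dict written by
# `save_ldamodel_to_pickle`.
def load_ldamodel_from_pickle(picklefile, **kwargs):
    """Load an LDA model from a pickle file saved with `save_ldamodel_to_pickle`."""
    return unpickle_file(picklefile)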
print('POS tagged:')
preproc.pos_tag()
pprint(preproc.tokens_with_pos_tags)

print('lemmatized:')
preproc.lemmatize()
pprint(preproc.tokens_with_pos_tags)

print('lowercase:')
preproc.tokens_to_lowercase()
pprint(preproc.tokens)

print('cleaned:')
preproc.clean_tokens()
pprint(preproc.tokens_with_pos_tags)
pprint(preproc.tokens)

print('filtered:')
preproc.filter_for_token(u'einfach', remove_found_token=True)
preproc.filter_for_pos('N')
pprint(preproc.tokens_with_pos_tags)

print('saving tokens as pickle...')
pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')

print('DTM:')
doc_labels, vocab, dtm = preproc.get_dtm()
print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))
# evaluate topic models with different parameters
const_params = dict(n_iter=1200, random_state=1, refresh=10)
ks = list(range(10, 160, 5)) + list(range(160, 300, 10)) + [300, 325, 350, 375, 400]
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

# this will evaluate all models in parallel
# still, this will take some time
print('evaluating %d topic models' % len(varying_params))
models = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                      return_models=True)  # retain the calculated models

# save the results as pickle
print('saving results')
pickle_data(models, 'data/lda_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
results_by_n_topics = results_by_parameter(models, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results for alpha=1/k, beta=0.01',
                  figsize=(8, 6))
plt.savefig('data/lda_evaluation_plot.png')
plt.show()

# the peak seems to be around n_topics == 140
# print the distributions of this model
n_topics_best_model = 140
print('printing best model with n_topics=%d' % n_topics_best_model)
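# A hedged sketch of how the best model could then be pulled from the
# evaluation results and printed. It assumes that with `return_models=True`
# each entry of `results_by_n_topics` is a `(n_topics, result)` pair whose
# result dict holds the fitted model under the key 'model', and that `vocab`
# and `doc_labels` from the DTM generation step are still in scope.
best_model = dict(results_by_n_topics)[n_topics_best_model]['model']
print_ldamodel_topic_words(best_model.topic_word_, vocab)
print_ldamodel_doc_topics(best_model.doc_topic_, doc_labels)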
const_params = dict(
    update_every=0,
    passes=20,
    iterations=400,
    eta='auto',
)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
# alpha is set per model here, so it must not also appear in const_params
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus),
                                               varying_params, const_params,
                                               coherence_gensim_texts=model_lists)  # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'gensim_evaluation_results_entire.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')
results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results', figsize=(8, 6))
plt.savefig('gensim_evaluation_plot_entire.png')
plt.show()
tokens = list(doc_tokens.values())
del doc_tokens
assert len(tokens) == len(doc_labels)

print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

print('evaluating topic models...')

constant_params = dict(n_iter=n_iter,
                       # random_state=1,
                       eta=eta)
print('constant parameters:')
pprint(constant_params)

varying_num_topics = list(range(20, 100, 10)) + list(range(100, 200, 20)) + list(range(200, 501, 50))
#varying_num_topics = list(range(5, 11))
varying_alpha = [alpha_mod / k for k in varying_num_topics]
varying_params = [dict(n_topics=k, alpha=a) for k, a in zip(varying_num_topics, varying_alpha)]
print('varying parameters:')
pprint(varying_params)

eval_results = tm_lda.evaluate_topic_models(dtm, varying_params, constant_params,
                                            metric=('griffiths_2004', 'cao_juan_2009', 'arun_2010',
                                                    'coherence_mimno_2011', 'coherence_gensim_c_v'),
                                            coherence_gensim_vocab=vocab,
                                            coherence_gensim_texts=tokens)

pickle_file_eval_res = 'data/tm_eval_results_tok%d_eta_%.2f_alphamod_%.2f.pickle' % (preproc_mode, eta, alpha_mod)
print('saving results to file `%s`' % pickle_file_eval_res)
pickle_data(eval_results, pickle_file_eval_res)

print('done.')
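# A hedged sketch, not part of the original script: the pickled evaluation
# results could later be restored and plotted with the same helpers used in
# the other scripts here (`unpickle_file`, `results_by_parameter`,
# `plot_eval_results`); the plot settings are assumptions.
eval_results = unpickle_file(pickle_file_eval_res)
results_by_n_topics = results_by_parameter(eval_results, 'n_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results', figsize=(8, 6))
plt.show()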
print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus),
                                               varying_params, const_params,
                                               coherence_gensim_texts=texts)  # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'data/gensim_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')
results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results', figsize=(8, 6))
plt.savefig('data/gensim_evaluation_plot.png')
plt.show()
print('-- processing took %f sec. so far' % proc_time)

preproc.save_state('data/read_preproc_lda_de_state.pickle')

print('token samples:')
for dl, tokens in preproc.tokens_with_pos_tags.items():
    print("> %s:" % dl)
    print(">>", sample(tokens, 10))

print('generating DTM...')
doc_labels, vocab, dtm = preproc.get_dtm()

print("saving DTM data to pickle file '%s'..." % DTM_PICKLE)
pickle_data({
    'dtm': dtm,
    'vocab': vocab,
    'docnames': doc_labels
}, DTM_PICKLE)

print("running LDA...")

# note: this won't result in a good topic model. it's only here for demonstration purposes.
# we should increase the number of iterations and also do some evaluation to get the "correct" number of topics.
model = lda.LDA(n_topics=30, n_iter=500)
model.fit(dtm)

# print topic-word distributions with respective probabilities
print_ldamodel_topic_words(model.topic_word_, vocab)

# print document-topic distributions with respective probabilities
print_ldamodel_doc_topics(model.doc_topic_, doc_labels)
assert len(vocab) == dtm.shape[1]

print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output

print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_, model.doc_topic_,
                               doc_labels, vocab, dtm=dtm)

#%%

print('displaying loglikelihoods...')
plt.plot(np.arange(BURNIN, len(model.loglikelihoods_)) * 10,
         model.loglikelihoods_[BURNIN:])
plt.xlabel('iterations')
]

# strip everything but the special characters from each matched token,
# then collect the individual characters into one set
uncommon_special_chars = set([pttrn_token_w_specialchar_inv.sub('', t)
                              for t in tokens_w_specialchars])
uncommon_special_chars = set(c for cs in uncommon_special_chars for c in cs)

print('detected the following uncommon special characters:')
for c in uncommon_special_chars:
    print('%04x' % ord(c))

print('running preprocessing pipeline...')

preproc.pos_tag()\
    .lemmatize()\
    .tokens_to_lowercase()\
    .remove_special_chars_in_tokens()\
    .clean_tokens(remove_shorter_than=2)\
    .remove_common_tokens(0.9)\
    .remove_uncommon_tokens(3, absolute=True)

print('retrieving tokens...')
tokens = preproc.tokens

print('generating DTM...')
doc_labels, vocab, dtm = preproc.get_dtm()

output_dtm_pickle = DATA_PICKLE_DTM % preproc_mode

print('writing DTM to `%s`...' % output_dtm_pickle)
pickle_data((doc_labels, vocab, dtm, tokens), output_dtm_pickle)

print('done.')
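# A hedged sketch, not part of the original script: the tuple written above
# can be restored later with `unpickle_file` (used in the test at the top of
# this file), unpacking it in the same order it was pickled.
doc_labels, vocab, dtm, tokens = unpickle_file(output_dtm_pickle)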