Example #1
# Imports added for completeness; they assume the pre-0.11 tmtoolkit API (Corpus / TMPreproc) and matplotlib.
import matplotlib.pyplot as plt
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # load the documents into a corpus
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # preprocess and build the document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # candidate numbers of topics to evaluate
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]

    # parameters shared by all candidate models
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }
    eval_results = evaluate_topic_models(dtm_bg,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         metric=['loglikelihood', 'cao_juan_2009', 'arun_2010'])

    # group the results by number of topics and plot them
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6),
                      metric_direction_font_size='x-small',
                      title_fontsize='small',
                      axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
    return
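
For reference, a hypothetical call (the corpus path, date tag and scope/lang labels are placeholders, not from the original); note that the out/ directory must already exist:

evaluate_model('data/articles_ru.txt', date='20200713', n_iter=500,
               scope='full', lang='ru', n_eval=5)
# with n_eval=5 this evaluates n_topics = 5, 10, ..., 45 and writes
# out/evaluate_model_20200713_500iter_5eval_full_ru.png
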
    def toolkit_cv_plot(self, varying_params,
                        constant_params,
                        save_plot=True,
                        save_dir='results/model_validation',
                        filename='',
                        ext='.pdf',
                        size=(20, 15),
                        **kwargs):
        '''
        Use tmtoolkit for parameter tuning based on a wider variety of measures.
        '''
        warnings.filterwarnings("ignore", category=UserWarning)

        print('evaluating {} topic models'.format(len(varying_params)))
        eval_results = tm_gensim.evaluate_topic_models((self.gensim_dict,
                                                        self.bow),
                                                       varying_params,
                                                       constant_params,
                                                       coherence_gensim_texts=self.text,
                                                       **kwargs)

        results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
        plot_eval_results(results_by_n_topics, xaxislabel='num topics',
                          title='Evaluation results', figsize=size)

        if save_plot:
            # use a default base name only when none was given, instead of discarding the argument
            if not filename:
                filename = 'tmtoolkit_CV_'
            full_path = save_folder_file(save_dir, filename, ext=ext,
                                         optional_folder='convergence_plots')
            plt.savefig(full_path)
        return results_by_n_topics
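
A hypothetical call, assuming `model` is an instance of the (unshown) wrapper class that exposes self.gensim_dict, self.bow and self.text; parameter names follow gensim's LdaModel:

varying = [dict(num_topics=k) for k in range(5, 55, 5)]
constant = dict(passes=10, random_state=20200713)
cv_results = model.toolkit_cv_plot(varying, constant, save_plot=False)
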
# test excerpt: assumes `import random`, `import numpy as np` and `from tmtoolkit.topicmod import evaluate`,
# with n_param_sets, n_params and n_metrics supplied by the test runner (e.g. via parametrization)
def test_results_by_parameter_single_validation(n_param_sets, n_params, n_metrics):
    # TODO: implement a better test here

    param_names = ['param' + str(i) for i in range(n_params)]
    metric_names = ['metric' + str(i) for i in range(n_metrics)]
    res = []
    for _ in range(n_param_sets):
        param_set = dict(zip(param_names, np.random.randint(0, 100, n_params)))
        metric_results = dict(zip(metric_names, np.random.uniform(0, 1, n_metrics)))
        res.append((param_set, metric_results))

    p = random.choice(param_names)
    by_param = evaluate.results_by_parameter(res, p)
    assert len(res) == len(by_param)
    assert all(x == 2 for x in map(len, by_param))
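
The data structure these assertions exercise can be seen in a small sketch (values are illustrative, output shown roughly):

from tmtoolkit.topicmod import evaluate

res = [
    ({'n_topics': 10, 'alpha': 0.1}, {'loglikelihood': -1000.0, 'cao_juan_2009': 0.5}),
    ({'n_topics': 20, 'alpha': 0.05}, {'loglikelihood': -900.0, 'cao_juan_2009': 0.4}),
]
by_param = evaluate.results_by_parameter(res, 'n_topics')
# roughly: [(10, {'loglikelihood': -1000.0, 'cao_juan_2009': 0.5}),
#           (20, {'loglikelihood': -900.0, 'cao_juan_2009': 0.4})]
# each element is a 2-tuple, which is what the length assertion above checks
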
# as above, additionally assumes `import pytest`, `import matplotlib.pyplot as plt`
# and `from tmtoolkit.topicmod import visualize`
def test_plot_eval_results(n_param_sets, n_params, n_metrics, plot_specific_metric):
    param_names = ['param' + str(i) for i in range(n_params)]
    metric_names = ['metric' + str(i) for i in range(n_metrics)]
    res = []
    for _ in range(n_param_sets):
        param_set = dict(zip(param_names, np.random.randint(0, 100, n_params)))
        metric_results = dict(zip(metric_names, np.random.uniform(0, 1, n_metrics)))
        res.append((param_set, metric_results))

    p = random.choice(param_names)
    by_param = evaluate.results_by_parameter(res, p)

    if not by_param:
        with pytest.raises(ValueError):
            visualize.plot_eval_results(by_param)
    else:
        if plot_specific_metric:
            metric = random.choice(metric_names)
        else:
            metric = None

        fig, axes = visualize.plot_eval_results(by_param, metric=metric)
        plt.close(fig)
Example #5
import numpy as np
from scipy.sparse import csr_matrix
# the imports below assume the tmtoolkit topic modeling API and matplotlib
import matplotlib.pyplot as plt
from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

# convert the gensim bag-of-words corpus into a sparse document-term matrix
rows = []
cols = []
data = []
for i, line in enumerate(trigram_bow_corpus):
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
dtm = csr_matrix((data, (rows, cols)),
                 shape=(len(trigram_bow_corpus), len(trigram_dictionary)),
                 dtype=int)
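# note (assumption, not part of the original snippet): gensim offers an equivalent conversion:
#   from gensim.matutils import corpus2csc
#   dtm = corpus2csc(trigram_bow_corpus, num_terms=len(trigram_dictionary)).T.tocsr().astype(int)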

const_params = dict(n_iter=20)
ks = list(range(5, 100,
                5))  #+ list(range(50, 200, 50)) + list(range(200, 500, 100))
varying_params = [dict(n_topics=k, alpha=1.0 / k) for k in ks]

eval_results = tm_lda.evaluate_topic_models(
    dtm, varying_params, const_params, return_models=True)  # optionally: n_max_processes=8

results_by_n_topics = results_by_parameter(eval_results, 'n_topics')

# fig, ax = plt.subplots(figsize=(8, 6))
plot_eval_results(results_by_n_topics)
plt.tight_layout()
# plt.savefig('valid_lda.eps', format='eps', dpi=300)
plt.show()
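
Since return_models=True keeps the fitted models, a specific model can be pulled out of results_by_n_topics afterwards; a short sketch following the same pattern as Example #6 below (k=30 is a placeholder):

best_k = 30  # placeholder; pick it from the evaluation plot
best_model = dict(results_by_n_topics)[best_k]['model']
print(best_model.topic_word_.shape)  # topic-word distribution of the chosen lda model
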
Example #6
    # excerpt from a larger script: `dtm`, `const_params`, `vocab` and `doc_labels` are defined
    # earlier, and the tmtoolkit helpers used below (tm_lda, results_by_parameter, plot_eval_results,
    # pickle_data, save_ldamodel_to_pickle, print_ldamodel_topic_words) are assumed to be imported
    ks = list(range(10, 140, 10)) + list(range(140, 300, 20)) + [300, 325, 350, 375, 400, 450, 500]
    varying_params = [dict(n_topics=k, alpha=1.0/k) for k in ks]

    # this will evaluate all models in parallel using the metrics in tm_lda.DEFAULT_METRICS
    # still, this will take some time
    print('evaluating %d topic models' % len(varying_params))
    models = tm_lda.evaluate_topic_models(dtm, varying_params, const_params,
                                          return_models=True)  # retain the calculated models

    # save the results as pickle
    print('saving results')
    pickle_data(models, 'data/lda_evaluation_results.pickle')

    # plot the results
    print('plotting evaluation results')
    results_by_n_topics = results_by_parameter(models, 'n_topics')
    plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                      title='Evaluation results for alpha=1/k, beta=0.1', figsize=(8, 6))
    plt.savefig('data/lda_evaluation_plot.png')
    plt.show()

    # the peak seems to be around n_topics == 120
    # print the distributions of this model
    n_topics_best_model = 120
    best_model = dict(results_by_n_topics)[n_topics_best_model]['model']

    print('saving final model with n_topics=%d' % n_topics_best_model)
    save_ldamodel_to_pickle('data/lda_evaluation_finalmodel.pickle', best_model, vocab, doc_labels, dtm)

    print('printing final model')
    print_ldamodel_topic_words(best_model.topic_word_, vocab)
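
A short sketch of restoring the saved model later, assuming tmtoolkit's load_ldamodel_from_pickle counterpart returns the same pieces that save_ldamodel_to_pickle stored:

from tmtoolkit.topicmod.model_io import load_ldamodel_from_pickle, print_ldamodel_topic_words

saved = load_ldamodel_from_pickle('data/lda_evaluation_finalmodel.pickle')
print_ldamodel_topic_words(saved['model'].topic_word_, saved['vocab'])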