def perform_experiment(n_dw_matrix, optimizer, T, samples):
    """Run `samples` training launches on one matrix and print timings.

    Attaches the default metric callback (PMI / Jaccard top sizes, timing
    enabled) to `optimizer`, runs one sampled launch per seed, then prints
    the accumulated timer totals.
    """
    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200],
        measure_time=True)
    for launch_seed in range(samples):
        print(launch_seed)
        experiments.default_sample(n_dw_matrix, T, launch_seed, optimizer)
    # Dump the wall-clock totals gathered by the timed optimizer.
    print(timed_default.SimpleTimer.total_times)
def perform_experiment(args):
    """Run `samples` training launches and save callback metrics.

    `args` is a single 6-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (train_n_dw_matrix, test_n_dw_matrix, optimizer, T, samples,
     output_path).
    """
    (train_n_dw_matrix, test_n_dw_matrix,
     optimizer, T, samples, output_path) = args
    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200])
    for seed in range(samples):
        print(seed)
        experiments.default_sample(train_n_dw_matrix, T, seed, optimizer)
    optimizer.iteration_callback.save_results(output_path)
def perform_iteration_dependency_experiment(args):
    """Measure metric dependence on iteration number across launches.

    `args` is a single 6-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (train_n_dw_matrix, test_n_dw_matrix, optimizer, T, samples,
     output_path).
    """
    (train_n_dw_matrix, test_n_dw_matrix,
     optimizer, T, samples, output_path) = args
    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        uniqueness_measures=True,
        iter_eval_step=5  # evaluate metrics every 5th iteration
    )
    for seed in range(samples):
        print(seed)
        experiments.default_sample(train_n_dw_matrix, T, seed, optimizer)
    optimizer.iteration_callback.save_results(output_path)
def perform_experiment(args):
    """Warm-start launches: plain PLSA init followed by the main optimizer.

    `args` is a single 7-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (train_n_dw_matrix, test_n_dw_matrix, optimizer, T, samples,
     init_iters, output_path).
    """
    (train_n_dw_matrix, test_n_dw_matrix, optimizer,
     T, samples, init_iters, output_path) = args
    # Un-regularized optimizer used only to produce the PLSA warm start.
    init_optimizer = default.Optimizer([regularizers.Trivial()] * init_iters)
    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix
    )
    # Both phases report into the same callback so one launch spans
    # init iterations plus the main optimizer's iterations.
    init_optimizer.iteration_callback = callback
    optimizer.iteration_callback = callback
    for seed in range(samples):
        print(seed)
        plsa_phi, plsa_theta = experiments.default_sample(
            train_n_dw_matrix=train_n_dw_matrix,
            T=T,
            seed=seed,
            optimizer=init_optimizer,
            finish_launch=False,  # launch is finished manually below
        )
        optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)
        # NOTE(review): finish_launch per seed reconstructed from the
        # flattened source (pairs with finish_launch=False above) — confirm.
        if optimizer.iteration_callback:
            optimizer.iteration_callback.finish_launch()
    optimizer.iteration_callback.save_results(output_path)
def perform_plots(args):
    """Collect matrix snapshots during a two-phase optimization for plotting.

    `args` is a single 3-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (train_n_dw_matrix, T, output_path).
    """
    (train_n_dw_matrix, T, output_path) = args
    matrices = []
    # Snapshot (copies of) the count and gamma matrices every 20 iterations.
    gamma_callback = (
        lambda it, n_tw, n_dt, gamma_tw, gamma_dt:
            matrices.append((it, np.copy(n_tw), np.copy(n_dt),
                             np.copy(gamma_tw), np.copy(gamma_dt)))
            if it % 20 == 0 else None
    )
    init_optimizer = obd.Optimizer(
        [regularizers.Trivial()] * 100,
        gamma_tw_min_delta=-100000,
        gamma_callback=gamma_callback
    )
    post_optimizer = obd.Optimizer(
        [regularizers.Trivial()] * 121,
        gamma_tw_min_delta=0.1,
        gamma_tw_max_delta=40,
        gamma_tw_delta_percentile=0.5,
        gamma_callback=gamma_callback
    )
    plsa_phi, plsa_theta = experiments.default_sample(
        train_n_dw_matrix=train_n_dw_matrix,
        T=T,
        seed=42,  # single fixed-seed launch; no averaging over samples here
        optimizer=init_optimizer,
    )
    post_optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)
    save_results(matrices, output_path)
def perform_alpha_dependency_experiment(args):
    """Evaluate final phi/theta of each launch with a manual callback call.

    `args` is a single 5-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (train_n_dw_matrix, optimizer, T, samples, output_path).
    """
    (train_n_dw_matrix, optimizer, T, samples, output_path) = args
    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        uniqueness_measures=True
    )
    # NOTE(review): start/finish placement reconstructed from the flattened
    # source's statement order (start before the loop, finish after) —
    # confirm one launch is meant to span all seeds.
    callback.start_launch()
    for seed in range(samples):
        print(seed)
        # Evaluate only the final matrices of the launch (iteration 0).
        callback(0, *experiments.default_sample(
            train_n_dw_matrix, T, seed, optimizer
        ))
    callback.finish_launch()
    callback.save_results(output_path)
def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, optimizer, T,
                       samples, output_path, tau, path_phi_output):
    """Run `samples` launches, save metrics, and pickle the last phi.

    Returns the (phi, theta) pair produced by the final seed's launch;
    that same phi matrix is also pickled to `path_phi_output`.
    """
    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200],
        measure_time=False)
    for seed in range(samples):
        last_phi, last_theta = experiments.default_sample(
            train_n_dw_matrix, T, seed, optimizer, tau)
    optimizer.iteration_callback.save_results(output_path)
    # Persist only the final launch's phi matrix.
    with open(path_phi_output, 'wb') as phi_file:
        pickle.dump(last_phi, phi_file)
    return (last_phi, last_theta)
def perform_lda(
        n_dw_matrix, optimizer, T, samples, output_path,
        init_phi=None, init_theta=None
):
    """Run `samples` launches, collecting flattened phi matrices and
    perplexities, and pickle the results to `output_path`.

    `init_phi` / `init_theta` are forwarded as zero-masked initializations
    to `experiments.default_sample` and recorded in the output file.
    """
    calc_perplexity = metrics.calc_perplexity_function(n_dw_matrix)
    phis = []
    perplexities = []
    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(
            n_dw_matrix, T, seed, optimizer,
            init_phi_zeros=init_phi,
            init_theta_zeros=init_theta
        )
        phis.append(phi.flatten())
        perplexities.append(calc_perplexity(phi, theta))
    # exist_ok avoids the exists()/makedirs() race; guard against an empty
    # dirname when output_path has no directory component.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Bug fix: pickle requires a binary-mode file — 'w' raises TypeError
    # on Python 3.
    with open(output_path, 'wb') as output_file:
        pickle.dump({
            'init_phi': init_phi,
            'init_theta': init_theta,
            'perplexities': perplexities,
            'phis': phis
        }, output_file)
# Split document indices by their binary class label.
for doc_index, doc_label in enumerate(doc_targets):
    if doc_label == 0:
        topic_0_indices.append(doc_index)
    elif doc_label == 1:
        topic_1_indices.append(doc_index)

thetaless_rels = []
lda_rels = []
for balance in range(10, 201, 10):
    print(balance)
    # Oversample class-1 documents `balance` times to create an
    # increasingly imbalanced corpus.
    n_dw_matrix = _n_dw_matrix[topic_0_indices + topic_1_indices * balance, :]
    regularization_list = [regularizers.Additive(-0.1, 0.)] * 100
    lda_phi, lda_theta = experiments.default_sample(
        n_dw_matrix, T=2, seed=42,
        optimizer=default.Optimizer(regularization_list, verbose=False))
    thetaless_phi, thetaless_theta = experiments.default_sample(
        n_dw_matrix, T=2, seed=42,
        optimizer=thetaless.Optimizer(regularization_list, verbose=False))
# Four science groups from 20newsgroups; only the count matrix is used.
n_dw_matrix, _, _, _ = main_cases.get_20newsgroups([
    'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'
])

print('Original PLSA')
perform_lda(
    n_dw_matrix,
    optimizer=get_optimizer(0., 100),
    T=10,
    samples=300,
    output_path='stability_exp/plsa.pkl'
)

print('Full initialized PLSA')
# LDA-regularized launch whose result seeds a plain-PLSA refinement.
phi, theta = experiments.default_sample(
    n_dw_matrix, T=10, seed=42,
    optimizer=get_optimizer(-0.1, 100)
)
init_phi, init_theta = experiments.default_sample(
    n_dw_matrix, T=10, seed=42,
    optimizer=get_optimizer(0., 100),
    init_phi_zeros=phi,
    init_theta_zeros=theta
)
perform_lda(
    n_dw_matrix,
    optimizer=get_optimizer(0., 100),
    T=10,
    samples=300,
    output_path='stability_exp/full_initialized_plsa.pkl',
    init_phi=init_phi,
    init_theta=init_theta
)

print('Synthetic PLSA')
# Reconstruct the expected doc-word distribution from the fitted model.
matrix = np.dot(init_theta, init_phi)
matrix[np.isnan(matrix)] = 0.
synthetic_n_dw_matrix = scipy.sparse.csr_matrix(matrix)
def perform_doc_experiment(args):
    """Document-classification stability experiment.

    Trains a topic model per seed, fits an SVM on the train thetas, then
    tracks test-set accuracy across iterations for three inference
    schemes (PLSA with free phi, PLSA with frozen phi, thetaless ARTM).

    `args` is a single 8-tuple — kept so existing callers that passed one
    tuple keep working after removing the Python 2 tuple-parameter
    signature (a SyntaxError on Python 3, see PEP 3113):
    (n_dw_matrix_doc_train, doc_targets_doc_train, n_dw_matrix_doc_test,
     doc_targets_doc_test, optimizer, T, samples, output_path).
    """
    (n_dw_matrix_doc_train, doc_targets_doc_train,
     n_dw_matrix_doc_test, doc_targets_doc_test,
     optimizer, T, samples, output_path) = args
    D, _ = n_dw_matrix_doc_test.shape
    svm_train_score = metrics.create_svm_score_function(
        doc_targets_doc_train, verbose=False)
    # Three inference schemes for the held-out documents, each using only
    # the first 10 regularizers of the main optimizer.
    opt_plsa_not_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10],
        const_phi=False)
    opt_plsa_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10],
        const_phi=True)
    opt_artm_thetaless = thetaless.Optimizer(
        regularization_list=optimizer.regularization_list[:10])
    res_plsa_not_const_phi = []
    res_plsa_const_phi = []
    res_artm_thetaless = []
    cv_fold_scores = []
    cv_test_scores = []
    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(
            n_dw_matrix_doc_train, T, seed, optimizer)
        (best_C, best_gamma,
         cv_fold_score, cv_test_score) = svm_train_score(theta)
        cv_fold_scores.append(cv_fold_score)
        cv_test_scores.append(cv_test_score)
        print('Fold score: {}\tTest score: {}'.format(
            cv_fold_score, cv_test_score))
        algo = SVC(C=best_C, gamma=best_gamma).fit(
            theta, doc_targets_doc_train)
        # Uniform theta initialization for test-set inference.
        init_theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T), dtype=np.float64))
        plsa_not_const_phi = []
        plsa_const_phi = []
        artm_thetaless = []
        # NOTE(review): accuracy_score's (y_true, y_pred) slots are swapped
        # here; accuracy is symmetric, so the value is unaffected.
        opt_plsa_not_const_phi.iteration_callback = (
            lambda it, phi, theta: plsa_not_const_phi.append(
                accuracy_score(algo.predict(theta), doc_targets_doc_test)))
        opt_plsa_const_phi.iteration_callback = (
            lambda it, phi, theta: plsa_const_phi.append(
                accuracy_score(algo.predict(theta), doc_targets_doc_test)))
        opt_artm_thetaless.iteration_callback = (
            lambda it, phi, theta: artm_thetaless.append(
                accuracy_score(algo.predict(theta), doc_targets_doc_test)))
        for opt in [
            opt_plsa_not_const_phi,
            opt_plsa_const_phi,
            opt_artm_thetaless
        ]:
            opt.run(n_dw_matrix_doc_test, phi, init_theta)
        res_plsa_not_const_phi.append(plsa_not_const_phi)
        res_plsa_const_phi.append(plsa_const_phi)
        res_artm_thetaless.append(artm_thetaless)
    callbacks.save_results(
        {
            'res_plsa_not_const_phi': res_plsa_not_const_phi,
            'res_plsa_const_phi': res_plsa_const_phi,
            'res_artm_thetaless': res_artm_thetaless,
            'cv_fold_scores': cv_fold_scores,
            'cv_test_scores': cv_test_scores
        },
        output_path)
'talk.politics.guns', ]) topic_0_indices, topic_1_indices = [], [] for index, target in enumerate(doc_targets): if target == 0: topic_0_indices.append(index) elif target == 1: topic_1_indices.append(index) for balance in [1, 2, 5, 10, 20, 50, 100, 200, 300, 500]: n_dw_matrix = _n_dw_matrix[topic_0_indices + topic_1_indices * balance, :] regularization_list = [regularizers.Additive(-0.1, 0.)] * 100 lda_phi, lda_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=default.Optimizer(regularization_list, verbose=False)) balanced_phi, balanced_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=balanced.Optimizer(regularization_list, verbose=False)) balanced_ptdw_phi, balanced_ptdw_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=balanced_ptdw.Optimizer(regularization_list, verbose=False)) for topic_set in metrics.get_top_words(lda_phi, 10): print('\n\t'.join(map(num_2_token.get, topic_set)))