def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, T, num_2_token):
    """Compare a gensim ``LdaModel`` with the pyartm default optimizer.

    For every seed in a fixed list this trains a gensim LDA model and a
    ``default.Optimizer`` run on the same training matrix, feeds both
    (phi, theta) pairs to ``experiments.default_callback`` and prints the
    metrics the callback collected for each of them.

    Args:
        train_n_dw_matrix: Matrix with a ``.shape`` of ``(D, W)`` whose rows
            expose ``.indices`` and ``.data`` (presumably a scipy CSR
            document-word count matrix -- confirm against the caller).
        test_n_dw_matrix: Held-out matrix, only forwarded to the callback.
        T: Number of topics used for both models.
        num_2_token: Passed to gensim as ``id2word``.
    """
    # Each document must be a real list: on Python 3 a bare ``zip`` is a
    # one-shot iterator without ``len()``, while the corpus below is handed
    # to LdaModel, get_theta and log_perplexity one after another.
    train_corpus = [
        list(zip(row.indices, row.data)) for row in train_n_dw_matrix
    ]
    # The matrix shape does not depend on the seed, so read it once.
    D, W = train_n_dw_matrix.shape

    for seed in [42, 7, 777, 12]:
        # --- gensim side ---
        model = LdaModel(
            train_corpus,
            alpha='auto',
            id2word=num_2_token,
            num_topics=T,
            iterations=500,
            random_state=seed)
        gensim_phi = exp_common.get_phi(model)
        gensim_theta = exp_common.get_theta(train_corpus, model)
        print('gensim perplexity')
        print(np.exp(-model.log_perplexity(train_corpus)))

        # --- pyartm side: random phi, uniform theta, same seed ---
        random_gen = np.random.RandomState(seed)
        phi = common.get_prob_matrix_by_counters(
            random_gen.uniform(size=(T, W)).astype(np.float64))
        theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T)).astype(np.float64))
        phi, theta = default.Optimizer(
            [regularizers.Additive(0.1, 0.)] * 100,
            verbose=False).run(train_n_dw_matrix, phi, theta)

        # Slot 0 of every metric holds the pyartm result, slot 1 the
        # gensim one (order of the two callback invocations below).
        callback = experiments.default_callback(
            train_n_dw_matrix=train_n_dw_matrix,
            test_n_dw_matrix=test_n_dw_matrix,
            top_pmi_sizes=[5, 10, 20, 30],
            top_avg_jaccard_sizes=[10, 50, 100, 200],
            measure_time=True)
        callback.start_launch()
        callback(0, phi, theta)
        callback(1, gensim_phi, gensim_theta)

        print('artm')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[0]))
        print('gensim')
        for name, values in callback.launch_result.items():
            print('\t{}: {}'.format(name, values[1]))
from pyartm import regularizers
from pyartm_datasets import main_cases
from pyartm.optimizations import timed_default

import manager


if __name__ == '__main__':
    # The 20 Newsgroups categories requested from the dataset loader.
    newsgroup_names = [
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
    ]
    # Only the first element of the loader's result is used here.
    loaded = main_cases.get_20newsgroups(newsgroup_names)
    n_dw_matrix = loaded[0]

    # One Additive(0., 0.) regularizer instance repeated for 100 entries.
    timed_optimizer = timed_default.Optimizer(
        regularization_list=[regularizers.Additive(0., 0.)] * 100,
        return_counters=True)

    manager.perform_experiment(n_dw_matrix, timed_optimizer, 10, 100)
ITERS_COUNT = 100 SAMPLES = 5 if __name__ == '__main__': train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups( [ 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space' ], train_proportion=0.8)[:2] args_list = list() for T in [10, 25]: for theta_alpha in [0.1, 0.01, 0.1]: regularization_list = [regularizers.Additive(0, theta_alpha) ] * ITERS_COUNT args_list.append( (train_n_dw_matrix, test_n_dw_matrix, default.Optimizer(regularization_list), T, SAMPLES, '20news_experiment/20news_{}t_default_{}_{}.pkl'.format( T, 0., theta_alpha))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, thetaless.Optimizer(regularization_list), T, SAMPLES, '20news_experiment/20news_{}t_thetaless_{}_{}.pkl'.format( T, 0., theta_alpha))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, transfer_thetaless.Optimizer(regularization_list), T, SAMPLES, '20news_experiment/20news_{}t_transfer_thetaless_{}_{}.pkl'.
'talk.politics.guns', ]) topic_0_indices, topic_1_indices = [], [] for index, target in enumerate(doc_targets): if target == 0: topic_0_indices.append(index) elif target == 1: topic_1_indices.append(index) thetaless_rels = [] lda_rels = [] for balance in range(10, 201, 10): print(balance) n_dw_matrix = _n_dw_matrix[topic_0_indices + topic_1_indices * balance, :] regularization_list = [regularizers.Additive(-0.1, 0.)] * 100 lda_phi, lda_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=default.Optimizer(regularization_list, verbose=False)) thetaless_phi, thetaless_theta = experiments.default_sample( n_dw_matrix, T=2, seed=42, optimizer=thetaless.Optimizer(regularization_list, verbose=False)) # print(np.argmax(thetaless_theta[:len(topic_0_indices), :2], axis=1).mean()) # print(np.argmax(thetaless_theta[len(topic_0_indices):, :2], axis=1).mean()) # print('!') # for topic_set in metrics.get_top_words(thetaless_phi, 10): # print('\n\t'.join(map(num_2_token.get, topic_set)))
import manager ITERS_COUNT = 100 SAMPLES = 50 if __name__ == '__main__': train_n_dw_matrix, test_n_dw_matrix = main_cases.get_nips( train_proportion=0.8)[:2] args_list = list() for T in [20, 50]: for phi_alpha in [-0.1, 0., 0.1]: for theta_alpha in [-0.1, 0., 0.1]: regularization_list = [ regularizers.Additive(phi_alpha, theta_alpha) ] * ITERS_COUNT args_list.append( (train_n_dw_matrix, test_n_dw_matrix, default.Optimizer(regularization_list), T, SAMPLES, 'nips_experiment/NIPS_{}t_base_{}_{}.pkl'.format( T, phi_alpha, theta_alpha))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, naive_thetaless.Optimizer(regularization_list), T, SAMPLES, 'nips_experiment/NIPS_{}t_naive_{}_{}.pkl'.format( T, phi_alpha, theta_alpha))) for use_B_cheat in [False, True]: args_list.append( (train_n_dw_matrix, test_n_dw_matrix,
from pyartm.optimizations import default

import manager


if __name__ == '__main__':
    # Train/test split over eight 20 Newsgroups categories; only the first
    # two elements of the loader's result are used.
    train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups(
        [
            'rec.autos',
            'rec.motorcycles',
            'rec.sport.baseball',
            'rec.sport.hockey',
            'sci.crypt',
            'sci.electronics',
            'sci.med',
            'sci.space',
        ],
        train_proportion=0.8)[:2]

    # One argument tuple per (T, tau, use_old_phi) configuration; Pool.map
    # hands each tuple to manager.perform_experiment as a single argument.
    args_list = list()
    for T in [10, 30]:
        for tau in [1e7, 1e8, 1.5e8, 2e8, 2.5e8,
                    3e8, 3.5e8, 4e8, 4.5e8, 5e8]:
            for use_old_phi in [True]:  # [False, True]
                # The same Combination instance is repeated 500 times.
                regularization_list = [
                    regularizers.Combination(
                        regularizers.Decorrelator(tau, use_old_phi),
                        regularizers.Additive(-0.01, -0.01),
                    )
                ] * 500
                args_list.append(
                    (train_n_dw_matrix, test_n_dw_matrix,
                     default.Optimizer(regularization_list), T, 10,
                     '20news_experiment/20news_{}t_{}_{}.pkl'.format(
                         T, int(tau), use_old_phi)))

    # Release the worker processes explicitly instead of leaving the pool
    # to be cleaned up at interpreter shutdown. map() blocks until every
    # task has finished, so terminating afterwards loses no work.
    pool = Pool(processes=8)
    try:
        pool.map(manager.perform_experiment, args_list)
    finally:
        pool.terminate()
        pool.join()
def get_optimizer(phi_alpha, iters_count):
    """Build a ``default.Optimizer`` driven by a single Additive regularizer.

    Args:
        phi_alpha: First argument of ``regularizers.Additive``; the second
            one is fixed to ``0.``.
        iters_count: How many times the regularizer is repeated in the
            list given to the optimizer.

    Returns:
        The constructed ``default.Optimizer``.
    """
    # A single regularizer object is shared by every entry of the list.
    additive = regularizers.Additive(phi_alpha, 0.)
    schedule = [additive] * iters_count
    return default.Optimizer(schedule)
SAMPLES = 100 INIT_ITERS = 100 if __name__ == '__main__': train_n_dw_matrix, test_n_dw_matrix = main_cases.get_20newsgroups( [ 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space' ], train_proportion=0.8)[:2] args_list = list() for T in [10, 25]: plsa_list = [regularizers.Trivial()] * ITERS_COUNT sparse_lda_list = [regularizers.Additive(-1, 0.)] * ITERS_COUNT args_list.append( (train_n_dw_matrix, test_n_dw_matrix, default.Optimizer(sparse_lda_list), T, SAMPLES, INIT_ITERS, '20news_experiment/20news_{}t_post_lda.pkl'.format(T))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, obd.Optimizer( plsa_list, gamma_tw_min_delta=1, ), T, SAMPLES, INIT_ITERS, '20news_experiment/20news_{}t_post_obd_limited.pkl'.format(T))) args_list.append( (train_n_dw_matrix, test_n_dw_matrix, naive_obd.Optimizer(
from pyartm import regularizers from pyartm.optimizations import default from pyartm.optimizations import thetaless from pyartm.optimizations import naive_thetaless from pyartm.optimizations import obd if __name__ == '__main__': # train_n_dw_matrix = sparse.csr_matrix(np.array([ # [1, 1, 1, 0], # [1, 0, 1, 1], # [1, 1, 0, 1], # ])) train_n_dw_matrix = sparse.csr_matrix( np.random.RandomState(42).uniform(0, 1, size=(100, 1000)) < 0.3) regularization_list = [regularizers.Trivial()] * 500 extra_opt = obd.Optimizer([regularizers.Additive(0, 0)] * 500, verbose=False) for module in [default, thetaless, naive_thetaless]: print(module.__name__) optimizer = module.Optimizer(regularization_list, verbose=False) D, W = train_n_dw_matrix.shape T = 4 random_gen = np.random.RandomState(47) phi_matrix = common.get_prob_matrix_by_counters( random_gen.uniform(size=(T, W)).astype(np.float64)) theta_matrix = common.get_prob_matrix_by_counters( np.ones(shape=(D, T)).astype(np.float64)) phi_matrix, theta_matrix = optimizer.run(train_n_dw_matrix, phi_matrix, theta_matrix) mod_phi_matrix, mod_theta_matrix = extra_opt.run(