def compute_pk_large(cfg_file, random_seed, file_str='rank3'):
    """Compute p(k) for the top-3 kernel combinations at each subset size
    and draw a line plot of the results.

    Args:
        cfg_file: path to the experiment configuration file.
        random_seed: seed suffix identifying the results folder.
        file_str: tag used in the output file names.
    """
    settings, paths, base_folder = cfg.get_settings(cfg_file, 'sub')
    batch = int(settings['num_of_batch_per_subset'])
    n_subsets = int(settings['num_of_subset'])
    # cumulative subset sizes: batch, 2*batch, ..., n_subsets*batch
    batch_sizes = np.array(range(n_subsets)) * batch + batch
    working_folder = '../results/' + base_folder + '_rnd' + str(random_seed) + '/'

    # top-3 kernel combinations selected from earlier runs
    multi_ker_str = ana_utils.get_top_kern_cmb(batch_sizes, 3, working_folder)
    print(multi_ker_str)

    n_samples = 1000
    prior_pg = 0.1
    for size in batch_sizes:
        lk.compute_pk(size,
                      file_str,
                      n_samples,
                      prior_pg,
                      top_n=None,
                      multi_ker_str=multi_ker_str,
                      working_folder=working_folder,
                      normalized=True,
                      scaler=2)

    sp.line_plot_pk(working_folder, file_str, n_samples, prior_pg,
                    batch_sizes, True, settings['minibatch_size'],
                    settings['data_to_use'])
def plot_pk_subset(cfg_file, random_seed):
    """For a fixed list of kernel combinations: compute p(k) per subset
    size, run subset BMA, then plot RMSE plus line/bar p(k) charts.

    Args:
        cfg_file: path to the experiment configuration file.
        random_seed: seed suffix identifying the results folder.
    """
    settings, paths, base_folder = cfg.get_settings(cfg_file, 'sub')
    data_to_use_size = settings['data_to_use']
    minibatch_size = settings['minibatch_size']
    batch = int(settings['num_of_batch_per_subset'])
    n_subsets = int(settings['num_of_subset'])
    # cumulative subset sizes: batch, 2*batch, ..., n_subsets*batch
    batch_sizes = np.array(range(n_subsets)) * batch + batch
    working_folder = '../results/' + base_folder + '_rnd' + str(random_seed) + '/'

    datastd = 1
    # hand-picked kernel combinations to evaluate
    multi_ker_str = [
        's', 'r', 'p', 'p*r+s', 'p*r', 'p+r*p', 'p*p*p', 's*s+s', 'p+p*r',
        'r*r+s', 's*s+r', 'p+p*p'
    ]
    file_str = 'group2'
    n_samples = 2000
    prior = 0.1

    # per-subset kernel probabilities
    for size in batch_sizes:
        lk.compute_pk(size,
                      file_str,
                      n_samples,
                      prior,
                      top_n=None,
                      multi_ker_str=multi_ker_str,
                      working_folder=working_folder,
                      normalized=True)

    # per-subset Bayesian model averaging (RMSE)
    for size in batch_sizes:
        BMA.subset_BMA(working_folder, file_str, size, n_samples, prior, True)

    # gather + plot RMSE, then the p(k) line and bar plots
    pr.gather_and_plot_rmse(working_folder, batch_sizes, file_str, n_samples,
                            prior, minibatch_size, data_to_use_size, datastd,
                            normalized=True)
    sp.line_plot_pk(working_folder, file_str, n_samples, prior, batch_sizes,
                    True, minibatch_size, data_to_use_size)
    sp.bar_plot_pk(working_folder, file_str, n_samples, prior, batch_sizes,
                   True, 'subset')
def plot_rmse(cfg_file, random_seed, file_str='top10'):
    """Compute p(k) for the top-10 kernels per subset size, run subset
    BMA, and plot the (normalized) RMSE results.

    Args:
        cfg_file: path to the experiment configuration file.
        random_seed: seed suffix identifying the results folder.
        file_str: tag used in the output file names.
    """
    settings, paths, base_folder = cfg.get_settings(cfg_file, 'sub')
    data_to_use_size = settings['data_to_use']
    minibatch_size = settings['minibatch_size']
    batch = int(settings['num_of_batch_per_subset'])
    n_subsets = int(settings['num_of_subset'])
    # cumulative subset sizes: batch, 2*batch, ..., n_subsets*batch
    batch_sizes = np.array(range(n_subsets)) * batch + batch

    # normalized results are written under the seed-specific folder
    datastd = 1
    working_folder = '../results/' + base_folder + '_rnd' + str(random_seed) + '/'
    n_samples = 1000
    prior_pg = 0.5

    # --- prob, BMA, plot ---
    for size in batch_sizes:
        lk.compute_pk(size,
                      file_str,
                      n_samples,
                      prior_pg,
                      top_n=10,
                      multi_ker_str=None,
                      working_folder=working_folder,
                      normalized=True)
    for size in batch_sizes:
        BMA.subset_BMA(working_folder, file_str, size, n_samples, prior_pg, True)
    pr.gather_and_plot_rmse(working_folder, batch_sizes, file_str, n_samples,
                            prior_pg, minibatch_size, data_to_use_size,
                            datastd, normalized=True)
import sys

import utils.parse_cfg as cfg
import utils.get_init_dict as gidc

# NOTE(review): pd, np, os, use_gpu and twi are referenced below but not
# imported in this chunk — presumably imported elsewhere in the file; confirm.

# --- command-line arguments ---
gpu_id = int(sys.argv[1])
use_gpu(gpu_id)
group = int(sys.argv[2])
start = int(sys.argv[3])
group_size = int(sys.argv[4])

# Select this worker's slice of candidate kernel strings.
ker = pd.read_csv('kernelstring3.csv')
multi_ker_str = np.array(ker).reshape(-1)[start + group * group_size:
                                          start + (group + 1) * group_size]

cfg_file = '../config/' + sys.argv[5]
settings, paths, working_folder = cfg.get_settings(cfg_file, "full")
datafile = paths['datafile']

# Optional warm-start hyperparameters for the selected kernels.
multi_ker_init_dict = None
if paths['init_hyper_file'] is not None:
    multi_ker_init_dict = gidc.get_saved_init(multi_ker_str,
                                              paths['init_hyper_file'])

working_folder = '../results/' + working_folder + "/"
# makedirs(..., exist_ok=True) replaces the exists()+mkdir() pair: it is
# race-free when several workers start at once and also creates any
# missing parent directories.
os.makedirs(working_folder, exist_ok=True)

settings['initfile'] = paths["init_hyper_file"]
print(settings)
# Persist the effective settings next to the results for reproducibility.
df = pd.DataFrame(settings, index=[0])
df.to_csv(working_folder + "setting.csv")

twi.train_fulldata(datafile, settings, multi_ker_str, multi_ker_init_dict,
                   working_folder)
    # NOTE(review): this `return` closes a function whose definition starts
    # before this chunk — kept as-is; indentation level assumed to be 4.
    return


if __name__ == "__main__":
    # Swiss-dataset configuration (an alternate air-temperature
    # configuration is kept below, commented out).
    cfg_file = '../../config/swiss_cfg_1.json'
    file_str = 'swiss_top10'
    prior_pg = 0.5
    seed = 10
    # cfg_file = '../../config/air_cfg_1.json'
    # file_str = 'temper_top10'
    # prior_pg = 0.5
    # seed = 10

    settings, paths, wf = cfg.get_settings(cfg_file, 'full')
    # Summary of full-data training produced by an earlier run.
    elbo_file_name = '../../results/' + wf + '/' + 'fulldata_summary.csv'
    domain, evidence_all, train_time_all, rmse_all = pre_trained_res(
        elbo_file_name)
    datafile = '../' + paths['datafile']
    datastd = 1

    # Subset-run results live in a seed-specific folder; the RMSE file name
    # encodes sample size (1000) and the prior used.
    _, _, wf_sub = cfg.get_settings(cfg_file, 'sub')
    bayesian_file_name = '../../results/' + wf_sub + '_rnd' + str(
        seed) + '/' + 'plots_res/' + file_str + "_RMSE_comparison_" + 'ss' + str(
            1000) + '_p' + str(prior_pg) + "_normalized.csv"

    # Bayesian-optimization baseline results for the chosen subset size.
    subset_size = 200
    bo_res_file = '../../results/' + wf + '/' + 'bo_subsize' + str(
        subset_size) + '.pkl'
    plot_rmse_cmp(elbo_file_name, bayesian_file_name, bo_res_file, datastd)
    # ==averaging multiple runs:
    # saving_folder = '../../results/multiple_results/'
import utils.parse_cfg as cfg
import training_wrap_inputs as twi

# NOTE(review): sys, pd, np, use_gpu and gidc are referenced below but not
# imported in this chunk — presumably imported elsewhere in the file; confirm.
# This chunk also appears truncated: settings are prepared but no training
# call is visible here.

# --- command-line arguments ---
gpu_id = int(sys.argv[1])
use_gpu(gpu_id)
group = int(sys.argv[2])
random_seed = int(sys.argv[3])  # multiple runs
start = int(sys.argv[4])
group_size = int(sys.argv[5])

# Select this worker's slice of candidate kernel strings.
ker = pd.read_csv('kernelstring3.csv')
multi_ker_str = np.array(ker).reshape(-1)[start + group * group_size:start +
                                          (group + 1) * group_size]

cfg_file = '../config/' + sys.argv[6]
settings, paths, working_folder = cfg.get_settings(cfg_file, 'sub')
num_batch = int(settings['num_of_batch_per_subset'])
num_subset = int(settings['num_of_subset'])
# cumulative subset sizes: num_batch, 2*num_batch, ...
batch_sizes = np.array(range(num_subset)) * num_batch + num_batch
# sizes of previously-trained subsets that can be reused (starts at 0)
reuse_batch_sizes = np.array(range(num_subset)) * num_batch
datafile = paths['datafile']

# Optional warm-start hyperparameters for the selected kernels.
multi_ker_init_dict = None
if paths['init_hyper_file'] is not None:
    multi_ker_init_dict = gidc.get_saved_init(multi_ker_str,
                                              paths['init_hyper_file'])

settings['random_s'] = random_seed
settings['initfile'] = paths["init_hyper_file"]
print(settings)
        # NOTE(review): continuation of a dict literal whose opening brace is
        # before this chunk — indentation assumed.
        'best_ker': domain[best_ker_ind],
        'best_ker_elbo': max(evidence_all)}
    # Persist the BO results for later comparison plots.
    with open(result_filename, 'wb') as fout:
        pickle.dump(bo_results, fout)


if __name__ == "__main__":
    # Swiss-dataset configuration (an alternate air-temperature
    # configuration is kept below, commented out).
    cfg_file = '../../config/swiss_cfg_1.json'
    subset_candi_size = 200000
    # cfg_file = '../../config/air_cfg_1.json'
    # subset_candi_size = 10000
    subset_size = 200

    _, paths, wf = cfg.get_settings(cfg_file, 'full')
    wf = '../../results/' + wf + "/"
    elbo_file_name = wf + 'fulldata_summary.csv'
    # Build the full-data summary CSV on first use.
    if not os.path.exists(elbo_file_name):
        k = gtr.get_kernel_names(wf)
        df = gtr.get_fulldata_res(wf, k)
        df.to_csv(elbo_file_name)
    domain, evidence_all, train_time_all, _ = pre_trained_res(elbo_file_name)
    # NOTE(review): self-assignment is a no-op — candidate for removal.
    evidence_all = evidence_all

    datafile = '../' + paths['datafile']
    bo_res_file = wf + 'bo_subsize' + str(subset_size) + '.pkl'
    run_bo(datafile, subset_candi_size, subset_size, bo_res_file)