import logging
import os
import time

import utils


def process(config_file, samples):
    # Set up logging.
    log = logging.getLogger(__name__)
    start_time = time.time()

    # Load the configuration from file.
    config = utils.load_config(config_file)

    # Load the entire dataset. This should only be done
    # once, because it's 1.5 GB at load time.
    data = utils.load_dataset(config)

    # Apply the nominal cuts to get the subset of events
    # considered good when using the "best" cut values.
    nominal_filter = utils.build_filter(data)
    nominal_data = utils.build_dataframe(data, nominal_filter)

    # Randomize the sector assignments as a sanity check:
    # the randomized sectors should reproduce the same answer.
    utils.randomize_sector(data)

    varfile = os.path.join(os.path.dirname(__file__), '..', '..',
                           'variations.json')
    variations = load_variations(varfile)

    # Use quantile binning to get integrated bins
    # for the axes listed in the configuration.
    bins = setup_binning(config, nominal_data)

    # Calculate the results for the nominal subset of data.
    results = {}
    results['nominal'] = utils.get_results(nominal_data, bins, config)

    # Calculate the results for each sector, drawing
    # `samples` random cut configurations per sector.
    for sector in range(1, 7):
        sector_data = data[data['sector'] == sector]

        for imc in range(samples):
            var_time = time.time()
            log.info('Doing sector {}, sample {}'.format(sector, imc))

            random_filter = utils.get_random_config(sector_data, variations)
            random_data = utils.build_dataframe(sector_data, random_filter)
            sect_result = utils.get_results(random_data, bins, config)

            elapsed_time = time.time() - var_time
            log.info('Elapsed time %.3f' % elapsed_time)

            output_filename = (config['database_path'] + 'phi/random/sector_'
                               + str(sector) + '_{}.csv'.format(imc))
            sect_result.to_csv(output_filename, index=False)

    exe_time = time.time() - start_time
    log.info('Finished execution in %.3f seconds.' % exe_time)
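# load_variations is called above (and again in process below) but is not
# defined in this section. A minimal sketch, assuming the JSON file simply
# maps each cut parameter to indexed [min, max] windows -- the structure
# inferred from how variations[par][index][0] and [1] are used later:
import json


def load_variations(path):
    """Load systematic variation windows from a JSON file.

    Assumed structure: {"dist_vz": {"0": [-1.1, 0.9], "1": [-0.9, 1.1]}, ...}
    """
    with open(path, 'r') as jsonfile:
        return json.load(jsonfile)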
import pickle
from functools import partial

import numpy as np
from bayes_opt import BayesianOptimization

import utils


def process_par_set(data, config, bins,
                    alpha_min=0.55, alpha_max=1.0,
                    dist_cc_min=-1.0, dist_cc_max=1.0,
                    dist_cc_theta_min=-1.0, dist_cc_theta_max=1.0,
                    dist_dcr1_min=-1.0, dist_dcr1_max=1.0,
                    dist_dcr3_min=-1.0, dist_dcr3_max=1.0,
                    dist_ecsf_min=-1.0, dist_ecsf_max=1.0,
                    dist_ecu_min=-1.0, dist_ecu_max=1.0,
                    dist_ecv_min=-1.0, dist_ecv_max=1.0,
                    dist_ecw_min=-1.0, dist_ecw_max=1.0,
                    dist_ec_edep_min=-1.0, dist_ec_edep_max=1.0,
                    dist_vz_min=-1.0, dist_vz_max=1.0,
                    missing_mass_min=0.0, missing_mass_max=5.0,
                    p_mes_min=0.35, p_mes_max=2.0):
    """Process one set of cut values and return the sector-agreement metric."""
    conf = {}
    conf['alpha'] = [alpha_min, alpha_max]
    conf['dist_cc'] = [dist_cc_min, dist_cc_max]
    conf['dist_cc_theta'] = [dist_cc_theta_min, dist_cc_theta_max]
    conf['dist_dcr1'] = [dist_dcr1_min, dist_dcr1_max]
    conf['dist_dcr3'] = [dist_dcr3_min, dist_dcr3_max]
    conf['dist_ecsf'] = [dist_ecsf_min, dist_ecsf_max]
    conf['dist_ecu'] = [dist_ecu_min, dist_ecu_max]
    conf['dist_ecv'] = [dist_ecv_min, dist_ecv_max]
    conf['dist_ecw'] = [dist_ecw_min, dist_ecw_max]
    conf['dist_ec_edep'] = [dist_ec_edep_min, dist_ec_edep_max]
    conf['dist_vz'] = [dist_vz_min, dist_vz_max]
    conf['missing_mass'] = [missing_mass_min, missing_mass_max]
    conf['p_mes'] = [p_mes_min, p_mes_max]

    data_filter = utils.build_filter(data, conf)
    df = utils.build_dataframe(data, data_filter)

    # Number of result points per sector, matching the length
    # of the result returned by utils.get_results.
    npoints = len(config['axes']) * config['n_bins'] * 12
    sector_values = np.zeros(shape=(npoints, 6))
    sector_errors = np.zeros(shape=(npoints, 6))
    for i in range(1, 7):
        sector_data = df[df['sector'] == i]
        sector_result = utils.get_results(sector_data, bins, config)
        sector_values[:, i - 1] = sector_result['value'].values
        sector_errors[:, i - 1] = sector_result['stat'].values

    # Reward cut values on which the six sectors agree: the smaller the
    # total sector-to-sector variance, the closer the metric is to its
    # maximum of 1.
    var = np.var(sector_values, axis=1)
    metric = np.exp(-0.5 * np.sum(var))
    return metric
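# setup_binning is called below but is not defined in this section. A
# minimal sketch of the quantile binning described in the comments,
# assuming the config keys 'axes' and 'n_bins' used above; the real
# implementation may structure the returned bins differently:
def setup_binning(config, df):
    """Return equal-population (quantile) bin edges for each configured axis."""
    bins = {}
    for axis in config['axes']:
        quantiles = np.linspace(0.0, 1.0, config['n_bins'] + 1)
        bins[axis] = np.quantile(df[axis].values, quantiles)
    return bins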
    'dist_ec_edep_min': (-1.1, -0.9),
    'dist_ec_edep_max': (0.9, 1.1),
    'dist_vz_min': (-1.1, -0.9),
    'dist_vz_max': (0.9, 1.1),
    'missing_mass_min': (0.0, 1.75),
    'p_mes_min': (0.3, 0.4),
    'p_mes_max': (1.6, 1.8)
}

# Load the configuration file and the entire dataset (once).
config = utils.load_config(args.config)
data = utils.load_dataset(config)

# Build the nominal dataset, used here only to define the binning.
nominal_filter = utils.build_filter(data)
nominal_data = utils.build_dataframe(data, nominal_filter)
bins = setup_binning(config, nominal_data)

# Bind the fixed arguments so the optimizer only varies the cut values.
objective_fn = partial(process_par_set, data=data, config=config, bins=bins)

opt = BayesianOptimization(f=objective_fn,
                           pbounds=parameter_bounds,
                           random_state=1)
opt.maximize(init_points=args.init_points, n_iter=args.n_iter)
print(opt.max)

# Use a context manager so the pickle file is flushed and closed.
with open('best_params.pkl', 'wb') as output_file:
    pickle.dump(opt.max, output_file)
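# For reference, bayes_opt stores the best point found as a dict of the
# form {'target': <best metric>, 'params': {<parameter name>: <value>}},
# which is why process() below reads bayes_pars['params']. A quick
# round-trip check of the file just written:
with open('best_params.pkl', 'rb') as checkfile:
    best = pickle.load(checkfile)
print('best metric: {:.4f}'.format(best['target']))
for name, value in sorted(best['params'].items()):
    print('{} = {:.4f}'.format(name, value))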
import logging
import pickle
import time

import utils


def process(config_file):
    # Set up logging.
    log = logging.getLogger(__name__)
    start_time = time.time()

    # Load the configuration from file.
    config = utils.load_config(config_file)

    # Load the entire dataset. This should only be done
    # once, because it's 1.5 GB at load time.
    data = utils.load_dataset(config)
    utils.randomize_sector(data)

    # Apply the nominal cuts to get the subset of events
    # considered good when using the "best" cut values.
    # `args` is the module-level parsed command line.
    if args.bayes_opt_pars is not None:
        log.info('Using Bayesian-optimized parameters for nominal.')
        with open(args.bayes_opt_pars, 'rb') as f:
            bayes_pars = pickle.load(f)

        params = {str(k): float(v) for k, v in bayes_pars['params'].items()}
        bayes_conf = build_bayesian_optimized_config(**params)
        nominal_filter = utils.build_filter(data, bayes_conf)
    else:
        nominal_filter = utils.build_filter(data)

    nominal_data = utils.build_dataframe(data, nominal_filter)

    # Use quantile binning to get integrated bins
    # for the axes listed in the configuration.
    bins = setup_binning(config, nominal_data)

    # Calculate the results for the nominal subset of data.
    results = {}
    results['nominal'] = utils.get_results(nominal_data, bins, config)

    # Calculate and write the results for each sector.
    for sector in range(1, 7):
        var_time = time.time()
        log.info('Doing sector {}'.format(sector))

        sector_data = nominal_data[nominal_data['sector'] == sector]
        sect_result = utils.get_results(sector_data, bins, config)

        elapsed_time = time.time() - var_time
        log.info('Elapsed time %.3f' % elapsed_time)

        output_filename = (config['database_path'] + 'phi/sector_'
                           + str(sector) + '.csv')
        sect_result.to_csv(output_filename, index=False)

    del nominal_data

    # Define the variations to consider. These are
    # the systematics that are applied.
    variations = load_variations(config['variation_file'])
    for par in variations.keys():
        results[par] = {}

        for index in variations[par].keys():
            var_time = time.time()
            log.info('Doing %.3f < %s < %.3f' %
                     (variations[par][index][0], par,
                      variations[par][index][1]))

            # Build a filter that varies only this one cut.
            temp_dict = {}
            temp_dict[par] = variations[par][index]

            temp_filter = utils.build_filter(data, temp_dict)
            temp_data = utils.build_dataframe(data, temp_filter)
            results[par][index] = utils.get_results(temp_data, bins, config)
            del temp_data

            end_var_time = time.time() - var_time
            log.info('Elapsed time %.3f' % end_var_time)

    # Using all variations, assign systematic
    # uncertainties to the dataframe.
    systematic_sources = assign_systematics(results)
    with open(config['systematics_file'], 'wb') as outputfile:
        pickle.dump(systematic_sources, outputfile)

    # Write the nominal results to file.
    results['nominal'].to_csv(config['output_filename'], index=False)

    # Write the variation results, skipping the nominal and
    # per-sector entries that were already written above.
    dont_write = ['sector_{}'.format(s) for s in range(1, 7)]
    dont_write.append('nominal')

    for key in results.keys():
        if key not in dont_write:
            for conf in results[key]:
                output_filename = (config['database_path'] + 'phi/variation_'
                                   + key + '_' + str(conf) + '.csv')
                results[key][conf].to_csv(output_filename, index=False)

    exe_time = time.time() - start_time
    log.info('Finished execution in %.3f seconds.' % exe_time)
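# build_bayesian_optimized_config is not defined in this section. A minimal
# sketch, assuming it inverts the flat *_min / *_max naming used by
# process_par_set back into the {par: [min, max]} dict that
# utils.build_filter expects. Bounds the optimizer did not vary would need
# nominal defaults, which this sketch deliberately leaves as None:
def build_bayesian_optimized_config(**params):
    """Regroup flat *_min / *_max keyword arguments into cut windows."""
    conf = {}
    for key, value in params.items():
        par, _, bound = key.rpartition('_')
        window = conf.setdefault(par, [None, None])
        window[0 if bound == 'min' else 1] = value
    return conf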