def _hist_in_score_bins(axes, df, var, var_bins, bdt_bins, colours):
    """Overlay one normalised step histogram of `var` per BDT-score bin.

    Args:
        axes: matplotlib axes to draw on.
        df: dataframe providing the `var`, 'weight' and 'bdt_score' columns.
        var: name of the column to histogram.
        var_bins: bin edges for the `var` axis.
        bdt_bins: ordered BDT-score boundaries; each consecutive pair is a bin.
        colours: colours to cycle through, one per score bin.
    """
    scores = df['bdt_score']
    for ibin, (low, high) in enumerate(zip(bdt_bins[:-1], bdt_bins[1:])):
        # both edges are exclusive; build the selection once and reuse it for
        # the values and for the weights
        in_bin = df[np.logical_and(scores > low, scores < high)]
        # NOTE(review): `normed` is kept for the matplotlib version this script
        # targets; newer releases only accept `density` -- confirm before upgrading
        axes.hist(in_bin[var], bins=var_bins,
                  label='{:.2f} $<$ MVA $<$ {:.2f}'.format(low, high),
                  weights=in_bin['weight'], histtype='step',
                  # cycle the palette so more score bins than colours cannot
                  # raise an IndexError
                  color=colours[ibin % len(colours)], normed=True)


def main(options):
    """Plot the shape of each training variable in bins of BDT score.

    Reads the analysis YAML config, loads the signal and background MC through
    ROOTHelpers, evaluates the pickled classifier on both samples and writes,
    for every training variable plus 'dielectronMass', one PDF of the signal
    distribution split by score bin. A matching background PDF is written for
    'dielectronMass' only (to check the mass is not being sculpted).

    Args:
        options: parsed command-line options. Attributes read here: `config`,
            `pt_reweight`, `reload_samples`, `model`, `boundaries`, `n_bins`.
    """
    # take options from the yaml config. safe_load is sufficient for a plain
    # data config and, unlike a bare yaml.load, needs no explicit Loader
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, could use this for validation later.
    # kept for compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    colours = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

    # ---- Data handling ----
    # load the mc dataframe for all years; when reweighting, the control-region
    # selection is handed to ROOTHelpers in place of the preselection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    # ---- Plotting ----
    # add model predictions to the signal and background dataframes
    print('loading classifier: {}'.format(options.model))
    # NOTE: unpickling can execute arbitrary code; only load trusted model files
    with open("{}".format(options.model), "rb") as model_file:
        clf = pickle.load(model_file)

    sig_df = root_obj.mc_df_sig
    sig_df['bdt_score'] = clf.predict_proba(sig_df[train_vars].values)[:, 1:].ravel()
    bkg_df = root_obj.mc_df_bkg
    bkg_df['bdt_score'] = clf.predict_proba(bkg_df[train_vars].values)[:, 1:].ravel()

    plotter = Plotter(root_obj, train_vars)

    # for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
    # for ggH, good set is: [0.10 0.30 0.45 0.53 0.60 0.8]
    bdt_bins = np.array(options.boundaries)

    plot_dir = '{}/plotting/plots/{}_sig_bkg_evo'.format(os.getcwd(), output_tag)
    Utils.check_dir(plot_dir)

    # signal: every training variable plus the dielectron mass
    for var in train_vars + ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        var_bins = np.linspace(plotter.var_to_xrange[var][0],
                               plotter.var_to_xrange[var][1], options.n_bins)
        _hist_in_score_bins(axes, sig_df, var, var_bins, bdt_bins, colours)

        annotate_and_save(axes, plotter, var)
        out_name = '{0}/{1}_{2}.pdf'.format(plot_dir, output_tag, var)
        fig.savefig(out_name)
        print('saving: {}'.format(out_name))
        plt.close()

    # plot background (check mass is not being sculpted)
    for var in ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        var_bins = np.linspace(plotter.var_to_xrange[var][0],
                               plotter.var_to_xrange[var][1], options.n_bins)
        _hist_in_score_bins(axes, bkg_df, var, var_bins, bdt_bins, colours)

        annotate_and_save(axes, plotter, var)
        fig.savefig('{0}/{1}_{2}_bkg.pdf'.format(plot_dir, output_tag, var))
        plt.close()
def main(options):
    """Plot unit-normalised signal vs background shapes for each training variable.

    Reads the analysis YAML config, loads signal and background MC through
    ROOTHelpers, then for every entry of `train_vars` overlays the weighted,
    normalised signal and background histograms and saves the figure as a PDF
    under `plotting/plots/<output_tag>/normed/` in the current working directory.
    Axis ranges and axis labels come from `plotting/var_to_xrange.yaml` and
    `plotting/var_to_xstring.yaml`, resolved relative to the working directory.

    Args:
        options: parsed command-line options. Attributes read here: `config`,
            `pt_reweight`, `reload_samples`.
    """
    # take options from the yaml config. safe_load is sufficient for a plain
    # data config and, unlike a bare yaml.load, needs no explicit Loader
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, could use this for validation later.
    # kept for compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'forestgreen'

    # ---- Data handling ----
    # load the mc dataframe for all years; when reweighting, the control-region
    # selection is handed to ROOTHelpers in place of the preselection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    # ---- Plotting ----
    # x-axis ranges per variable
    with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
        var_to_xrange = yaml.safe_load(plot_config_file)['var_to_xrange']

    # x-axis label replacements per variable
    with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
        var_to_xstring = yaml.safe_load(plot_config_file)['var_to_xstring']

    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)

    # the weights and the output directory do not depend on the variable being
    # plotted, so they are resolved once up front
    sig_weights = root_obj.mc_df_sig['weight'].values
    bkg_weights = root_obj.mc_df_bkg['weight'].values
    Utils.check_dir('{}/plotting/plots/{}/normed/'.format(os.getcwd(), output_tag))

    for var in train_vars:
        fig = plt.figure(1)
        axes = fig.gca()

        var_sig = root_obj.mc_df_sig[var].values
        var_bkg = root_obj.mc_df_bkg[var].values

        x_min = var_to_xrange[var][0]
        x_max = var_to_xrange[var][1]
        bins = np.linspace(x_min, x_max, 56)

        # signal is drawn on top of the background (higher zorder)
        # NOTE(review): `normed` is kept for the matplotlib version this script
        # targets; newer releases only accept `density` -- confirm before upgrading
        axes.hist(var_sig, bins=bins,
                  label=plotter.sig_labels[0]+r' ($\mathrm{H}\rightarrow\mathrm{ee}$)',
                  weights=sig_weights, histtype='stepfilled', color='red',
                  zorder=10, alpha=0.4, normed=True)
        axes.hist(var_bkg, bins=bins, label='Simulated background',
                  weights=bkg_weights, histtype='stepfilled', color='blue',
                  zorder=0, alpha=0.4, normed=True)

        axes.set_ylabel('Arbitrary Units', ha='right', y=1, size=13)
        # leave 20% headroom above the tallest bin for the legend and labels
        _, current_top = axes.get_ylim()
        axes.set_ylim(bottom=0, top=1.2*current_top)
        axes.set_xlim(left=x_min, right=x_max)
        axes.legend(bbox_to_anchor=(0.97, 0.97), ncol=1)
        plotter.plot_cms_labels(axes, lumi='')
        axes.set_xlabel('{}'.format(var_to_xstring[var]), ha='right', x=1, size=13)

        fig.savefig('{0}/plotting/plots/{1}/normed/{1}_{2}_normalised.pdf'.format(
            os.getcwd(), output_tag, var))
        plt.close()
def main(options):
    """Train, validate or hyper-parameter-optimise the BDT.

    Reads the analysis YAML config, loads signal MC, background MC and data
    through ROOTHelpers, builds the BDTHelpers training set and then runs
    exactly one of four modes, chosen from `options`:

    * `hp_perm` given: k-fold train/validate one hyper-parameter permutation
      and append its result to `<mc_dir>/bdt_hp_opt_<output_tag>.txt`.
    * `opt_hps`: delete any previous result file and submit the grid search
      via `batch_gs_cv`.
    * `train_best`: read the parameters from the last line of the result file,
      train and save that model, then plot the ROC and output score.
    * otherwise: train and save with default hyper-parameters, then plot.

    Args:
        options: parsed command-line options. Attributes read here: `config`,
            `pt_reweight`, `reload_samples`, `train_frac`, `eq_train`,
            `hp_perm`, `opt_hps`, `train_best`, `k_folds`.

    Raises:
        Exception: if `hp_perm` is given together with `opt_hps`.
        ValueError: if `opt_hps` is requested with fewer than 2 folds.
    """
    import os  # local import: needed for os.remove / os.path below

    # take options from the yaml config. safe_load is sufficient for a plain
    # data config and, unlike a bare yaml.load, needs no explicit Loader
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, but still specified in the config for compatibility
    # with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    # ---- Data handling ----
    # load the dataframes for all years; when reweighting, the control-region
    # selection is handed to ROOTHelpers in place of the preselection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # reweight samples in bins of pT (and maybe Njets), for each year separately.
    # Note the targeted selection is applied here and all df's are re-saved
    # for a smaller memory footprint
    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    # ---- BDT ----
    # set up X, w and y, train-test
    bdt_hee = BDTHelpers(root_obj, train_vars, options.train_frac,
                         eq_train=options.eq_train)
    bdt_hee.create_X_and_y(mass_res_reweight=True)

    # a single hyper-parameter permutation was passed in
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception('Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation')
        elif options.opt_hps and options.hp_perm:
            raise Exception('opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!')
        else:
            print('About to train + validate on dataset with {} fold splitting'.format(options.k_folds))
            bdt_hee.set_hyper_parameters(options.hp_perm)
            bdt_hee.set_k_folds(options.k_folds)
            for i_fold in range(options.k_folds):
                bdt_hee.set_i_fold(i_fold)
                bdt_hee.train_classifier(root_obj.mc_dir, save=False)
                bdt_hee.validation_rocs.append(bdt_hee.compute_roc())
            # the context manager closes the file; no explicit close() needed
            with open('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag), 'a+') as val_roc_file:
                bdt_hee.compare_rocs(val_roc_file, options.hp_perm)

    # submit the HP search
    elif options.opt_hps:
        # FIXME: add warning that many jobs are about to be submitted
        if options.k_folds < 2:
            raise ValueError('K-folds option must be at least 2')
        hp_file = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        # start from a clean results file; os.remove avoids spawning a shell
        # and copes with spaces in the path
        if os.path.isfile(hp_file):
            os.remove(hp_file)
            print('deleting: {}'.format(hp_file))
        bdt_hee.batch_gs_cv(k_folds=options.k_folds, pt_rew=options.pt_reweight)

    # train with the parameters recorded on the last line of the results file
    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the '_best' suffix is appended before the file name is
        # built, so this reads bdt_hp_opt_<tag>_best.txt while the hp_perm
        # branch above writes bdt_hp_opt_<tag>.txt -- confirm this is intended
        with open('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag), 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        bdt_hee.set_hyper_parameters(best_params)
        bdt_hee.train_classifier(root_obj.mc_dir, save=True, model_name=output_tag)
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag, ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight))

    # else just train BDT with default HPs
    else:
        bdt_hee.train_classifier(root_obj.mc_dir, save=True,
                                 model_name=output_tag + '_clf')
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag, ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight))
def main(options):
    """Train, validate or hyper-parameter-optimise the LSTM network.

    Reads the analysis YAML config, loads signal MC, background MC and data
    through ROOTHelpers, builds the LSTM_DNN inputs (skipped when only
    submitting a hyper-parameter search) and then runs exactly one of four
    modes, chosen from `options`:

    * `hp_perm` given: train one hyper-parameter permutation without saving
      and append its result to `<mc_dir>/lstm_hp_opt_<output_tag>.txt`.
    * `opt_hps`: delete any previous result file and submit the grid search
      via `batch_gs_cv`.
    * `train_best`: read the parameters from the last line of the result file,
      train that model, then plot the ROC and output score.
    * otherwise: train the default architecture (with batch boosting if
      `batch_boost` is set), then plot.

    Args:
        options: parsed command-line options. Attributes read here: `config`,
            `pt_reweight`, `reload_samples`, `train_frac`, `eq_weights`,
            `batch_boost`, `opt_hps`, `hp_perm`, `train_best`.

    Raises:
        Exception: if `hp_perm` is given together with `opt_hps`.
    """
    import os  # local import: needed for os.remove / os.path below

    # take options from the yaml config. safe_load is sufficient for a plain
    # data config and, unlike a bare yaml.load, needs no explicit Loader
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    # object_vars is a list of per-object variable lists; ROOTHelpers takes
    # them flattened, together with the event-level variables
    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    # ---- Data handling ----
    # when reweighting, the control-region selection is handed to ROOTHelpers
    # in place of the preselection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, flat_obj_vars + event_vars,
                           vars_to_add, selection)

    # load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:  # for plotting
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # reweight samples in bins of pT (and maybe Njets), for each year separately
    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    # ---- LSTM ----
    lstm = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                    options.eq_weights, options.batch_boost)

    # input preparation is only needed when a network is trained in this process
    if not options.opt_hps:
        lstm.var_transform(do_data=True)
        X_tot, y_tot = lstm.create_X_y(mass_res_reweight=True)
        lstm.split_X_y(X_tot, y_tot, do_data=True)
        # the scaler is not saved while scanning hyper-parameter permutations
        if options.hp_perm is not None:
            lstm.get_X_scaler(lstm.all_vars_X_train, out_tag=output_tag, save=False)
        else:
            lstm.get_X_scaler(lstm.all_vars_X_train, out_tag=output_tag)
        lstm.X_scale_train_test(do_data=True)
        lstm.set_low_level_2D_test_train(do_data=True, ignore_train=options.batch_boost)

    # functions called in the submitted job, if options.opt_hps was true
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception('Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation')
        elif options.opt_hps and options.hp_perm:
            raise Exception('opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!')
        else:
            lstm.set_hyper_parameters(options.hp_perm)
            lstm.model.summary()
            lstm.train_w_batch_boost(out_tag=output_tag, save=False)
            # the context manager closes the file; no explicit close() needed
            with open('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag), 'a+') as val_roc_file:
                lstm.compare_rocs(val_roc_file, options.hp_perm)

    # submit the HP search
    elif options.opt_hps:
        # FIXME: add warning that many jobs are about to be submitted
        hp_file = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        # start from a clean results file; os.remove avoids spawning a shell
        # and copes with spaces in the path
        if os.path.isfile(hp_file):
            os.remove(hp_file)
            print('deleting: {}'.format(hp_file))
        lstm.batch_gs_cv(pt_rew=options.pt_reweight)

    # train with the parameters recorded on the last line of the results file
    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the '_best' suffix is appended before the file name is
        # built, so this reads lstm_hp_opt_<tag>_best.txt while the hp_perm
        # branch above writes lstm_hp_opt_<tag>.txt -- confirm this is intended
        with open('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag), 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        lstm.set_hyper_parameters(best_params)
        lstm.model.summary()
        lstm.train_w_batch_boost(out_tag=output_tag)
        # compute final roc on test set
        lstm.compute_roc(batch_size=1024)
        lstm.plot_roc(output_tag)
        lstm.plot_output_score(output_tag, batch_size=1024, ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))

    # else train with basic parameters/architecture
    else:
        lstm.model.summary()
        if options.batch_boost:
            # type of model selection, so needs a validation set; this call
            # handles creating the validation set and 2D vars, and saves
            # sequentially
            lstm.train_w_batch_boost(out_tag=output_tag)
        else:
            lstm.train_network(epochs=7, batch_size=32)
            lstm.save_model(out_tag=output_tag)
        # compute final roc on test set
        lstm.compute_roc(batch_size=32)
        lstm.plot_roc(output_tag)
        lstm.plot_output_score(output_tag, batch_size=32, ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))
def main(options):
    """Plot every training variable, plus the dielectron mass and pT.

    Reads the analysis YAML config, loads signal MC, background MC and data
    through ROOTHelpers, then calls `Plotter.plot_input` once for each entry of
    `train_vars` and for 'dielectronMass' and 'dielectronPt'.

    Args:
        options: parsed command-line options. Attributes read here: `config`,
            `pt_reweight`, `reload_samples`, `n_bins`, `ratio_plot`.
    """
    # take options from the yaml config. safe_load is sufficient for a plain
    # data config and, unlike a bare yaml.load, needs no explicit Loader
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, could use this for validation later.
    # kept for compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'forestgreen'

    # ---- Data handling ----
    # load the dataframes for all years; when reweighting, the control-region
    # selection is handed to ROOTHelpers in place of the preselection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    # ---- Plotting ----
    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)

    # when pT reweighting is on, the plots are not normalised to data
    norm_to_data = not options.pt_reweight
    for var in train_vars + ['dielectronMass', 'dielectronPt']:
        plotter.plot_input(var, options.n_bins, output_tag, options.ratio_plot,
                           norm_to_data=norm_to_data)