def main(options):
    """Plot every training variable listed in the analysis config.

    Reads the YAML config named by ``options.config``, loads the signal,
    background and data samples through ``ROOTHelpers`` and calls
    ``Plotter.plot_input`` once per entry of ``train_vars``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``n_bins`` and
            ``ratio_plot``.
    """
    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    #data settings are always read since the ROOTHelpers constructor takes them
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'red'

    #Data handling stuff#
    # A stray sys.exit(1) used to sit here and made everything below
    # unreachable; it has been removed so the plots are actually produced.

    #load the mc dataframe for all years
    # when pT reweighting, samples are loaded with the 'reweight_cr' selection;
    # the nominal preselection is then handed to apply_pt_rew further down
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #the reweighting is only applied when the samples are being reloaded
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #Plotter stuff#
    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour,
                      norm_to_data=True)
    for var in train_vars:
        plotter.plot_input(var, options.n_bins, output_tag,
                           options.ratio_plot, norm_to_data=True)
def main(options):
    """Draw each input variable in slices of the DNN output score.

    Loads the simulated signal and background, evaluates the stored keras
    model on the scaled inputs and, for every training variable plus
    ``dielectronMass``, overlays one unit-normalised histogram per score
    slice given by ``options.boundaries``. Background is drawn for
    ``dielectronMass`` only.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``model_architecture``,
            ``model``, ``boundaries`` and ``n_bins``.
    """
    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    object_vars = config['object_vars']
    #flatten the per-object variable lists into a single list
    flat_obj_vars = []
    for obj_var_group in object_vars:
        flat_obj_vars.extend(obj_var_group)
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    colours = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

    #Data handling stuff#
    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        print(sig_obj.file_name)
        print(sig_obj.tree_name)
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    root_obj.concat()

    #Plotter stuff#
    #add model predictions to sig df
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    #copies are taken before var_transform is called on the LSTM object
    unscaled_sig_df = root_obj.mc_df_sig.copy()
    unscaled_bkg_df = root_obj.mc_df_bkg.copy()

    # set up X and y Matrices
    LSTM.var_transform(do_data=False)
    X_tot, y_tot = LSTM.create_X_y()
    X_tot = X_tot[flat_obj_vars + event_vars]  #filter unused vars
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot = pd.DataFrame(LSTM.X_scaler.transform(X_tot),
                         columns=flat_obj_vars + event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

    pred_prob_tot = model.predict([X_tot_high_level, X_tot_low_level],
                                  batch_size=1024).flatten()
    #the score column keeps the name 'bdt_score' although it holds DNN output
    unscaled_sig_df['bdt_score'] = pred_prob_tot[y_tot == 1]
    unscaled_bkg_df['bdt_score'] = pred_prob_tot[y_tot == 0]

    train_vars = flat_obj_vars + event_vars
    plotter = Plotter(root_obj, train_vars, norm_to_data=True)

    #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
    bdt_bins = np.array(options.boundaries)

    Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(
        os.getcwd(), output_tag))

    def draw_score_slices(axes, frame, var_name, var_edges):
        #one unit-normalised step histogram per (low, high) score slice
        scores = frame['bdt_score']
        for i_slice in range(len(bdt_bins) - 1):
            low_edge = bdt_bins[i_slice]
            high_edge = bdt_bins[i_slice + 1]
            in_slice = (scores > low_edge) & (scores < high_edge)
            values = frame.loc[in_slice, var_name]
            slice_weights = frame.loc[in_slice, 'weight']
            slice_weights /= np.sum(slice_weights)
            axes.hist(values,
                      bins=var_edges,
                      label='{:.2f} $<$ MVA $<$ {:.2f}'.format(
                          low_edge, high_edge),
                      weights=slice_weights,
                      histtype='step',
                      color=colours[i_slice])

    def x_edges(var_name):
        #binning taken from the plotter's per-variable x range
        x_low, x_high = (plotter.var_to_xrange[var_name][0],
                         plotter.var_to_xrange[var_name][1])
        return np.linspace(x_low, x_high, options.n_bins)

    for var in train_vars + ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        draw_score_slices(axes, unscaled_sig_df, var, x_edges(var))

        annotate_and_save(axes, plotter, var)
        axes.text(0.95, 0.6, 'Simulated VBF signal', ha='right',
                  va='bottom', transform=axes.transAxes, size=14)
        fig.savefig(
            '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(
                os.getcwd(), output_tag, var))
        plt.close()

    #plot background (check mass is not being sculpted)
    for var in ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        draw_score_slices(axes, unscaled_bkg_df, var, x_edges(var))

        annotate_and_save(axes, plotter, var)
        axes.text(0.95, 0.6, 'Simulated background', ha='right',
                  va='bottom', transform=axes.transAxes, size=14)
        fig.savefig(
            '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(
                os.getcwd(), output_tag, var))
        plt.close()
def main(options):
    """Optimise category boundaries on the DNN score, or score a cut-based selection.

    Loads signal plus either simulated background or data (``data_as_bkg``),
    evaluates the stored keras model and passes the per-event weights,
    ``dielectronMass`` and scores to ``CatOptim``. Without a cut string the
    number of categories is scanned and the resulting AMS values are plotted
    against the category count; with a cut string a single cut-based AMS is
    printed instead.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``,
            ``cut_based_str``, ``model_architecture``, ``model`` and
            ``n_iters``.
    """
    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#
    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        #overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    #apply cut-based selection if not optimising the score (pred probs are
    #still evaluated for compatibility with the existing CatOptim constructor)
    cut_based = bool(options.cut_based_str)
    if cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

    # DNN evaluation stuff #
    #load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y Matrices. Log variables that have GeV units
    #do_data=False: with data_as_bkg the background attribute already holds data
    LSTM.var_transform(do_data=False)
    X_tot, y_tot = LSTM.create_X_y()
    X_tot = X_tot[flat_obj_vars + event_vars]  #filter unused vars

    #scale X_vars using the scaler fit during the previous dnn training
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot = LSTM.X_scaler.transform(X_tot)

    #make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

    #predict probs
    pred_prob_tot = model.predict([X_tot_high_level, X_tot_low_level],
                                  batch_size=1024).flatten()

    sig_weights = root_obj.mc_df_sig['weight'].values
    sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = pred_prob_tot[y_tot == 1]

    # Previously data_df was read unconditionally, although data is only
    # loaded when data_as_bkg is set. Pick the frame that was actually loaded.
    if options.data_as_bkg:
        bkg_df = root_obj.data_df
    else:
        bkg_df = root_obj.mc_df_bkg
    bkg_weights = bkg_df['weight'].values
    bkg_m_ee = bkg_df['dielectronMass'].values
    pred_prob_bkg = pred_prob_tot[y_tot == 0]

    #category optimisation stuff#
    #set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.3, 1.]]
    names = ['{} score'.format(output_tag)]  #arbitrary
    print_str = ''
    cats = [1, 2, 3, 4]
    AMS = []

    if cut_based:
        #just to use class methods here
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                             ranges, names)
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))
    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            #set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

        #make nCat vs AMS plots; only here is AMS one value per entry of cats
        #(the cut-based branch yields a single number)
        Plotter.cats_vs_ams(cats, AMS, output_tag)
def main(options):
    """Train a BDT, or drive its hyper-parameter search.

    Depending on the options this either
      * trains and validates one hyper-parameter permutation with k-fold
        splitting and appends the result to the HP-scan text file
        (``hp_perm``),
      * clears the HP-scan file and submits the scan (``opt_hps``),
      * trains with the parameters on the last line of the HP-scan file
        (``train_best``), or
      * trains with the default hyper parameters.
    The last two modes also compute and plot the ROC and the output score.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_train``, ``hp_perm``, ``opt_hps``, ``train_best`` and
            ``k_folds``.

    Raises:
        Exception: if ``hp_perm`` is given together with ``opt_hps``.
        ValueError: if ``opt_hps`` is requested with fewer than 2 folds.
    """
    #local import: used to delete the old HP-scan file without a shell
    import os

    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    #data not needed yet, but still specified in the config for
    #compatibility with the constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#
    #load the mc dataframe for all years
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT (and maybe Njets), for each year
    #separately. Note the targeted selection is applied here and all df's
    #are resaved for smaller mem.
    #FIXME: when files are read for the first time with pt_reweight but
    #without reload_samples, nothing is reweighted or saved
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #BDT stuff#
    #set up X, w and y, train-test
    bdt_hee = BDTHelpers(root_obj, train_vars, options.train_frac,
                         eq_train=options.eq_train)
    bdt_hee.create_X_and_y(mass_res_reweight=True)

    #submit the HP search if option true
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        elif options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )
        else:
            print(
                'About to train + validate on dataset with {} fold splitting'
                .format(options.k_folds))
            bdt_hee.set_hyper_parameters(options.hp_perm)
            bdt_hee.set_k_folds(options.k_folds)
            for i_fold in range(options.k_folds):
                bdt_hee.set_i_fold(i_fold)
                bdt_hee.train_classifier(root_obj.mc_dir, save=False)
                bdt_hee.validation_rocs.append(bdt_hee.compute_roc())
            #the with-block closes the file; no explicit close() is needed
            hp_file_name = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
            with open(hp_file_name, 'a+') as val_roc_file:
                bdt_hee.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        if options.k_folds < 2:
            raise ValueError('K-folds option must be at least 2')
        hp_file_name = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        if path.isfile(hp_file_name):
            #os.remove instead of a shell 'rm': no quoting issues with
            #spaces or shell metacharacters in the configured directory
            os.remove(hp_file_name)
            print('deleting: {}/bdt_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        bdt_hee.batch_gs_cv(k_folds=options.k_folds,
                            pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the file name is built after '_best' is appended,
        # whereas the scan branches above write it without that suffix -
        # confirm this is the intended file.
        hp_file_name = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(hp_file_name, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
            #parameters are the text before the first ';' on the last line
            best_params = hp_roc[-1].split(';')[0]
            print('Best classifier params are: {}'.format(best_params))
            bdt_hee.set_hyper_parameters(best_params)
            bdt_hee.train_classifier(root_obj.mc_dir, save=True,
                                     model_name=output_tag)
            bdt_hee.compute_roc()
            bdt_hee.plot_roc(output_tag)
            bdt_hee.plot_output_score(
                output_tag, ratio_plot=True,
                norm_to_data=(not options.pt_reweight))

    #else just train BDT with default HPs
    else:
        bdt_hee.train_classifier(root_obj.mc_dir, save=True,
                                 model_name=output_tag + '_clf')
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag, ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight),
                                  log=True)
def main(options):
    """Optimise category boundaries on the BDT score, or score a cut-based selection.

    Loads signal plus either simulated background or data (``data_as_bkg``),
    evaluates the pickled classifier on both and passes the per-event
    weights, ``dielectronMass`` and scores to ``CatOptim``. Without a cut
    string the number of categories is scanned and the AMS values are
    plotted against the category count; with a cut string a single cut-based
    AMS is printed instead.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``, ``model``,
            ``cut_based_str`` and ``n_iters``.
    """
    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
    root_obj.concat()

    print('loading classifier: {}'.format(options.model))
    # NOTE(review): unpickling executes code from the file; only load models
    # from a trusted source.
    #the with-block closes the file handle, which used to be left open
    with open("{}".format(options.model), "rb") as model_file:
        clf = pickle.load(model_file)

    #apply cut-based selection if not optimising BDT score (pred probs are
    #still evaluated for compatibility with the existing constructor)
    cut_based = bool(options.cut_based_str)
    if cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

    def signal_probability(frame):
        #second column of predict_proba, flattened to one value per event
        return clf.predict_proba(frame[train_vars].values)[:, 1:].ravel()

    sig_weights = root_obj.mc_df_sig['weight'].values
    sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = signal_probability(root_obj.mc_df_sig)

    #background is whichever frame was loaded above
    if options.data_as_bkg:
        bkg_df = root_obj.data_df
    else:
        bkg_df = root_obj.mc_df_bkg
    bkg_weights = bkg_df['weight'].values
    bkg_m_ee = bkg_df['dielectronMass'].values
    pred_prob_bkg = signal_probability(bkg_df)

    #set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.15, 1.]]
    names = ['{} score'.format(output_tag)]  #arbitrary
    print_str = ''
    cats = [1, 2, 3, 4, 5]
    AMS = []

    if cut_based:
        #just to use class methods here
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                             ranges, names)
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))
    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            #set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

        #make nCat vs AMS plots; only here is AMS one value per entry of cats
        #(the cut-based branch yields a single number)
        Plotter.cats_vs_ams(cats, AMS, output_tag)
def main(options):
    """Train the LSTM-based DNN, or drive its hyper-parameter search.

    Depending on the options this either
      * trains one hyper-parameter permutation and appends the result to
        the HP-scan text file (``hp_perm``),
      * clears the HP-scan file and submits the scan (``opt_hps``),
      * trains with the parameters on the last line of the HP-scan file
        (``train_best``), or
      * trains with the default architecture.
    The last two modes also compute and plot the ROC and the output score.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_weights``, ``batch_boost``, ``opt_hps``, ``hp_perm`` and
            ``train_best``.

    Raises:
        Exception: if ``hp_perm`` is given together with ``opt_hps``.
    """
    #local import: used to delete the old HP-scan file without a shell
    import os

    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add,
                           selection)

    #load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:  # for plotting
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT (and maybe Njets), for each year
    #separately; only done when the samples are being reloaded
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #LSTM stuff#
    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                    options.eq_weights, options.batch_boost)

    #the inputs are not prepared when this call only submits the HP scan
    if not options.opt_hps:
        LSTM.var_transform(do_data=True)
        X_tot, y_tot = LSTM.create_X_y(mass_res_reweight=True)
        LSTM.split_X_y(X_tot, y_tot, do_data=True)
        if options.hp_perm is not None:
            #HP-scan jobs do not save the scaler
            LSTM.get_X_scaler(LSTM.all_vars_X_train, out_tag=output_tag,
                              save=False)
        else:
            LSTM.get_X_scaler(LSTM.all_vars_X_train, out_tag=output_tag)
        LSTM.X_scale_train_test(do_data=True)
        LSTM.set_low_level_2D_test_train(do_data=True,
                                         ignore_train=options.batch_boost)

    #functions called in subbed job, if options.opt_hps was true
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        elif options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )
        else:
            LSTM.set_hyper_parameters(options.hp_perm)
            LSTM.model.summary()
            LSTM.train_w_batch_boost(out_tag=output_tag, save=False)
            #the with-block closes the file; no explicit close() is needed
            hp_file_name = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
            with open(hp_file_name, 'a+') as val_roc_file:
                LSTM.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        hp_file_name = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        if path.isfile(hp_file_name):
            #os.remove instead of a shell 'rm': no quoting issues with
            #spaces or shell metacharacters in the configured directory
            os.remove(hp_file_name)
            print('deleting: {}/lstm_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        LSTM.batch_gs_cv(pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): the file name is built after '_best' is appended,
        # whereas the scan branches above write it without that suffix -
        # confirm this is the intended file.
        hp_file_name = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)
        with open(hp_file_name, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
            #parameters are the text before the first ';' on the last line
            best_params = hp_roc[-1].split(';')[0]
            print('Best classifier params are: {}'.format(best_params))
            LSTM.set_hyper_parameters(best_params)
            LSTM.model.summary()
            LSTM.train_w_batch_boost(out_tag=output_tag)
            #compute final roc on test set
            LSTM.compute_roc(batch_size=1024)
            LSTM.plot_roc(output_tag)
            LSTM.plot_output_score(output_tag, batch_size=1024,
                                   ratio_plot=True,
                                   norm_to_data=(not options.pt_reweight))

    #else train with basic parameters/architecture
    else:
        LSTM.model.summary()
        if options.batch_boost:
            #type of model selection so need validation set; handles
            #creating validation set and 2D vars and sequential saving
            LSTM.train_w_batch_boost(out_tag=output_tag)
        else:
            LSTM.train_network(epochs=5, batch_size=1024)
            LSTM.save_model(out_tag=output_tag)
        LSTM.compute_roc(batch_size=1024)  #compute final roc on test set
        LSTM.plot_roc(output_tag)
        LSTM.plot_output_score(output_tag, batch_size=1024, ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))
def main(options):
    """Overlay normalised signal and background shapes for each training variable.

    Loads the simulated signal and background through ``ROOTHelpers`` and,
    for every entry of ``train_vars``, saves one PDF with both weighted
    distributions drawn with ``normed=True``. Axis ranges and x-axis titles
    come from the two YAML files under ``plotting/``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight`` and ``reload_samples``.
    """
    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    #data not needed yet, could use this for validation later. keep for compat with class
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'forestgreen'
    bkg_colour = 'violet'

    #Data handling stuff#
    #load the mc dataframe for all years
    if not options.pt_reweight:
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, train_vars,
                               vars_to_add, presel)
    else:
        cr_selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, train_vars,
                               vars_to_add, cr_selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    root_obj.concat()

    #year-by-year reweighting, only when the samples are being reloaded
    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    #Plotter stuff#
    with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
        plot_config = yaml.load(plot_config_file)
        var_to_xrange = plot_config['var_to_xrange']

    #get x string replacements from yaml config
    with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
        plot_string_cfg = yaml.load(plot_config_file)
        var_to_xstring = plot_string_cfg['var_to_xstring']

    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour,
                      norm_to_data=True)

    #drawing options common to the signal and background histograms
    shared_style = dict(histtype='stepfilled', alpha=0.4, normed=True)

    for var in train_vars:
        fig = plt.figure(1)
        axes = fig.gca()

        x_low = var_to_xrange[var][0]
        x_high = var_to_xrange[var][1]
        bins = np.linspace(x_low, x_high, 56)

        #add sig mc
        axes.hist(root_obj.mc_df_sig[var].values,
                  bins=bins,
                  label=plotter.sig_labels[0]+r' ($\mathrm{H}\rightarrow\mathrm{ee}$)',
                  weights=root_obj.mc_df_sig['weight'].values,
                  color='red',
                  zorder=10,
                  **shared_style)
        #add bkg mc underneath
        axes.hist(root_obj.mc_df_bkg[var].values,
                  bins=bins,
                  label='Simulated background',
                  weights=root_obj.mc_df_bkg['weight'].values,
                  color='blue',
                  zorder=0,
                  **shared_style)

        axes.set_ylabel('Arbitrary Units', ha='right', y=1, size=13)
        #leave 20% headroom above the tallest bin for the legend and labels
        _, y_top = axes.get_ylim()
        axes.set_ylim(bottom=0, top=1.2*y_top)
        axes.set_xlim(left=x_low, right=x_high)
        axes.legend(bbox_to_anchor=(0.97,0.97), ncol=1)
        plotter.plot_cms_labels(axes, lumi='')
        axes.set_xlabel('{}'.format(var_to_xstring[var]), ha='right', x=1,
                        size=13)

        Utils.check_dir('{}/plotting/plots/{}/normed/'.format(
            os.getcwd(), output_tag))
        fig.savefig(
            '{0}/plotting/plots/{1}/normed/{1}_{2}_normalised.pdf'.format(
                os.getcwd(), output_tag, var))
        plt.close()
def main(options):
    """Draw each input variable in slices of the BDT output score.

    Loads the simulated signal and background, evaluates the pickled
    classifier on both and, for every training variable plus
    ``dielectronMass``, overlays one normalised histogram per score slice
    given by ``options.boundaries``. Background is drawn for
    ``dielectronMass`` only.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``model``,
            ``boundaries`` and ``n_bins``.
    """
    #take options from the yaml config
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    #data not needed yet, could use this for validation later. keep for compat with class
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    colours = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

    #Data handling stuff#
    #load the mc dataframe for all years
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    #only simulation is loaded; data is not read by this script
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        for year in root_obj.years:
            root_obj.pt_reweight('DYMC', year, presel)

    #Plotter stuff#
    #add model predictions to sig df
    print('loading classifier: {}'.format(options.model))
    # NOTE(review): unpickling executes code from the file; only load models
    # from a trusted source.
    #the with-block closes the file handle, which used to be left open
    with open("{}".format(options.model), "rb") as model_file:
        clf = pickle.load(model_file)

    #the score is written into the frames held by root_obj (no copy is taken)
    sig_df = root_obj.mc_df_sig
    sig_df['bdt_score'] = clf.predict_proba(
        sig_df[train_vars].values)[:, 1:].ravel()
    bkg_df = root_obj.mc_df_bkg
    bkg_df['bdt_score'] = clf.predict_proba(
        bkg_df[train_vars].values)[:, 1:].ravel()

    plotter = Plotter(root_obj, train_vars)

    #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
    #for ggH, good set is: [0.10 0.30 0.45 0.53 0.60 0.8]
    bdt_bins = np.array(options.boundaries)

    Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(
        os.getcwd(), output_tag))

    def draw_score_slices(axes, frame, var, var_bins):
        #overlay one normalised step histogram per (low, high) score slice
        scores = frame['bdt_score']
        for ibin in range(len(bdt_bins) - 1):
            low_edge, high_edge = bdt_bins[ibin], bdt_bins[ibin + 1]
            #both edges are exclusive; the selection is built once and
            #reused for the values and their weights
            in_slice = frame[np.logical_and(scores > low_edge,
                                            scores < high_edge)]
            axes.hist(in_slice[var],
                      bins=var_bins,
                      label='{:.2f} $<$ MVA $<$ {:.2f}'.format(
                          low_edge, high_edge),
                      weights=in_slice['weight'],
                      histtype='step',
                      #wrap around so more slices than colours cannot raise
                      color=colours[ibin % len(colours)],
                      normed=True)

    for var in train_vars + ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        var_bins = np.linspace(plotter.var_to_xrange[var][0],
                               plotter.var_to_xrange[var][1],
                               options.n_bins)
        draw_score_slices(axes, sig_df, var, var_bins)

        annotate_and_save(axes, plotter, var)
        out_name = '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(
            os.getcwd(), output_tag, var)
        fig.savefig(out_name)
        print('saving: {}'.format(out_name))
        plt.close()

    #plot background (check mass is not being sculpted)
    for var in ['dielectronMass']:
        fig = plt.figure(1)
        axes = fig.gca()
        var_bins = np.linspace(plotter.var_to_xrange[var][0],
                               plotter.var_to_xrange[var][1],
                               options.n_bins)
        draw_score_slices(axes, bkg_df, var, var_bins)

        annotate_and_save(axes, plotter, var)
        fig.savefig(
            '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(
                os.getcwd(), output_tag, var))
        plt.close()
def main(options):
    """Produce data/simulation validation plots with ``DYPlotter``.

    Loads simulated background and data (signal is not loaded), optionally
    evaluates the MVA, and for each requested variable draws data,
    backgrounds and systematic variations on a two-panel figure that is
    saved under ``<cwd>/plotting/plots/<output_tag>/``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``systematics``, ``var_name``,
            ``mva_config`` and ``n_bins``. With ``var_name`` unset, all
            training variables plus the ``<proc>_mva`` score are plotted.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects; acceptable for trusted local configs, otherwise use safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    #check if dnn (lstm) variables need to be read in: a dict holds the
    #object-level and event-level variables separately, otherwise the entry
    #is used as the variable list directly
    varrs = config['train_vars']
    all_train_vars = []
    if isinstance(varrs, dict):
        object_vars = varrs['object_vars']
        flat_obj_vars = [
            var for i_object in object_vars for var in i_object
        ]
        event_vars = varrs['event_vars']
        all_train_vars += (flat_obj_vars + event_vars)
    else:
        all_train_vars = varrs

    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    cut_map = config['cut_map']

    #Data handling stuff#
    #get the dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, presel, read_systs=True)

    #signal samples are deliberately not loaded here
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #--------------------------------------------------------------------------

    dy_plotter = DYPlotter(root_obj, cut_map)

    #FIXME: samples read in for the first time without the reload flag are
    #not reweighted (or get the flag out of the DataHandler object, since it
    #triggers if no sample exists)
    if options.reload_samples:
        dy_plotter.pt_reweight()
    #FIXME still need this else dont remove variables again!
    dy_plotter.manage_memory(options.systematics,
                             save=options.reload_samples)

    #DEBUG
    print('Background columns')
    print(root_obj.mc_df_bkg.columns[:])
    #label corrected: these are the data columns, not the background ones
    print('Data columns')
    print(root_obj.data_df.columns[:])

    #little bit hard coded - be careful if 'mva' not in MVA output name
    if (options.var_name is None) or ('mva' in options.var_name.lower()):
        dy_plotter.eval_mva(options.mva_config, output_tag)

    #--------------------------------------------------------------------------

    with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
        plot_config = yaml.load(plot_config_file)
        var_to_xrange = plot_config['var_to_xrange']

    with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
        plot_string_cfg = yaml.load(plot_config_file)
        var_to_xstring = plot_string_cfg['var_to_xstring']

    if options.var_name is not None:
        vars_to_plot = [options.var_name]
    else:
        #was assigned to a misspelt name ('var_to_plot'), so the loop below
        #raised NameError whenever no single variable was requested
        vars_to_plot = all_train_vars + [dy_plotter.proc + '_mva']

    plot_dir = '{}/plotting/plots/{}'.format(os.getcwd(), output_tag)

    for var in vars_to_plot:
        if 'mva' in var:
            var_bins = np.linspace(0, 1, options.n_bins)
        else:
            var_bins = np.linspace(var_to_xrange[var][0],
                                   var_to_xrange[var][1], options.n_bins)

        print('plotting var: {}'.format(var))
        #upper panel for the distributions, thin lower panel sharing the x axis
        fig, axes = plt.subplots(nrows=2, ncols=1, dpi=200, sharex=True,
                                 gridspec_kw={
                                     'height_ratios': [3, 0.8],
                                     'hspace': 0.08
                                 })

        cut_str = dy_plotter.get_cut_string(var)

        #data stuff
        data_binned, bin_centres, data_stat_down_up = dy_plotter.plot_data(
            cut_str, axes, var, var_bins)
        dy_plotter.plot_bkgs(cut_str, axes, var, var_bins, data_binned,
                             bin_centres, data_stat_down_up)

        #syst stuff
        #test the variable being plotted: options.var_name may be None, in
        #which case "'mva' in options.var_name" raised TypeError
        dy_plotter.plot_systematics(cut_str, axes, var, var_bins,
                                    options.systematics,
                                    do_mva=('mva' in var))

        axes = dy_plotter.set_canv_style(axes, var, var_bins)
        axes[0].legend(bbox_to_anchor=(0.97, 0.97), ncol=1)
        axes[1].set_xlabel(var_to_xstring[var], size=14, ha='right', x=1)

        #save into the directory that is checked/created, instead of the
        #previous temporary hard-coded absolute path of one user's area
        Utils.check_dir(plot_dir)
        fig.savefig('{0}/plotting/plots/{1}/{1}_{2}.pdf'.format(
            os.getcwd(), output_tag, var))
        #release the figure; one is created per plotted variable
        plt.close(fig)
def main(options):
    """Tag events with per-process BDTs and write one ROOT tree per category.

    Reads the sample description from ``options.config``, loads signal plus
    either simulated background or data (``options.data_as_bkg``), evaluates
    the pickled classifiers listed in ``options.bdt_config``, assigns each
    event an analysis tag per targeted process and a final priority tag, and
    writes the selected columns to ``output_trees/<tree name>.root``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``bdt_config``, ``reload_samples`` and ``data_as_bkg``.
    """
    # NOTE: safe_load only builds plain YAML types. The configs read here are
    # indexed as plain mappings/lists/strings, so nothing else is required.
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, but still specified in the config for
    # compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']
    proc_to_train_vars = config['train_vars']
    all_train_vars = [
        item for sublist in proc_to_train_vars.values() for item in sublist
    ]
    vars_to_add = config['vars_to_add']

    # Data handling stuff #

    # Apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since it is looser than the VBF selection,
    # so no events with higher priority are removed.
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years; no process-specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    # Tag sequence stuff #

    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    del root_obj  # only the combined frame is needed from here on

    # Sequence of targeted categories, highest priority first, and the
    # boolean preselection mask for each.
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    vbf_presel = (ggh_presel
                  & combined_df['dijetMass'].gt(350)
                  & combined_df['leadJetPt'].gt(40)
                  & combined_df['subleadJetPt'].gt(30))
    proc_to_preselection = {'VBF': vbf_presel, 'ggH': ggh_presel}

    with open(options.bdt_config, 'r') as bdt_config_file:
        bdt_config = yaml.safe_load(bdt_config_file)
    proc_to_model = bdt_config['models']
    proc_to_tags = bdt_config['boundaries']

    # evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.items():
        print('evaluating classifier: {}'.format(model))
        # NOTE(review): pickle executes arbitrary code on load; only use
        # model files from a trusted source.
        with open('models/{}'.format(model), 'rb') as model_file:
            clf = pickle.load(model_file)
        train_vars = proc_to_train_vars[proc]
        combined_df[proc + '_bdt'] = clf.predict_proba(
            combined_df[train_vars].values)[:, 1:].ravel()

    # TAG NUMBER #

    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        score = combined_df['{}_bdt'.format(proc)]
        # Tag i is "bound[i] < score < bound[i-1]", which is only meaningful
        # for descending bounds. Sort explicitly rather than trusting the
        # mapping's iteration order (arbitrary for a plain dict in Python 2).
        tag_bounds = sorted(proc_to_tags[proc].values(), reverse=True)

        tag_masks = []
        for i_bound, bound in enumerate(tag_bounds):
            mask = presel & score.gt(bound)
            if i_bound > 0:
                # intermediate tag: also below the previous (higher) bound
                mask = mask & score.lt(tag_bounds[i_bound - 1])
            tag_masks.append(mask)

        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, list(range(len(tag_bounds))), default=-999)

    # PROC PRIORITY #

    # If an event satisfies more than one category, keep the one that comes
    # first in tag_sequence. np.select returns the choice of the first true
    # condition, so ordering the conditions by priority is sufficient.
    tag_priority_filter = [
        combined_df['{}_analysis_tag'.format(proc)].ne(-999)
        for proc in tag_sequence
    ]
    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_sequence, default='NOTAG')

    # FILL TREES BASED ON BOTH OF ABOVE #

    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # tree names: one per (true process, targeted category, tag index)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                # string equality, not identity: "is not" on str literals
                # only works by accident of interning
                if true_proc != 'Data':
                    tree_name = '{}_125_13TeV_{}cat{}'.format(
                        true_proc.lower(), target_proc.lower(), i_tag)
                else:
                    tree_name = '{}_13TeV_{}cat{}'.format(
                        true_proc, target_proc.lower(), i_tag)
                branch_names[true_proc].append(tree_name)

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply loops over events in Python; vectorise eventually
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees, then hadd procs together on the
    # command line
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')
def main(options):
    """Run the tagging sequence through ``taggerBase`` and fill output trees.

    Loads the samples described in ``options.config`` for a single year,
    evaluates every classifier listed in ``options.mva_config`` (BDT or
    LSTM-based DNN), then lets the tagger decide tags and priorities and
    fill the trees.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``mva_config``, ``syst_name``, ``dump_weight_systs``,
            ``data_only``, ``data_as_bkg`` and ``reload_samples``.

    Raises:
        IOError: if weight and tree systematics are requested together, if
            systematics are requested for a data-only run, if more than one
            year was read in, or if a model entry in the MVA config is
            neither a dict nor a str.
    """
    # NOTE: safe_load only builds plain YAML types. The configs read here are
    # indexed as plain mappings/lists/strings, so nothing else is required.
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    # data not needed yet, but still specified in the config for
    # compatibility with the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    # Collect every training variable. For a BDT the entry is proc: [vars];
    # for a DNN (LSTM) it is proc: {'object_vars': [[...], ...],
    # 'event_vars': [...]}.
    proc_to_train_vars = config['train_vars']
    all_train_vars = []
    for proc, varrs in proc_to_train_vars.items():
        if isinstance(varrs, dict):
            flat_obj_vars = [
                var for i_object in varrs['object_vars'] for var in i_object
            ]
            all_train_vars += flat_obj_vars + varrs['event_vars']
        else:
            all_train_vars += varrs

    vars_to_add = config['vars_to_add']

    read_syst = options.syst_name is not None

    if read_syst and options.dump_weight_systs:
        raise IOError(
            'Cannot dump weight variations and tree systematics at the same time. Please run separately for each.'
        )
    if options.data_only and (read_syst or options.dump_weight_systs):
        raise IOError('Cannot read Data and apply sysetmatic shifts')

    # Data handling stuff #

    # Apply only a very loose selection when reading in, else memory
    # requirements are ridiculous. The ptOverM cuts cannot be applied here
    # since those variables change with systematics. Note the MC is normed
    # before this cut is applied; in data it is applied when reading in.
    loosest_selection = 'dielectronMass > 100'

    # load the mc dataframe; no specific preselection for sim samples
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection,
                           read_systs=(read_syst or options.dump_weight_systs))
    root_obj.no_lumi_scale()

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    # check we didn't accidentally read in more than one year
    if len(root_obj.years) != 1:
        raise IOError(
            'Reading in more than one year at a time! Tagging should be split by year'
        )
    year = list(root_obj.years)[0]

    if ("2016" in year) and (not options.data_only):
        root_obj.scale_sig_partial_2016()  # FIXME: check this is actually called

    # Signal-only frames don't work with the DNN set up, which needs the bkg
    # class in its constructor, so always concatenate both.
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # Tag sequence stuff #

    tag_sequence = ['VBF', 'ggH']  # categories targeted
    true_procs = ['VBF', 'ggH', 'ttH']  # procs to run through cats
    if (not read_syst) and (not options.dump_weight_systs):
        # MC and data may be run together in a stat-only config
        true_procs.append('Data')
    if options.data_only:
        true_procs = ['Data']  # do data on its own (for memory really)

    tag_obj = taggerBase(tag_sequence, true_procs, combined_df,
                         syst_name=options.syst_name)
    if read_syst:
        tag_obj.relabel_syst_vars()  # not run if reading weight systematics

    # get models and tag boundaries from config
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.safe_load(mva_config_file)
    proc_to_model = mva_config['models']
    tag_boundaries = mva_config['boundaries']

    # evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.items():
        if isinstance(model, dict):
            # DNN: model is a dict holding at least an 'architecture' entry
            object_vars = proc_to_train_vars[proc]['object_vars']
            flat_obj_vars = [
                var for i_object in object_vars for var in i_object
            ]
            event_vars = proc_to_train_vars[proc]['event_vars']

            dnn_loaded = tag_obj.load_dnn(proc, model)
            train_tag = model['architecture'].split('_model')[0]
            tag_obj.eval_lstm(dnn_loaded, train_tag, root_obj, proc,
                              object_vars, flat_obj_vars, event_vars)
        elif isinstance(model, str):
            tag_obj.eval_bdt(proc, model, proc_to_train_vars[proc])
        else:
            raise IOError(
                'Did not get a classifier models in correct format in config'
            )

    # Must happen after evaluating the MVAs, since the LSTM class used in
    # eval_lstm needs some Data in the df for its constructor.
    del root_obj
    if read_syst or options.dump_weight_systs:
        # .copy() avoids chained-assignment warnings later
        tag_obj.combined_df = tag_obj.combined_df[
            tag_obj.combined_df.proc != 'Data'].copy()

    # set up tag boundaries for each process being targeted
    tag_preselection = tag_obj.get_tag_preselection()
    tag_obj.decide_tag(tag_preselection, tag_boundaries)
    tag_obj.decide_priority()
    branch_names = tag_obj.get_tree_names(tag_boundaries, year)
    tag_obj.set_tree_names(tag_boundaries, options.dump_weight_systs, year)
    tag_obj.fill_trees(branch_names, year, print_yields=not read_syst)

    # TODO: re-enable for the nominal (non-systematic) run once the struct
    # error is understood:
    #   tag_obj.plot_matrix(branch_names, output_tag)
def main(options):
    """Plot the mass variable in each MVA category of one process.

    Loads signal, background and data as described in ``options.config``,
    scores all three frames with the pickled BDT named in
    ``options.mva_config`` for ``options.mva_proc``, and calls
    ``Plotter.plot_input`` once per category boundary with the matching
    score cut.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``mva_config``, ``mva_proc``, ``pt_reweight``,
            ``reload_samples``, ``mass_var_name``, ``n_bins`` and
            ``ratio_plot``.

    Raises:
        IOError: if the model entry in the MVA config is not a str.
    """
    # NOTE: safe_load only builds plain YAML types. The configs read here are
    # indexed as plain mappings/lists/strings, so nothing else is required.
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'red'

    # Data handling stuff #

    # With pT reweighting the samples are read with the reweighting
    # control-region selection; otherwise with the nominal preselection.
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    # load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # load MVA
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.safe_load(mva_config_file)
    model = mva_config['models'][options.mva_proc]
    boundaries = mva_config['boundaries'][options.mva_proc]

    # only BDTs (given as a file name) are supported; add DNN later
    if not isinstance(model, str):
        raise IOError(
            'Did not get a classifier models in correct format in config'
        )

    print('evaluating BDT: {}'.format(model))
    # NOTE(review): pickle executes arbitrary code on load; only use model
    # files from a trusted source.
    with open('models/{}'.format(model), 'rb') as model_file:
        clf = pickle.load(model_file)

    mva_col = options.mva_proc + '_mva'
    for frame in (root_obj.mc_df_sig, root_obj.mc_df_bkg, root_obj.data_df):
        frame[mva_col] = clf.predict_proba(
            frame[train_vars].values)[:, 1:].ravel()

    # Plotter stuff #

    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour,
                      norm_to_data=True)

    # Category i is "score > tag_i" and, for i > 0, "score < tag_(i-1)".
    for cat_counter in range(len(boundaries)):
        extra_cuts = mva_col + ' >' + str(boundaries['tag_' + str(cat_counter)])
        if cat_counter > 0:
            upper_cut = mva_col + ' <' + str(
                boundaries['tag_' + str(cat_counter - 1)])
            extra_cuts = upper_cut + ' and ' + extra_cuts

        plotter.plot_input(options.mass_var_name, options.n_bins, output_tag,
                           options.ratio_plot, norm_to_data=True,
                           extra_cuts=extra_cuts, extra_tag=cat_counter,
                           blind=True)
def main(options):
    """Tag events with a ggH BDT and a VBF LSTM, then write one tree per category.

    Reads the sample description from ``options.config``, loads signal plus
    either simulated background or data (``options.data_as_bkg``), evaluates
    the ggH BDT and the VBF DNN listed in ``options.mva_config``, assigns
    each event an analysis tag per targeted process and a final priority
    tag, and writes the selected columns to
    ``output_trees/<tree name>.root``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``mva_config``, ``reload_samples`` and
            ``data_as_bkg``.
    """
    # NOTE: safe_load only builds plain YAML types. The configs read here are
    # indexed as plain mappings/lists/strings, so nothing else is required.
    with open(options.config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    output_tag = config['output_tag']
    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']
    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    object_vars = proc_to_train_vars['VBF']['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = proc_to_train_vars['VBF']['event_vars']

    # used to check all vars we need for categorisation are in our dfs
    all_train_vars = proc_to_train_vars['ggH'] + flat_obj_vars + event_vars
    vars_to_add = config['vars_to_add']

    # Data handling stuff #

    # Apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since it is looser than the VBF selection,
    # so no events with higher priority are removed.
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years; no process-specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    # Tag sequence stuff #

    # NOTE: these must be concatenated in the same order as in
    # LSTM.create_X_y(), else the DNN predictions are misaligned.
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # Sequence of targeted categories, highest priority first, and the
    # boolean preselection mask for each.
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    vbf_presel = (ggh_presel
                  & combined_df['dijetMass'].gt(350)
                  & combined_df['leadJetPt'].gt(40)
                  & combined_df['subleadJetPt'].gt(30))
    proc_to_preselection = {'VBF': vbf_presel, 'ggH': ggh_presel}

    # GET MVA SCORES #

    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.safe_load(mva_config_file)
    proc_to_model = mva_config['models']
    proc_to_tags = mva_config['boundaries']

    # evaluate ggH BDT scores
    print('evaluating ggH classifier: {}'.format(proc_to_model['ggH']))
    # NOTE(review): pickle executes arbitrary code on load; only use model
    # files from a trusted source.
    with open('models/{}'.format(proc_to_model['ggH']), 'rb') as model_file:
        clf = pickle.load(model_file)
    train_vars = proc_to_train_vars['ggH']
    combined_df['ggH_mva'] = clf.predict_proba(
        combined_df[train_vars].values)[:, 1:].ravel()

    # evaluate VBF LSTM
    print('loading VBF DNN:')
    with open('models/{}'.format(proc_to_model['VBF']['architecture']),
              'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('models/{}'.format(proc_to_model['VBF']['model']))

    lstm = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y matrices; var_transform is applied to MC only here
    lstm.var_transform(do_data=False)
    X_tot, y_tot = lstm.create_X_y()
    X_tot = X_tot[flat_obj_vars + event_vars]  # filter unused vars
    print(np.isnan(X_tot).any())

    # scale the inputs with the scaler saved under the 'VBF_DNN' tag
    lstm.load_X_scaler(out_tag='VBF_DNN')
    X_tot = lstm.X_scaler.transform(X_tot)

    # split into event-level inputs and per-object inputs for the LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = lstm.join_objects(X_tot[flat_obj_vars])

    # Predictions line up with combined_df rows only because the frames are
    # concatenated internally in the same order as combined_df above.
    combined_df['VBF_mva'] = model.predict(
        [X_tot_high_level, X_tot_low_level], batch_size=1).flatten()

    # TAG NUMBER #

    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        score = combined_df['{}_mva'.format(proc)]
        # Tag i is "bound[i] < score < bound[i-1]", which is only meaningful
        # for descending bounds. Sort explicitly rather than trusting the
        # mapping's iteration order (arbitrary for a plain dict in Python 2).
        tag_bounds = sorted(proc_to_tags[proc].values(), reverse=True)

        tag_masks = []
        for i_bound, bound in enumerate(tag_bounds):
            mask = presel & score.gt(bound)
            if i_bound > 0:
                # intermediate tag: also below the previous (higher) bound
                mask = mask & score.lt(tag_bounds[i_bound - 1])
            tag_masks.append(mask)

        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, list(range(len(tag_bounds))), default=-999)

    # PROC PRIORITY #

    # If an event satisfies more than one category, keep the one that comes
    # first in tag_sequence. np.select returns the choice of the first true
    # condition, so ordering the conditions by priority is sufficient.
    tag_priority_filter = [
        combined_df['{}_analysis_tag'.format(proc)].ne(-999)
        for proc in tag_sequence
    ]
    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_sequence, default='NOTAG')

    # FILL TREES BASED ON BOTH OF ABOVE #

    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # tree names: one per (true process, targeted category, tag index)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                # string equality, not identity: "is not" on str literals
                # only works by accident of interning
                if true_proc != 'Data':
                    tree_name = '{}_125_13TeV_{}cat{}'.format(
                        true_proc.lower(), target_proc.lower(), i_tag)
                else:
                    tree_name = '{}_13TeV_{}cat{}'.format(
                        true_proc, target_proc.lower(), i_tag)
                branch_names[true_proc].append(tree_name)

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply loops over events in Python; vectorise eventually
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees, then hadd procs together on the
    # command line
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')