Exemplo n.º 1
0
def main(options):
    """Plot every training variable for the samples named in the config.

    Parses the YAML file given by ``options.config``, loads the signal MC,
    background MC and data samples through ``ROOTHelpers`` and calls
    ``Plotter.plot_input`` once per entry of ``train_vars``.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``n_bins`` and
            ``ratio_plot``.
    """

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag        = config['output_tag']

    mc_dir            = config['mc_file_dir']
    mc_fnames         = config['mc_file_names']

    data_dir          = config['data_file_dir']
    data_fnames       = config['data_file_names']

    train_vars        = config['train_vars']
    vars_to_add       = config['vars_to_add']
    presel            = config['preselection']

    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour        = 'red'

                                       #Data handling stuff#

    #when reweighting, samples are loaded with the control region selection
    #and the output tag is extended accordingly
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #the reweighting is only (re)derived when the samples are reloaded
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

                                        #Plotter stuff#

    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)
    for var in train_vars:
        plotter.plot_input(var, options.n_bins, output_tag, options.ratio_plot, norm_to_data=True)
Exemplo n.º 2
0
def _plot_score_evolution(sample_df, var, plotter, bdt_bins, n_bins, colours,
                          caption, out_path):
    """Draw one figure of ``var`` split into windows of the model score.

    For each pair of adjacent entries in ``bdt_bins`` the rows of
    ``sample_df`` whose ``'bdt_score'`` lies strictly inside the window are
    histogrammed, with the ``'weight'`` column rescaled to sum to one, so
    only the shapes are compared. The figure is annotated, captioned, saved
    to ``out_path`` and closed.

    Args:
        sample_df: dataframe holding ``var``, ``'weight'`` and ``'bdt_score'``.
        var: name of the column to histogram.
        plotter: object providing ``var_to_xrange``; also forwarded to
            ``annotate_and_save``.
        bdt_bins: array of score boundaries.
        n_bins: number of points passed to ``np.linspace`` for the x binning.
        colours: list of colours; reused cyclically if there are more score
            windows than colours.
        caption: text drawn inside the axes.
        out_path: file the figure is saved to.
    """
    fig = plt.figure(1)
    axes = fig.gca()
    x_range = plotter.var_to_xrange[var]
    var_bins = np.linspace(x_range[0], x_range[1], n_bins)

    for i_window, (low, high) in enumerate(zip(bdt_bins[:-1], bdt_bins[1:])):
        in_window = np.logical_and(sample_df['bdt_score'] > low,
                                   sample_df['bdt_score'] < high)
        selected = sample_df[in_window]
        #normalise each window to unit area
        unit_weights = selected['weight'] / np.sum(selected['weight'])
        axes.hist(selected[var],
                  bins=var_bins,
                  label='{:.2f} $<$ MVA $<$ {:.2f}'.format(low, high),
                  weights=unit_weights,
                  histtype='step',
                  #wrap around so extra score windows do not raise IndexError
                  color=colours[i_window % len(colours)])

    annotate_and_save(axes, plotter, var)
    axes.text(0.95,
              0.6,
              caption,
              ha='right',
              va='bottom',
              transform=axes.transAxes,
              size=14)
    fig.savefig(out_path)
    plt.close()


def main(options):
    """Plot how input distributions change across windows of the DNN score.

    Loads signal and background MC, evaluates a saved keras model on them,
    stores the prediction as ``'bdt_score'`` on unscaled copies of the
    dataframes and, for windows defined by ``options.boundaries``, plots
    every training variable plus ``'dielectronMass'`` for signal, and
    ``'dielectronMass'`` for background.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``reload_samples``, ``model_architecture``,
            ``model``, ``boundaries`` and ``n_bins``.
    """

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']
    colours = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

    #Data handling stuff#

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        print(sig_obj.file_name)
        print(sig_obj.tree_name)
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)

    root_obj.concat()

    #Plotter stuff#

    #load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)
    #copies are taken before var_transform is called, so the plotted
    #variables are the ones stored on root_obj at this point
    unscaled_sig_df = root_obj.mc_df_sig.copy()
    unscaled_bkg_df = root_obj.mc_df_bkg.copy()

    # set up X and y Matrices
    LSTM.var_transform(do_data=False)
    X_tot, y_tot = LSTM.create_X_y()

    all_vars = flat_obj_vars + event_vars
    X_tot = X_tot[all_vars]  #filter unused vars
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot = LSTM.X_scaler.transform(X_tot)

    X_tot = pd.DataFrame(X_tot, columns=all_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])
    pred_prob_tot = model.predict([X_tot_high_level, X_tot_low_level],
                                  batch_size=1024).flatten()

    #add model predictions to the unscaled dataframes
    unscaled_sig_df['bdt_score'] = pred_prob_tot[y_tot == 1]
    unscaled_bkg_df['bdt_score'] = pred_prob_tot[y_tot == 0]

    train_vars = flat_obj_vars + event_vars
    plotter = Plotter(root_obj, train_vars, norm_to_data=True)
    #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
    bdt_bins = np.array(options.boundaries)
    plot_dir = '{}/plotting/plots/{}_sig_bkg_evo'.format(
        os.getcwd(), output_tag)
    Utils.check_dir(plot_dir)

    for var in train_vars + ['dielectronMass']:
        _plot_score_evolution(
            unscaled_sig_df, var, plotter, bdt_bins, options.n_bins, colours,
            'Simulated VBF signal',
            '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(
                os.getcwd(), output_tag, var))

    #plot background (check mass is not being sculpted)
    for var in ['dielectronMass']:
        _plot_score_evolution(
            unscaled_bkg_df, var, plotter, bdt_bins, options.n_bins, colours,
            'Simulated background',
            '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(
                os.getcwd(), output_tag, var))
Exemplo n.º 3
0
def main(options):
    """Optimise categories in the score of a previously trained DNN.

    Loads signal MC and either background MC or data (when
    ``options.data_as_bkg`` is set), evaluates a saved keras model on both,
    and runs ``CatOptim`` on the resulting scores: a single cut-based AMS
    when ``options.cut_based_str`` is non-empty, otherwise an optimisation
    for each number of categories in ``cats``. Finishes by calling
    ``Plotter.cats_vs_ams``.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``,
            ``cut_based_str``, ``model_architecture``, ``model`` and
            ``n_iters``.
    """

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag        = config['output_tag']

    mc_dir            = config['mc_file_dir']
    mc_fnames         = config['mc_file_names']

    data_dir          = config['data_file_dir']
    data_fnames       = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    object_vars       = config['object_vars']
    flat_obj_vars     = [var for i_object in object_vars for var in i_object]
    event_vars        = config['event_vars']
    vars_to_add       = config['vars_to_add']
    presel            = config['preselection']

                                       #Data handling stuff#

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, flat_obj_vars+event_vars, vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        #overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    #apply cut-based selection if not optimising the score (pred probs are
    #still evaluated for compatibility with the existing CatOptim constructor)
    if options.cut_based_str:
        root_obj.apply_more_cuts(options.cut_based_str)

                                       # DNN evaluation stuff #

    #load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y Matrices
    LSTM.var_transform(do_data=False)
    X_tot, y_tot     = LSTM.create_X_y()
    all_vars         = flat_obj_vars+event_vars
    X_tot            = X_tot[all_vars] #filter unused vars

    #apply the scaler saved during the previous dnn training
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot            = LSTM.X_scaler.transform(X_tot)

    #make 2D vars for LSTM layers
    X_tot            = pd.DataFrame(X_tot, columns=all_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level  = LSTM.join_objects(X_tot[flat_obj_vars])

    #predict probs
    pred_prob_tot    = model.predict([X_tot_high_level, X_tot_low_level], batch_size=1024).flatten()

    sig_weights   = root_obj.mc_df_sig['weight'].values
    sig_m_ee      = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = pred_prob_tot[y_tot==1]

    #take the background quantities from the sample that was actually loaded:
    #data is only loaded when data_as_bkg is set, otherwise the scores in
    #pred_prob_bkg belong to the MC background
    bkg_df        = root_obj.data_df if options.data_as_bkg else root_obj.mc_df_bkg
    bkg_weights   = bkg_df['weight'].values
    bkg_m_ee      = bkg_df['dielectronMass'].values
    pred_prob_bkg = pred_prob_tot[y_tot==0]

                                         #category optimisation stuff#

    #set up optimiser ranges and no. categories to test if non-cut based
    ranges    = [ [0.3,1.] ]
    names     = ['{} score'.format(output_tag)] #arbitrary
    print_str = ''
    cats = [1,2,3,4]
    AMS  = []

    if options.cut_based_str:
        #optimiser built with zero categories just to use its class methods
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], 0, ranges, names)
        #NOTE(review): AMS is a single value here but is still passed to
        #cats_vs_ams together with the full cats list below - confirm intended
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], n_cats, ranges, names)
            optimiser.optimise(1, options.n_iters) #set lumi to 1 as already scaled when loading in
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

    #make nCat vs AMS plots
    Plotter.cats_vs_ams(cats, AMS, output_tag)
Exemplo n.º 4
0
def main(options):
    """Train a BDT, or run/submit its hyper-parameter optimisation.

    After loading signal MC, background MC and data through ``ROOTHelpers``
    one of four modes runs, chosen from ``options``:

    * ``hp_perm`` given: k-fold train and validate one hyper-parameter
      permutation and record the result in the HP results file.
    * ``opt_hps``: delete any old HP results file and submit the search via
      ``batch_gs_cv``.
    * ``train_best``: read the last line of the HP results file, train with
      those parameters and make the ROC and output score plots.
    * otherwise: train with the default hyper-parameters and make the plots.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_train``, ``hp_perm``, ``opt_hps``, ``train_best`` and
            ``k_folds``.

    Raises:
        Exception: if ``hp_perm`` is combined with ``opt_hps``.
        ValueError: if ``opt_hps`` is requested with ``k_folds`` below 2.
    """
    import os  #local import: used to locate/delete the old HP results file

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    #data entries are required by the ROOTHelpers constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#

    #when reweighting, samples are loaded with the control region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT. The target selection (presel) is passed here
    #FIXME: reading files for the first time with pt_reweight but without
    #reload_samples skips the reweighting
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #BDT stuff#

    #set up X, w and y, train-test
    bdt_hee = BDTHelpers(root_obj,
                         train_vars,
                         options.train_frac,
                         eq_train=options.eq_train)
    bdt_hee.create_X_and_y(mass_res_reweight=True)

    #single results file shared by all modes. It is built before '_best' is
    #appended to output_tag, so train_best reads the same file that the
    #hp_perm jobs write and that opt_hps resets
    hp_opt_file = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)

    #run one HP permutation (the mode used by the submitted jobs)
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        if options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )

        print('About to train + validate on dataset with {} fold splitting'
              .format(options.k_folds))
        bdt_hee.set_hyper_parameters(options.hp_perm)
        bdt_hee.set_k_folds(options.k_folds)
        for i_fold in range(options.k_folds):
            bdt_hee.set_i_fold(i_fold)
            bdt_hee.train_classifier(root_obj.mc_dir, save=False)
            bdt_hee.validation_rocs.append(bdt_hee.compute_roc())
        #the with block closes the file; no explicit close() needed
        with open(hp_opt_file, 'a+') as val_roc_file:
            bdt_hee.compare_rocs(val_roc_file, options.hp_perm)

    #submit the HP search
    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        if options.k_folds < 2:
            raise ValueError('K-folds option must be at least 2')
        if os.path.isfile(hp_opt_file):
            #remove directly instead of building a shell command from paths
            os.remove(hp_opt_file)
            print('deleting: {}'.format(hp_opt_file))
        bdt_hee.batch_gs_cv(k_folds=options.k_folds,
                            pt_rew=options.pt_reweight)

    #train with the parameters on the last line of the results file
    elif options.train_best:
        with open(hp_opt_file, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))

        output_tag += '_best'
        bdt_hee.set_hyper_parameters(best_params)
        bdt_hee.train_classifier(root_obj.mc_dir,
                                 save=True,
                                 model_name=output_tag)
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag,
                                  ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight))

    #else just train BDT with default HPs
    else:
        bdt_hee.train_classifier(root_obj.mc_dir,
                                 save=True,
                                 model_name=output_tag + '_clf')
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag,
                                  ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight),
                                  log=True)
Exemplo n.º 5
0
def main(options):
    """Optimise categories in the score of a previously trained classifier.

    Loads signal MC and either background MC or data (when
    ``options.data_as_bkg`` is set), unpickles the classifier at
    ``options.model``, evaluates ``predict_proba`` on both samples and runs
    ``CatOptim``: a single cut-based AMS when ``options.cut_based_str`` is
    non-empty, otherwise an optimisation for each number of categories in
    ``cats``. Finishes by calling ``Plotter.cats_vs_ams``.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``, ``model``,
            ``cut_based_str`` and ``n_iters``.
    """

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
    root_obj.concat()

    print('loading classifier: {}'.format(options.model))
    #context manager so the model file is closed after unpickling
    #NOTE(review): pickle.load executes code embedded in the file; only load
    #models from a trusted source
    with open("{}".format(options.model), "rb") as model_file:
        clf = pickle.load(model_file)

    #apply cut-based selection if not optimising the score (pred probs are
    #still evaluated for compatibility with the existing CatOptim constructor)
    if options.cut_based_str:
        root_obj.apply_more_cuts(options.cut_based_str)

    sig_df = root_obj.mc_df_sig
    #background is whichever sample was loaded above
    bkg_df = root_obj.data_df if options.data_as_bkg else root_obj.mc_df_bkg

    sig_weights = sig_df['weight'].values
    sig_m_ee = sig_df['dielectronMass'].values
    #second column onwards of predict_proba, flattened
    pred_prob_sig = clf.predict_proba(sig_df[train_vars].values)[:, 1:].ravel()

    bkg_weights = bkg_df['weight'].values
    bkg_m_ee = bkg_df['dielectronMass'].values
    pred_prob_bkg = clf.predict_proba(bkg_df[train_vars].values)[:, 1:].ravel()

    #set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.15, 1.]]
    names = ['{} score'.format(output_tag)]  #arbitrary
    print_str = ''
    cats = [1, 2, 3, 4, 5]
    AMS = []

    if options.cut_based_str:
        #optimiser built with zero categories just to use its class methods
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                             ranges, names)
        #NOTE(review): AMS is a single value here but is still passed to
        #cats_vs_ams together with the full cats list below - confirm intended
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            #set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

    #make nCat vs AMS plots
    Plotter.cats_vs_ams(cats, AMS, output_tag)
Exemplo n.º 6
0
def main(options):
    """Train the LSTM network, or run/submit its hyper-parameter optimisation.

    After loading signal MC, background MC and data through ``ROOTHelpers``
    one of four modes runs, chosen from ``options``:

    * ``hp_perm`` given: train one hyper-parameter permutation and record the
      result in the HP results file.
    * ``opt_hps``: delete any old HP results file and submit the search via
      ``batch_gs_cv``.
    * ``train_best``: read the last line of the HP results file, train with
      those parameters and make the ROC and output score plots.
    * otherwise: train with the default architecture and make the plots.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_weights``, ``batch_boost``, ``hp_perm``, ``opt_hps`` and
            ``train_best``.

    Raises:
        Exception: if ``hp_perm`` is combined with ``opt_hps``.
    """
    import os  #local import: used to locate/delete the old HP results file

    #take options from the yaml config. Only the parse needs the open file
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary
    #objects; consider yaml.safe_load if configs can come from elsewhere
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#

    #when reweighting, samples are loaded with the control region selection
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel

    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add,
                           selection)

    #load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:  # for plotting
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT; only done when samples are reloaded
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #LSTM stuff#

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                    options.eq_weights, options.batch_boost)

    #the submitting process (opt_hps) does not train, so skips the set-up
    if not options.opt_hps:
        LSTM.var_transform(do_data=True)
        X_tot, y_tot = LSTM.create_X_y(mass_res_reweight=True)
        LSTM.split_X_y(X_tot, y_tot, do_data=True)

        if options.hp_perm is not None:
            #HP permutation jobs do not save the scaler
            LSTM.get_X_scaler(LSTM.all_vars_X_train,
                              out_tag=output_tag,
                              save=False)
        else:
            LSTM.get_X_scaler(LSTM.all_vars_X_train, out_tag=output_tag)
        LSTM.X_scale_train_test(do_data=True)
        LSTM.set_low_level_2D_test_train(do_data=True,
                                         ignore_train=options.batch_boost)

    #single results file shared by all modes. It is built before '_best' is
    #appended to output_tag, so train_best reads the same file that the
    #hp_perm jobs write and that opt_hps resets
    hp_opt_file = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)

    #functions called in subbed job, if options.opt_hps was true
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        if options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )

        LSTM.set_hyper_parameters(options.hp_perm)
        LSTM.model.summary()
        LSTM.train_w_batch_boost(out_tag=output_tag, save=False)
        #the with block closes the file; no explicit close() needed
        with open(hp_opt_file, 'a+') as val_roc_file:
            LSTM.compare_rocs(val_roc_file, options.hp_perm)

    #submit the HP search
    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        if os.path.isfile(hp_opt_file):
            #remove directly instead of building a shell command from paths
            os.remove(hp_opt_file)
            print('deleting: {}'.format(hp_opt_file))
        LSTM.batch_gs_cv(pt_rew=options.pt_reweight)

    #train with the parameters on the last line of the results file
    elif options.train_best:
        with open(hp_opt_file, 'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))

        output_tag += '_best'
        LSTM.set_hyper_parameters(best_params)
        LSTM.model.summary()
        LSTM.train_w_batch_boost(out_tag=output_tag)
        #compute final roc on test set
        LSTM.compute_roc(batch_size=1024)
        LSTM.plot_roc(output_tag)
        LSTM.plot_output_score(output_tag,
                               batch_size=1024,
                               ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))

    #else train with basic parameters/architecture
    else:
        LSTM.model.summary()
        if options.batch_boost:
            #type of model selection, so needs a validation set;
            #train_w_batch_boost handles that and the sequential saving
            LSTM.train_w_batch_boost(out_tag=output_tag)
        else:
            LSTM.train_network(epochs=5, batch_size=1024)
            LSTM.save_model(out_tag=output_tag)
        #compute final roc on test set
        LSTM.compute_roc(batch_size=1024)
        LSTM.plot_roc(output_tag)
        LSTM.plot_output_score(output_tag,
                               batch_size=1024,
                               ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))
Exemplo n.º 7
0
def main(options):
    """Save normalised signal-vs-background plots of each training variable.

    Parses the YAML file given by ``options.config``, loads signal and
    background MC through ``ROOTHelpers`` and, for every entry of
    ``train_vars``, overlays the two weighted, normalised histograms and
    saves the figure under ``plotting/plots/<output_tag>/normed/``.

    Args:
        options: parsed command line options. Attributes read here are
            ``config``, ``pt_reweight`` and ``reload_samples``.
    """

    #take options from the yaml config
    with open(options.config, 'r') as config_file:
        cfg               = yaml.load(config_file)
        output_tag        = cfg['output_tag']

        mc_dir            = cfg['mc_file_dir']
        mc_fnames         = cfg['mc_file_names']

        #data is not loaded in this script; entries kept for the constructor
        data_dir          = cfg['data_file_dir']
        data_fnames       = cfg['data_file_names']

        train_vars        = cfg['train_vars']
        vars_to_add       = cfg['vars_to_add']
        presel            = cfg['preselection']

        proc_to_tree_name = cfg['proc_to_tree_name']

        sig_colour        = 'forestgreen'
        bkg_colour        = 'violet' #not used below

                                           #Data handling stuff#

        #pick the selection the samples are loaded with
        if options.pt_reweight:
            load_selection = cfg['reweight_cr']
            output_tag += '_pt_reweighted'
        else:
            load_selection = presel

        #load the mc dataframe for all years
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                               proc_to_tree_name, train_vars, vars_to_add, load_selection)

        for signal_sample in root_obj.sig_objects:
            root_obj.load_mc(signal_sample, reload_samples=options.reload_samples)
        for background_sample in root_obj.bkg_objects:
            root_obj.load_mc(background_sample, bkg=True, reload_samples=options.reload_samples)
        root_obj.concat()

        #per-year reweighting, only when the samples are reloaded
        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

                                            #Plotter stuff#

        #axis ranges per variable
        with open('plotting/var_to_xrange.yaml', 'r') as range_file:
            var_to_xrange = yaml.load(range_file)['var_to_xrange']

        #axis label replacements per variable
        with open('plotting/var_to_xstring.yaml', 'r') as label_file:
            var_to_xstring = yaml.load(label_file)['var_to_xstring']

        plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)

        sig_df = root_obj.mc_df_sig
        bkg_df = root_obj.mc_df_bkg
        sig_label = plotter.sig_labels[0]+r' ($\mathrm{H}\rightarrow\mathrm{ee}$)'

        for var in train_vars:
            fig  = plt.figure(1)
            axes = fig.gca()

            x_low  = var_to_xrange[var][0]
            x_high = var_to_xrange[var][1]
            edges  = np.linspace(x_low, x_high, 56)

            #NOTE(review): 'normed' was replaced by 'density' in newer
            #matplotlib releases - confirm against the installed version
            common = dict(bins=edges, histtype='stepfilled', alpha=0.4, normed=True)

            #signal drawn on top (zorder=10) of background (zorder=0)
            axes.hist(sig_df[var].values, label=sig_label,
                      weights=sig_df['weight'].values, color='red', zorder=10, **common)
            axes.hist(bkg_df[var].values, label='Simulated background',
                      weights=bkg_df['weight'].values, color='blue', zorder=0, **common)

            axes.set_ylabel('Arbitrary Units', ha='right', y=1, size=13)
            #leave 20% headroom above the tallest bin
            y_top = axes.get_ylim()[1]
            axes.set_ylim(bottom=0, top=1.2*y_top)
            axes.set_xlim(left=x_low, right=x_high)
            axes.legend(bbox_to_anchor=(0.97,0.97), ncol=1)
            plotter.plot_cms_labels(axes, lumi='')

            axes.set_xlabel('{}'.format(var_to_xstring[var]), ha='right', x=1, size=13)

            work_dir = os.getcwd()
            Utils.check_dir('{}/plotting/plots/{}/normed/'.format(work_dir, output_tag))
            fig.savefig('{0}/plotting/plots/{1}/normed/{1}_{2}_normalised.pdf'.format(work_dir, output_tag, var))
            plt.close()
Exemplo n.º 8
0
def main(options):
    """Plot variable shapes in slices of the classifier output score.

    Reads the analysis configuration from ``options.config``, loads the signal
    and background simulation through ``ROOTHelpers``, evaluates the pickled
    classifier at ``options.model`` on both samples and then saves:

    * for every training variable plus ``dielectronMass``, the normalised
      signal distribution in each score bin given by ``options.boundaries``;
    * the normalised background ``dielectronMass`` distribution in the same
      score bins (a check that the mass is not being sculpted).

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``model``,
            ``boundaries`` and ``n_bins``.
    """

    #take options from the yaml config
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary python objects;
    #prefer yaml.safe_load if the config could ever come from an untrusted source
    with open(options.config, 'r') as config_file:
        config            = yaml.load(config_file)
        output_tag        = config['output_tag']

        mc_dir            = config['mc_file_dir']
        mc_fnames         = config['mc_file_names']

        #data is not loaded below, but the paths are still passed to the ROOTHelpers constructor
        data_dir          = config['data_file_dir']
        data_fnames       = config['data_file_names']

        train_vars        = config['train_vars']
        vars_to_add       = config['vars_to_add']
        presel            = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']
        colours           = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

                                           #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

                                            #Plotter stuff#

        #add model predictions to the sig and bkg dfs
        print('loading classifier: {}'.format(options.model))
        #close the model file deterministically instead of leaking the handle.
        #NOTE(review): unpickling runs code stored in the file - only load trusted models
        with open("{}".format(options.model), "rb") as model_file:
            clf = pickle.load(model_file)
        sig_df = root_obj.mc_df_sig
        sig_df['bdt_score'] = clf.predict_proba(sig_df[train_vars].values)[:,1:].ravel()
        bkg_df = root_obj.mc_df_bkg
        bkg_df['bdt_score'] = clf.predict_proba(bkg_df[train_vars].values)[:,1:].ravel()

        plotter  = Plotter(root_obj, train_vars)
        #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
        #for ggH, good set is: [0.10 0.30 0.45 0.53 0.60 0.8]
        bdt_bins = np.array(options.boundaries)
        n_score_bins = len(bdt_bins)-1
        Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(os.getcwd(), output_tag))

        #signal: one histogram per score bin, for every variable
        for var in train_vars+['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            for ibin in range(n_score_bins):
                #events with lower bound < score < upper bound (both edges exclusive)
                in_bin = np.logical_and(sig_df['bdt_score'] > bdt_bins[ibin], sig_df['bdt_score'] < bdt_bins[ibin+1])
                sig_in_bin = sig_df[in_bin]
                #cycle through the palette so more than len(colours) score bins does not raise IndexError
                axes.hist(sig_in_bin[var], bins=var_bins, label='{:.2f} $<$ MVA $<$ {:.2f}'.format(bdt_bins[ibin], bdt_bins[ibin+1]), weights=sig_in_bin['weight'], histtype='step', color=colours[ibin % len(colours)], normed=True)
            annotate_and_save(axes, plotter, var)
            out_name = '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var)
            fig.savefig(out_name)
            print('saving: {}'.format(out_name))
            plt.close()

        #plot background (check mass is not being sculpted)
        for var in ['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            for ibin in range(n_score_bins):
                in_bin = np.logical_and(bkg_df['bdt_score'] > bdt_bins[ibin], bkg_df['bdt_score'] < bdt_bins[ibin+1])
                bkg_in_bin = bkg_df[in_bin]
                axes.hist(bkg_in_bin[var], bins=var_bins, label='{:.2f} $<$ MVA $<$ {:.2f}'.format(bdt_bins[ibin], bdt_bins[ibin+1]), weights=bkg_in_bin['weight'], histtype='step', color=colours[ibin % len(colours)], normed=True)

            annotate_and_save(axes, plotter, var)
            fig.savefig('{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()
Exemplo n.º 9
0
def main(options):
    """Produce data/background comparison plots with systematic variations.

    Reads the analysis configuration from ``options.config``, loads the
    background simulation and the data through ``ROOTHelpers`` (with
    ``read_systs=True``), hands them to a ``DYPlotter`` and, for each variable
    to plot, draws data, backgrounds and systematics on a two-panel figure that
    is then saved as a pdf.

    The variables plotted are ``options.var_name`` if it is given, otherwise
    all training variables plus ``<dy_plotter.proc>_mva``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``systematics``, ``var_name``,
            ``mva_config`` and ``n_bins``.
    """

    #NOTE(review): yaml.load without an explicit Loader can build arbitrary python objects;
    #prefer yaml.safe_load if the config could ever come from an untrusted source
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        #check if dnn (lstm) variables need to be read in: these come as a dict
        #holding nested per-object lists plus a flat list of event-level variables
        varrs = config['train_vars']
        all_train_vars = []
        if isinstance(varrs, dict):
            object_vars = varrs['object_vars']
            flat_obj_vars = [
                var for i_object in object_vars for var in i_object
            ]
            event_vars = varrs['event_vars']
            all_train_vars += (flat_obj_vars + event_vars)
        else:
            all_train_vars = varrs

        vars_to_add = config['vars_to_add']
        presel = config['preselection']
        cut_map = config['cut_map']

        #Data handling stuff#

        #get the dataframe for all years
        root_obj = ROOTHelpers(output_tag,
                               mc_dir,
                               mc_fnames,
                               data_dir,
                               data_fnames,
                               proc_to_tree_name,
                               all_train_vars,
                               vars_to_add,
                               presel,
                               read_systs=True)

        #signal samples are deliberately not loaded here
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        #--------------------------------------------------------------------------------------------------

        dy_plotter = DYPlotter(root_obj, cut_map)
        #FIXME: samples read in for the first time without the reload flag are not re-weighted
        #(could instead take the reload flag from the data handler, since it triggers if no sample exists)
        if options.reload_samples:
            dy_plotter.pt_reweight()
        #FIXME still need this else dont remove variables again!
        dy_plotter.manage_memory(options.systematics,
                                 save=options.reload_samples)

        #DEBUG
        print('Background columns')
        print(root_obj.mc_df_bkg.columns[:])

        #label fixed: these are the data columns, not the background ones
        print('Data columns')
        print(root_obj.data_df.columns[:])

        #little bit hard coded - be careful if 'mva' not in MVA output name
        if (options.var_name is None) or ('mva' in options.var_name.lower()):
            dy_plotter.eval_mva(options.mva_config, output_tag)
        #--------------------------------------------------------------------------------------------------

        with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
            plot_config = yaml.load(plot_config_file)
            var_to_xrange = plot_config['var_to_xrange']

        with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
            plot_string_cfg = yaml.load(plot_config_file)
            var_to_xstring = plot_string_cfg['var_to_xstring']

        #both branches must bind the same name: the default branch used to assign
        #'var_to_plot', so the loop below raised NameError when no var_name was given
        if options.var_name is not None:
            vars_to_plot = [options.var_name]
        else:
            vars_to_plot = all_train_vars + [dy_plotter.proc + '_mva']

        for var in vars_to_plot:

            #MVA scores are binned on [0, 1]; other variables take their range from the yaml
            if 'mva' in var:
                var_bins = np.linspace(0, 1, options.n_bins)
            else:
                var_bins = np.linspace(var_to_xrange[var][0],
                                       var_to_xrange[var][1], options.n_bins)
            print('plotting var: {}'.format(var))
            fig, axes = plt.subplots(nrows=2,
                                     ncols=1,
                                     dpi=200,
                                     sharex=True,
                                     gridspec_kw={
                                         'height_ratios': [3, 0.8],
                                         'hspace': 0.08
                                     })

            cut_str = dy_plotter.get_cut_string(var)

            #data stuff
            data_binned, bin_centres, data_stat_down_up = dy_plotter.plot_data(
                cut_str, axes, var, var_bins)
            dy_plotter.plot_bkgs(cut_str, axes, var, var_bins, data_binned,
                                 bin_centres, data_stat_down_up)

            #syst stuff
            #test the variable being plotted rather than options.var_name: identical when
            #var_name is given, and no TypeError ('in' on None) when it is not
            dy_plotter.plot_systematics(
                cut_str,
                axes,
                var,
                var_bins,
                options.systematics,
                do_mva=('mva' in var))  #FIXME: make this more general

            axes = dy_plotter.set_canv_style(axes, var, var_bins)
            axes[0].legend(bbox_to_anchor=(0.97, 0.97), ncol=1)
            axes[1].set_xlabel(var_to_xstring[var], size=14, ha='right', x=1)
            Utils.check_dir('{}/plotting/plots/{}'.format(
                os.getcwd(), output_tag))
            #NOTE(review): the output path is a temporary hardcode and does not match the
            #directory created by check_dir above; the intended cwd-relative form is
            #'{0}/plotting/plots/{1}/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var)
            fig.savefig(
                '/vols/cms/jwd18/Hee/MLCategorisation/CMSSW_10_2_0/src/HToEE/plotting/plots/{0}/{0}_{1}.pdf'
                .format(output_tag, var))  #temp hardcode
            #release the figure so that memory does not grow with every variable plotted
            plt.close(fig)
Exemplo n.º 10
0
def main(options):
    """Categorise events with the VBF and ggH classifiers and write output trees.

    Steps, in order:

    1. read the sample configuration from ``options.config`` and load signal
       simulation plus either data (``options.data_as_bkg``) or background
       simulation through ``ROOTHelpers``;
    2. evaluate every classifier listed under ``models`` in
       ``options.bdt_config`` and store its score in ``<proc>_bdt``;
    3. assign a per-process tag number from the score boundaries
       (``<proc>_analysis_tag``, -999 when no tag is satisfied);
    4. resolve the priority between processes (VBF wins over ggH) into
       ``priority_tag``;
    5. write one ROOT file per (true process, category) to ``output_trees/``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg`` and ``bdt_config``.
    """

    #NOTE(review): yaml.load without an explicit Loader can build arbitrary python objects;
    #prefer yaml.safe_load if the config could ever come from an untrusted source
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #data is only loaded with data_as_bkg, but the paths are always passed to the constructor
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        #train_vars maps proc -> list of variables; flatten into one list for reading
        proc_to_train_vars = config['train_vars']
        all_train_vars = [
            item for sublist in proc_to_train_vars.values() for item in sublist
        ]

        vars_to_add = config['vars_to_add']

        #Data handling stuff#
        #apply loosest selection (ggh) first, else memory requirements are ridiculous. Fine to do this
        #since all cuts are looser than VBF (not removing events with higher priority)
        loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

        #load the mc dataframe for all years. Do not apply any specific preselection
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, all_train_vars,
                               vars_to_add, loosest_selection)
        root_obj.no_lumi_scale()
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if options.data_as_bkg:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
        else:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        root_obj.concat()

        #Tag sequence stuff#
    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    #the sample handler is not needed once the combined frame exists
    del root_obj

    #decide sequence of tags and specify preselection for use with numpy.select.
    #each entry is a one-element list holding the boolean mask for that process
    tag_sequence = ['VBF', 'ggH']
    proc_to_preselection = {
        'VBF': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
            & combined_df['dijetMass'].gt(350)
            & combined_df['leadJetPt'].gt(40)
            & combined_df['subleadJetPt'].gt(30)
        ],
        'ggH': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
        ]
    }

    with open(options.bdt_config, 'r') as bdt_config_file:
        config = yaml.load(bdt_config_file)
        proc_to_model = config['models']
        proc_to_tags = config['boundaries']

        #evaluate MVA scores used in categorisation
        for proc, model in proc_to_model.iteritems():
            print('evaluating classifier: {}'.format(model))
            #close the model file deterministically instead of leaking the handle.
            #NOTE(review): unpickling runs code stored in the file - only load trusted models
            with open('models/{}'.format(model), "rb") as model_file:
                clf = pickle.load(model_file)
            train_vars = proc_to_train_vars[proc]
            combined_df[proc + '_bdt'] = clf.predict_proba(
                combined_df[train_vars].values)[:, 1:].ravel()

        # TAG NUMBER #

        #decide on tag
        for proc in tag_sequence:
            presel = proc_to_preselection[proc]
            #NOTE(review): relies on .values() returning the bounds in tag order (tag 0 first) - confirm
            tag_bounds = proc_to_tags[proc].values()
            proc_score = combined_df['{}_bdt'.format(proc)]
            tag_masks = []
            for i_bound in range(len(tag_bounds)):  #index loop: each tag also needs the previous bound
                if i_bound == 0:  #first bound, tag 0: only a lower edge
                    tag_masks.append(presel[0]
                                     & proc_score.gt(tag_bounds[i_bound]))
                else:  #intermed bound: between this bound and the previous one
                    tag_masks.append(presel[0]
                                     & proc_score.lt(tag_bounds[i_bound - 1])
                                     & proc_score.gt(tag_bounds[i_bound]))

            mask_key = list(range(len(tag_bounds)))

            combined_df['{}_analysis_tag'.format(proc)] = np.select(
                tag_masks, mask_key, default=-999)

        # PROC PRIORITY #

        # deduce tag priority: if two or more tags satisfied then set final tag to highest priority tag.
        # TODO: make this non hardcoded i.e. compare proc in position 1 to all lower priority positions,
        # then compare proc in pos 2 ...
        tag_priority_filter = [
            combined_df['VBF_analysis_tag'].ne(-999)
            & combined_df['ggH_analysis_tag'].ne(-999),  # 1) if both filled...
            combined_df['VBF_analysis_tag'].ne(-999)
            & combined_df['ggH_analysis_tag'].eq(
                -999),  # 2) if VBF filled and ggH not, take VBF
            combined_df['VBF_analysis_tag'].eq(-999)
            & combined_df['ggH_analysis_tag'].ne(
                -999),  # 3) if ggH filled and VBF not, take ggH
        ]

        tag_priority_key = [
            'VBF',  #1) take VBF
            'VBF',  #2) take VBF
            'ggH',  #3) take ggH
        ]
        combined_df['priority_tag'] = np.select(
            tag_priority_filter, tag_priority_key,
            default='NOTAG')  # events passing neither selection get NOTAG

        # FILL TREES BASED ON BOTH OF ABOVE
        tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
        combined_df['dZ'] = float(0.)
        combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

        #get tree names: for each true proc, one name per (target proc, tag) pair
        branch_names = {}
        for true_proc in tag_sequence + ['Data']:
            branch_names[true_proc] = []
            for target_proc in tag_sequence:
                for i_tag in range(len(proc_to_tags[target_proc].values())):
                    #compare by value: 'is not' on a string literal tests object identity,
                    #which is only accidentally true thanks to interning
                    if true_proc != 'Data':
                        branch_names[true_proc].append(
                            '{}_125_13TeV_{}cat{}'.format(
                                true_proc.lower(), target_proc.lower(), i_tag))
                    else:
                        branch_names[true_proc].append(
                            '{}_13TeV_{}cat{}'.format(true_proc,
                                                      target_proc.lower(),
                                                      i_tag))

        debug_vars = [
            'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
        ]
        #FIXME: row-wise apply loops over events; vectorise eventually
        combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
        print(combined_df[debug_vars + ['tree_name']])

        if not path.isdir('output_trees/'):
            print('making directory: {}'.format('output_trees/'))
            system('mkdir -p %s' % 'output_trees/')

        #have to save individual trees then hadd procs together on the command line.
        for proc in tag_sequence + ['Data']:
            selected_df = combined_df[combined_df.proc == proc]
            for bn in branch_names[proc]:
                print(bn)
                branch_selected_df = selected_df[selected_df.tree_name == bn]
                print(branch_selected_df[debug_vars + ['tree_name']].head(20))
                root_pandas.to_root(branch_selected_df[tree_vars],
                                    'output_trees/{}.root'.format(bn),
                                    key=bn)
                print('')
Exemplo n.º 11
0
def main(options):
    """Run the tagging sequence for one year of samples and fill output trees.

    Reads the sample configuration from ``options.config``, loads signal
    simulation plus either background simulation or data
    (``options.data_as_bkg``) through ``ROOTHelpers``, evaluates the
    classifiers listed in ``options.mva_config`` (BDT or LSTM/DNN) through a
    ``taggerBase`` object and lets that object decide tags, priorities and
    tree names before filling the trees.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``syst_name``, ``dump_weight_systs``, ``data_only``,
            ``data_as_bkg``, ``reload_samples`` and ``mva_config``.

    Raises:
        IOError: if tree systematics and weight variations are requested
            together, if systematics are requested with ``data_only``, if more
            than one year was read in, or if a model entry in the MVA config
            is neither a dict nor a string.
    """

    #NOTE(review): yaml.load without an explicit Loader can build arbitrary python objects;
    #prefer yaml.safe_load if the config could ever come from an untrusted source
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #the data paths are always passed to the constructor, even when data is not loaded
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        #check if dnn (lstm) variables need to be read in: those come as a dict holding
        #nested per-object lists plus a flat list of event-level variables
        proc_to_train_vars = config['train_vars']
        all_train_vars = []
        for proc, varrs in proc_to_train_vars.iteritems():
            if isinstance(varrs, dict):
                object_vars = proc_to_train_vars[proc]['object_vars']
                flat_obj_vars = [
                    var for i_object in object_vars for var in i_object
                ]
                event_vars = proc_to_train_vars[proc]['event_vars']
                all_train_vars += (flat_obj_vars + event_vars)
            else:
                all_train_vars += varrs

        vars_to_add = config['vars_to_add']

        #tree systematics are read only when a systematic name was given
        read_syst = options.syst_name is not None

        if read_syst and options.dump_weight_systs:
            raise IOError(
                'Cannot dump weight variations and tree systematics at the same time. Please run separately for each.'
            )
        if options.data_only and (read_syst or options.dump_weight_systs):
            raise IOError('Cannot read Data and apply systematic shifts')

        #Data handling stuff#
        #apply loosest selection (ggh) first, else memory requirements are ridiculous. Fine to do this
        #since all cuts are looser than VBF (not removing events with higher priority).
        #also note we norm the MC before applying this cut. In data we apply it when reading in.
        #the lepton pT/M cuts cannot be applied here since these vars change with systematics!
        loosest_selection = 'dielectronMass > 100'

        #load the mc dataframe for all years. Do not apply any specific preselection to sim samples
        root_obj = ROOTHelpers(output_tag,
                               mc_dir,
                               mc_fnames,
                               data_dir,
                               data_fnames,
                               proc_to_tree_name,
                               all_train_vars,
                               vars_to_add,
                               loosest_selection,
                               read_systs=(read_syst
                                           or options.dump_weight_systs))
        root_obj.no_lumi_scale()
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if not options.data_as_bkg:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        else:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
            #overwrite background attribute, for compat with DNN class
            #NOTE(review): this runs before concat() - confirm data_df is already populated here
            root_obj.mc_df_bkg = root_obj.data_df
        root_obj.concat()

        #get year of samples for root obj and check we didn't accidentally read in more than 1 year
        if len(root_obj.years) != 1:
            raise IOError(
                'Reading in more than one year at a time! Tagging should be split by year'
            )
        year = list(root_obj.years)[0]
        if ("2016" in year) and (not options.data_only):
            root_obj.scale_sig_partial_2016()  #FIXME: check this is actually called

    #taking only the signal df when reading systematics doesn't work with the DNN set up,
    #since the bkg class is needed in its constructor - so always combine sig and bkg
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    #Tag sequence stuff#
    #specify sequence of tags and preselection targetting each

    tag_sequence = ['VBF', 'ggH']  #categories targetted
    true_procs = ['VBF', 'ggH', 'ttH']  #procs to run through cats
    if (not read_syst) and (not options.dump_weight_systs):
        #is this needed? guess so since could run mc and data together in a stat-only config
        true_procs.append('Data')
    if options.data_only:
        true_procs = ['Data']  #do data on its own (for memory really)

    #create tag object
    tag_obj = taggerBase(tag_sequence,
                         true_procs,
                         combined_df,
                         syst_name=options.syst_name)
    if read_syst:
        tag_obj.relabel_syst_vars()  #not run if reading weight systematics

    #get models and tag boundaries from config
    with open(options.mva_config, 'r') as mva_config_file:
        config = yaml.load(mva_config_file)
        proc_to_model = config['models']
        tag_boundaries = config['boundaries']

        #evaluate MVA scores used in categorisation
        for proc, model in proc_to_model.iteritems():
            #a dict model entry is evaluated as a DNN (lstm), a string entry as a BDT
            if isinstance(model, dict):
                object_vars = proc_to_train_vars[proc]['object_vars']
                flat_obj_vars = [
                    var for i_object in object_vars for var in i_object
                ]
                event_vars = proc_to_train_vars[proc]['event_vars']

                dnn_loaded = tag_obj.load_dnn(proc, model)
                train_tag = model['architecture'].split('_model')[0]
                tag_obj.eval_lstm(dnn_loaded, train_tag, root_obj, proc,
                                  object_vars, flat_obj_vars, event_vars)

            elif isinstance(model, str):
                tag_obj.eval_bdt(proc, model, proc_to_train_vars[proc])
            else:
                raise IOError(
                    'Did not get a classifier models in correct format in config'
                )

    del root_obj

    #need to do this after eval MVAs, since LSTM class used in eval_lstm needs some Data in df for constructor
    if (read_syst or options.dump_weight_systs):
        tag_obj.combined_df = tag_obj.combined_df[
            tag_obj.combined_df.proc != 'Data'].copy()  #avoid copy warnings later
    tag_preselection = tag_obj.get_tag_preselection()

    #set up tag boundaries for each process being targeted
    tag_obj.decide_tag(tag_preselection, tag_boundaries)
    tag_obj.decide_priority()
    branch_names = tag_obj.get_tree_names(tag_boundaries, year)
    tag_obj.set_tree_names(tag_boundaries, options.dump_weight_systs, year)
    tag_obj.fill_trees(branch_names, year, print_yields=not read_syst)
    #disabled (struct error?): when not read_syst, tag_obj.plot_matrix(branch_names, output_tag)
Exemplo n.º 12
0
def main(options):
    """Plot the mass variable in each MVA category, normalised to data.

    Reads the analysis configuration from ``options.config``, loads signal,
    background and data through ``ROOTHelpers``, evaluates the BDT configured
    for ``options.mva_proc`` in ``options.mva_config`` on all three samples
    (stored as ``<mva_proc>_mva``) and calls ``Plotter.plot_input`` once per
    category, passing the category's score window as an extra cut.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``mva_config``,
            ``mva_proc``, ``mass_var_name``, ``n_bins`` and ``ratio_plot``.

    Raises:
        IOError: if the configured model is not given as a string (only BDTs
            are supported here).
    """

    #take options from the yaml config
    #NOTE(review): yaml.load without an explicit Loader can build arbitrary python objects;
    #prefer yaml.safe_load if the config could ever come from an untrusted source
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        train_vars = config['train_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']

        #sig_colour        = 'forestgreen'
        sig_colour = 'red'

        #Data handling stuff#

        #load the mc dataframe for all years; when reweighting, read with the
        #'reweight_cr' selection instead of the nominal preselection
        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                                   data_fnames, proc_to_tree_name, train_vars,
                                   vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples:
            root_obj.apply_pt_rew('DYMC', presel)

        #load MVA
        with open(options.mva_config, 'r') as mva_config_file:
            config = yaml.load(mva_config_file)
            model = config['models'][options.mva_proc]
            boundaries = config['boundaries'][options.mva_proc]

            #add DNN later
            if isinstance(model, str):
                print('evaluating BDT: {}'.format(model))
                #close the model file deterministically instead of leaking the handle.
                #NOTE(review): unpickling runs code stored in the file - only load trusted models
                with open('models/{}'.format(model), "rb") as model_file:
                    clf = pickle.load(model_file)
                score_name = options.mva_proc + '_mva'
                #signal-class probability for every sample
                root_obj.mc_df_sig[score_name] = clf.predict_proba(
                    root_obj.mc_df_sig[train_vars].values)[:, 1:].ravel()
                root_obj.mc_df_bkg[score_name] = clf.predict_proba(
                    root_obj.mc_df_bkg[train_vars].values)[:, 1:].ravel()
                root_obj.data_df[score_name] = clf.predict_proba(
                    root_obj.data_df[train_vars].values)[:, 1:].ravel()

            else:
                raise IOError(
                    'Did not get a classifier models in correct format in config'
                )

        #Plotter stuff#

        plotter = Plotter(root_obj,
                          train_vars,
                          sig_col=sig_colour,
                          norm_to_data=True)
        #one plot per category; boundaries are looked up by 'tag_<i>' key
        for cat_counter in range(len(boundaries)):
            if cat_counter == 0:
                #first category: only bounded from below
                extra_cuts = options.mva_proc + '_mva >' + str(
                    boundaries['tag_0'])
            else:
                #later categories: between this boundary and the previous one
                extra_cuts = (options.mva_proc + '_mva <' + str(
                    boundaries['tag_' + str(cat_counter - 1)])) + ' and ' + (
                        options.mva_proc + '_mva >' +
                        str(boundaries['tag_' + str(cat_counter)]))
            plotter.plot_input(options.mass_var_name,
                               options.n_bins,
                               output_tag,
                               options.ratio_plot,
                               norm_to_data=True,
                               extra_cuts=extra_cuts,
                               extra_tag=cat_counter,
                               blind=True)
Exemplo n.º 13
0
def main(options):
    """Score events with the ggH BDT and VBF LSTM, tag them, and write one tree per category.

    Steps performed:
      1. Read sample locations and training variables from ``options.config``.
      2. Load signal plus either MC background or data (``options.data_as_bkg``)
         through ``ROOTHelpers``, with only a loose mass-window selection.
      3. Evaluate the ggH classifier and the VBF network named in
         ``options.mva_config`` on the concatenated signal+background frame.
      4. Assign a per-process analysis tag from the configured score
         boundaries, then resolve overlaps using the order of ``tag_sequence``.
      5. Write one ROOT file per (true process, target category) into
         ``output_trees/``.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``mva_config``, ``reload_samples`` and ``data_as_bkg``.
    """

    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # Python objects. Only run on trusted configs; consider yaml.safe_load.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    object_vars = proc_to_train_vars['VBF']['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = proc_to_train_vars['VBF']['event_vars']

    # used to check all vars we need for categorisation are in our dfs
    all_train_vars = proc_to_train_vars['ggH'] + flat_obj_vars + event_vars

    vars_to_add = config['vars_to_add']

    # Data handling stuff #

    # Apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since these cuts are all looser than VBF
    # (not removing events with higher priority).
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    # Tag sequence stuff #

    # NOTE: these must be concatenated in the same way they are concatenated in
    # LSTM_DNN.create_X_y(), else the predictions are misaligned with the rows.
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # Sequence of tags in priority order (highest priority first) and the
    # boolean preselection mask for each process.
    tag_sequence = ['VBF', 'ggH']
    ggh_presel = (combined_df['dielectronMass'].gt(110)
                  & combined_df['dielectronMass'].lt(150)
                  & combined_df['leadElectronPToM'].gt(0.333)
                  & combined_df['subleadElectronPToM'].gt(0.25))
    # VBF preselection is the ggH one plus the dijet requirements
    vbf_presel = (ggh_presel
                  & combined_df['dijetMass'].gt(350)
                  & combined_df['leadJetPt'].gt(40)
                  & combined_df['subleadJetPt'].gt(30))
    proc_to_preselection = {'VBF': vbf_presel, 'ggH': ggh_presel}

    # GET MVA SCORES #

    # NOTE(review): same yaml.load caveat as for the main config.
    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.load(mva_config_file)
    proc_to_model = mva_config['models']
    proc_to_tags = mva_config['boundaries']

    # evaluate ggH BDT scores
    print('evaluating ggH classifier: {}'.format(proc_to_model['ggH']))
    # NOTE(review): unpickling executes arbitrary code; model files must be trusted.
    with open('models/{}'.format(proc_to_model['ggH']), 'rb') as clf_file:
        clf = pickle.load(clf_file)
    train_vars = proc_to_train_vars['ggH']
    # keep only the last probability column of predict_proba
    combined_df['ggH_mva'] = clf.predict_proba(
        combined_df[train_vars].values)[:, 1:].ravel()

    # Evaluate VBF LSTM
    print('loading VBF DNN:')
    with open('models/{}'.format(proc_to_model['VBF']['architecture']),
              'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('models/{}'.format(proc_to_model['VBF']['model']))

    lstm = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y matrices. Log variables that have GeV units
    lstm.var_transform(do_data=False)
    X_tot, y_tot = lstm.create_X_y()
    X_tot = X_tot[flat_obj_vars + event_vars]  # filter unused vars
    print(np.isnan(X_tot).any())

    # scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
    lstm.load_X_scaler(out_tag='VBF_DNN')
    X_tot = lstm.X_scaler.transform(X_tot)

    # make 2D vars for LSTM layers
    X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level = lstm.join_objects(X_tot[flat_obj_vars])

    # Predict probabilities. These correspond to the rows of combined_df only
    # because the dfs are concatenated in the same order internally (see NOTE above).
    combined_df['VBF_mva'] = model.predict(
        [X_tot_high_level, X_tot_low_level], batch_size=1).flatten()

    # TAG NUMBER #

    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        mva_score = combined_df['{}_mva'.format(proc)]
        # NOTE(review): relies on the boundaries mapping iterating in tag order
        # (first value = lower edge of tag 0); plain dicts on Python 2 do not
        # guarantee this -- confirm against the config loader.
        tag_bounds = list(proc_to_tags[proc].values())

        tag_masks = []
        for i_bound, lower_edge in enumerate(tag_bounds):
            mask = presel & mva_score.gt(lower_edge)
            if i_bound > 0:
                # later tags are additionally capped by the previous boundary
                mask = mask & mva_score.lt(tag_bounds[i_bound - 1])
            tag_masks.append(mask)

        # tag index = position of the first satisfied mask, -999 if none
        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, list(range(len(tag_bounds))), default=-999)

    # PROC PRIORITY #

    # np.select takes the first satisfied condition, so listing the "has a tag"
    # masks in tag_sequence order gives the highest-priority filled tag
    # (VBF before ggH), and 'NOTAG' when no process tagged the event.
    tag_priority_filter = [
        combined_df['{}_analysis_tag'.format(proc)].ne(-999)
        for proc in tag_sequence
    ]
    combined_df['priority_tag'] = np.select(tag_priority_filter,
                                            tag_sequence,
                                            default='NOTAG')

    # FILL TREES BASED ON BOTH OF ABOVE
    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # get tree names: one per (true process, target process, tag index)
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        # compare by value; "is not 'Data'" only worked via string interning
        if true_proc != 'Data':
            name_prefix = '{}_125_13TeV'.format(true_proc.lower())
        else:
            name_prefix = '{}_13TeV'.format(true_proc)

        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            for i_tag in range(len(proc_to_tags[target_proc])):
                branch_names[true_proc].append('{}_{}cat{}'.format(
                    name_prefix, target_proc.lower(), i_tag))

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    # FIXME: row-wise apply is slow; vectorise eventually.
    # assign_tree is defined outside this function.
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees then hadd procs together on the command line.
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')