Python ROOTHelpers.concat примеры использования

Язык программирования: Python

Пространство имен/Пакет: DataHandling

Класс/Тип: ROOTHelpers

Метод/Функция: concat

Примеров на hotexamples.com: 13

Python ROOTHelpers.concat - 13 примеров найдено. Это лучшие примеры Python кода для DataHandling.ROOTHelpers.concat, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

ROOTHelpers(13)

concat(13)

load_mc(13)

load_data(10)

apply_pt_rew(5)

pt_reweight(5)

mc_df_bkg(3)

no_lumi_scale(3)

apply_more_cuts(2)

scale_sig_partial_2016(1)

Пример #1

Показать файл

Файл: plot_input_features.py Проект: Joe-W-Davies/HToEE

def main(options):
    """Plot the input-feature distributions for the configured samples.

    Reads the sample locations, variable lists and selections from the YAML
    file at ``options.config``, loads signal MC, background MC and data
    through ``ROOTHelpers``, optionally applies the pT reweighting, and then
    calls ``Plotter.plot_input`` once per training variable.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``n_bins`` and
            ``ratio_plot``.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects; prefer yaml.safe_load if the config is not trusted.
    # The handle is only needed for parsing, so it is closed straight away.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag        = config['output_tag']

    mc_dir            = config['mc_file_dir']
    mc_fnames         = config['mc_file_names']

    #data not needed yet, could use this for validation later. keep for compat with class
    data_dir          = config['data_file_dir']
    data_fnames       = config['data_file_names']

    train_vars        = config['train_vars']
    vars_to_add       = config['vars_to_add']
    presel            = config['preselection']

    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour        = 'red'

                                       #Data handling stuff#

    # A leftover sys.exit(1) used to sit here and aborted the script before
    # any sample was loaded; it has been removed so the plots are produced.

    # With pT reweighting the samples are loaded with the control-region
    # selection; the nominal preselection is handed to apply_pt_rew below.
    if options.pt_reweight:
        cr_selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
        selection = cr_selection
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, train_vars, vars_to_add, selection)

    #load the mc and data dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    # reweighting is only (re)applied when the samples are re-read from file
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

                                        #Plotter stuff#

    plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)
    for var in train_vars:
        plotter.plot_input(var, options.n_bins, output_tag, options.ratio_plot, norm_to_data=True)

Пример #2

Показать файл

Файл: plot_feature_evolution_dnn.py Проект: edjtscott/HToEE

def _draw_score_slices(axes, frame, var, var_bins, bdt_bins, colours):
    """Overlay one unit-normalised step histogram of ``var`` per score slice.

    For each pair of neighbouring edges in ``bdt_bins`` the rows of ``frame``
    whose ``bdt_score`` lies strictly between the two edges are drawn on
    ``axes``, weighted by their ``weight`` column rescaled to sum to one.
    The i-th slice is drawn with ``colours[i]``.
    """
    for i_slice in range(len(bdt_bins) - 1):
        lower = bdt_bins[i_slice]
        upper = bdt_bins[i_slice + 1]
        in_slice = np.logical_and(frame['bdt_score'] > lower,
                                  frame['bdt_score'] < upper)
        values = frame[in_slice][var]
        slice_weights = frame[in_slice]['weight']
        slice_weights /= np.sum(slice_weights)
        axes.hist(values,
                  bins=var_bins,
                  label='{:.2f} $<$ MVA $<$ {:.2f}'.format(lower, upper),
                  weights=slice_weights,
                  histtype='step',
                  color=colours[i_slice])


def main(options):
    """Plot how variable shapes evolve across slices of the DNN output score.

    Loads signal and background MC as described by the YAML file at
    ``options.config``, evaluates a saved keras model on them, stores the
    prediction in a ``bdt_score`` column and, for every training variable
    plus ``dielectronMass``, overlays the signal distribution in each score
    slice given by ``options.boundaries``. The background is drawn the same
    way for ``dielectronMass`` only.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``model_architecture``,
            ``model``, ``boundaries`` and ``n_bins``.
    """
    # pull every setting out of the yaml config
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        object_vars = config['object_vars']
        # flatten the per-object variable lists into a single list
        flat_obj_vars = []
        for obj_group in object_vars:
            flat_obj_vars.extend(obj_group)
        event_vars = config['event_vars']
        vars_to_add = config['vars_to_add']
        presel = config['preselection']
        colours = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

        # ---- data handling ----

        # build the sample handler and read the MC for all years
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name,
                               flat_obj_vars + event_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            print(sig_obj.file_name)
            print(sig_obj.tree_name)
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)

        root_obj.concat()

        # ---- model evaluation ----

        # rebuild the network from its json architecture and saved weights
        print('loading DNN: {}'.format(options.model_architecture))
        with open('{}'.format(options.model_architecture), 'r') as model_json:
            model_architecture = model_json.read()
        model = keras.models.model_from_json(model_architecture)
        model.load_weights('{}'.format(options.model))

        LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)
        # copies taken before var_transform is called on the LSTM helper
        unscaled_sig_df = root_obj.mc_df_sig.copy()
        unscaled_bkg_df = root_obj.mc_df_bkg.copy()

        # feature matrix and labels
        LSTM.var_transform(do_data=False)
        X_tot, y_tot = LSTM.create_X_y()

        X_tot = X_tot[flat_obj_vars + event_vars]  # drop columns the model does not use
        LSTM.load_X_scaler(out_tag=output_tag)
        X_tot = LSTM.X_scaler.transform(X_tot)

        X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
        X_tot_high_level = X_tot[event_vars].values
        X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])
        raw_predictions = model.predict([X_tot_high_level, X_tot_low_level],
                                        batch_size=1024)
        pred_prob_tot = raw_predictions.flatten()

        # attach the score to the untransformed frames (label 1 = signal)
        unscaled_sig_df['bdt_score'] = pred_prob_tot[y_tot == 1]
        unscaled_bkg_df['bdt_score'] = pred_prob_tot[y_tot == 0]

        # ---- plotting ----

        train_vars = flat_obj_vars + event_vars
        plotter = Plotter(root_obj, train_vars, norm_to_data=True)
        #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
        bdt_bins = np.array(options.boundaries)
        Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(
            os.getcwd(), output_tag))

        # signal: every training variable plus the dielectron mass
        for var in train_vars + ['dielectronMass']:
            fig = plt.figure(1)
            axes = fig.gca()
            x_range = plotter.var_to_xrange[var]
            var_bins = np.linspace(x_range[0], x_range[1], options.n_bins)
            _draw_score_slices(axes, unscaled_sig_df, var, var_bins,
                               bdt_bins, colours)
            annotate_and_save(axes, plotter, var)
            axes.text(0.95, 0.6, 'Simulated VBF signal',
                      ha='right', va='bottom',
                      transform=axes.transAxes, size=14)
            fig.savefig(
                '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(
                    os.getcwd(), output_tag, var))
            plt.close()

        # background: mass only (check mass is not being sculpted)
        for var in ['dielectronMass']:
            fig = plt.figure(1)
            axes = fig.gca()
            x_range = plotter.var_to_xrange[var]
            var_bins = np.linspace(x_range[0], x_range[1], options.n_bins)
            _draw_score_slices(axes, unscaled_bkg_df, var, var_bins,
                               bdt_bins, colours)

            annotate_and_save(axes, plotter, var)
            axes.text(0.95, 0.6, 'Simulated background',
                      ha='right', va='bottom',
                      transform=axes.transAxes, size=14)
            fig.savefig(
                '{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(
                    os.getcwd(), output_tag, var))
            plt.close()

Пример #3

Показать файл

Файл: dnn_category_opt.py Проект: edjtscott/HToEE

def main(options):
    """Optimise category boundaries on the DNN score, or evaluate a cut-based AMS.

    Loads signal MC and a background sample (background MC, or data when
    ``options.data_as_bkg`` is set) as described by the YAML file at
    ``options.config``, evaluates a saved keras model on them and passes the
    weights, dielectron masses and predicted scores to ``CatOptim``.
    Finally ``Plotter.cats_vs_ams`` is called with the tested category
    counts and the resulting AMS values.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``,
            ``cut_based_str``, ``model_architecture``, ``model`` and
            ``n_iters``.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects; prefer yaml.safe_load if the config is not trusted.
    # The handle is only needed for parsing, so it is closed straight away.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag        = config['output_tag']

    mc_dir            = config['mc_file_dir']
    mc_fnames         = config['mc_file_names']

    data_dir          = config['data_file_dir']
    data_fnames       = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    object_vars       = config['object_vars']
    flat_obj_vars     = [var for i_object in object_vars for var in i_object]
    event_vars        = config['event_vars']
    vars_to_add       = config['vars_to_add']
    presel            = config['preselection']

                                       #Data handling stuff#

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames,
                           proc_to_tree_name, flat_obj_vars+event_vars, vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        #overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    #apply cut-based selection if not optimising the score (pred probs are
    #still evaluated for compatibility with the existing CatOptim constructor)
    use_cut_based = len(options.cut_based_str) > 0
    if use_cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

                                       # DNN evaluation stuff #

    #load architecture and model weights
    print('loading DNN: {}'.format(options.model_architecture))
    with open('{}'.format(options.model_architecture), 'r') as model_json:
        model_architecture = model_json.read()
    model = keras.models.model_from_json(model_architecture)
    model.load_weights('{}'.format(options.model))

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

    # set up X and y Matrices. Log variables that have GeV units
    LSTM.var_transform(do_data=False) #bkg=data here. This option is for plotting purposes
    X_tot, y_tot     = LSTM.create_X_y()
    X_tot            = X_tot[flat_obj_vars+event_vars] #filter unused vars

    #scale X_vars to mean=0 and std=1. Use scaler fit during previous dnn training
    LSTM.load_X_scaler(out_tag=output_tag)
    X_tot            = LSTM.X_scaler.transform(X_tot)

    #make 2D vars for LSTM layers
    X_tot            = pd.DataFrame(X_tot, columns=flat_obj_vars+event_vars)
    X_tot_high_level = X_tot[event_vars].values
    X_tot_low_level  = LSTM.join_objects(X_tot[flat_obj_vars])

    #predict probs
    pred_prob_tot    = model.predict([X_tot_high_level, X_tot_low_level], batch_size=1024).flatten()

    sig_weights   = root_obj.mc_df_sig['weight'].values
    sig_m_ee      = root_obj.mc_df_sig['dielectronMass'].values
    pred_prob_sig = pred_prob_tot[y_tot==1]

    # Take the background from the sample that was actually loaded above.
    # Data is only read when data_as_bkg is set, so reading data_df
    # unconditionally was wrong on the MC-background path.
    bkg_df        = root_obj.data_df if options.data_as_bkg else root_obj.mc_df_bkg
    bkg_weights   = bkg_df['weight'].values
    bkg_m_ee      = bkg_df['dielectronMass'].values
    pred_prob_bkg = pred_prob_tot[y_tot==0]

                                         #category optimisation stuff#

    #set up optimiser ranges and no. categories to test if non-cut based
    ranges    = [ [0.3,1.] ]
    names     = ['{} score'.format(output_tag)] #arbitrary
    print_str = ''
    cats = [1,2,3,4]
    AMS  = []

    if use_cut_based:
        #zero categories: the optimiser object is only built to use its class methods
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], 0, ranges, names)
        # NOTE(review): AMS is formatted as a single number below but is then
        # passed to cats_vs_ams alongside the list `cats` -- confirm the
        # plotting helper copes with that.
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig], bkg_weights, bkg_m_ee, [pred_prob_bkg], n_cats, ranges, names)
            optimiser.optimise(1, options.n_iters) #set lumi to 1 as already scaled when loading in
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

    #make nCat vs AMS plots
    Plotter.cats_vs_ams(cats, AMS, output_tag)

Пример #4

Показать файл

Файл: train_bdt.py Проект: Joe-W-Davies/HToEE

def main(options):
    """Train a BDT, or drive its hyper-parameter search, from a YAML config.

    Loads signal MC, background MC and data through ``ROOTHelpers``,
    optionally applies the pT reweighting, builds the training matrices with
    ``BDTHelpers`` and then does one of four things depending on the
    options: train and validate one hyper-parameter permutation with k-fold
    splitting (``hp_perm``), submit the hyper-parameter search
    (``opt_hps``), train with the best recorded parameters
    (``train_best``), or train with the default parameters.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_train``, ``hp_perm``, ``opt_hps``, ``train_best`` and
            ``k_folds``.

    Raises:
        Exception: if ``hp_perm`` is given together with ``opt_hps``.
        ValueError: if ``opt_hps`` is requested with ``k_folds`` below 2.
    """
    import os  # local import: used for os.remove below

    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects; prefer yaml.safe_load if the config is not trusted.
    # The handle is only needed for parsing, so it is closed straight away
    # instead of being held open for the whole training run.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    #data not needed yet, but still specify in the config for compatibility with constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#

    # With pT reweighting the samples are loaded with the control-region
    # selection; the nominal preselection is handed to apply_pt_rew below.
    if options.pt_reweight:
        cr_selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
        selection = cr_selection
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    #load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT (and maybe Njets), for each year separately. Note targeted selection
    # is applied here and all df's are resaved for smaller mem
    if options.pt_reweight and options.reload_samples:  #FIXME what about reading files in first time, wanting to pT rew, but not including options.reload samples? It wont reweight and save the reweighted df's
        root_obj.apply_pt_rew('DYMC', presel)

    #BDT stuff#

    #set up X, w and y, train-test
    bdt_hee = BDTHelpers(root_obj,
                         train_vars,
                         options.train_frac,
                         eq_train=options.eq_train)
    bdt_hee.create_X_and_y(mass_res_reweight=True)

    # every branch below reads or writes the same results file
    hp_results_file = '{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag)

    #run one HP permutation if requested (this is what the submitted jobs do)
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        elif options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )
        else:
            print(
                'About to train + validate on dataset with {} fold splitting'
                .format(options.k_folds))
            bdt_hee.set_hyper_parameters(options.hp_perm)
            bdt_hee.set_k_folds(options.k_folds)
            for i_fold in range(options.k_folds):
                bdt_hee.set_i_fold(i_fold)
                bdt_hee.train_classifier(root_obj.mc_dir, save=False)
                bdt_hee.validation_rocs.append(bdt_hee.compute_roc())
            # the with-statement closes the file; no explicit close() needed
            with open(hp_results_file, 'a+') as val_roc_file:
                bdt_hee.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        if options.k_folds < 2:
            raise ValueError('K-folds option must be at least 2')
        if path.isfile(hp_results_file):
            # os.remove instead of a shell 'rm' string: safe for paths that
            # contain spaces or shell metacharacters
            os.remove(hp_results_file)
            print('deleting: {}/bdt_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        bdt_hee.batch_gs_cv(k_folds=options.k_folds,
                            pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): '_best' is appended before the results file name is
        # built, so this reads bdt_hp_opt_<tag>_best.txt, while the hp_perm
        # branch above writes bdt_hp_opt_<tag>.txt -- confirm this is the
        # intended file.
        with open('{}/bdt_hp_opt_{}.txt'.format(mc_dir, output_tag),
                  'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        # parameters are the part of the last line before the first ';'
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        bdt_hee.set_hyper_parameters(best_params)
        bdt_hee.train_classifier(root_obj.mc_dir,
                                 save=True,
                                 model_name=output_tag)
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(
            output_tag,
            ratio_plot=True,
            norm_to_data=(not options.pt_reweight))

    #else just train BDT with default HPs
    else:
        bdt_hee.train_classifier(root_obj.mc_dir,
                                 save=True,
                                 model_name=output_tag + '_clf')
        bdt_hee.compute_roc()
        bdt_hee.plot_roc(output_tag)
        bdt_hee.plot_output_score(output_tag,
                                  ratio_plot=True,
                                  norm_to_data=(not options.pt_reweight),
                                  log=True)

Пример #5

Показать файл

Файл: bdt_category_opt.py Проект: Joe-W-Davies/HToEE

def main(options):
    """Optimise category boundaries on the BDT score, or evaluate a cut-based AMS.

    Loads signal MC and a background sample (background MC, or data when
    ``options.data_as_bkg`` is set) as described by the YAML file at
    ``options.config``, evaluates a pickled classifier on them and passes
    the weights, dielectron masses and predicted scores to ``CatOptim``.
    Finally ``Plotter.cats_vs_ams`` is called with the tested category
    counts and the resulting AMS values.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``reload_samples``, ``data_as_bkg``, ``model``,
            ``cut_based_str`` and ``n_iters``.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects; prefer yaml.safe_load if the config is not trusted.
    # The handle is only needed for parsing, so it is closed straight away.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #load the mc dataframe for all years
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, presel)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
    root_obj.concat()

    print('loading classifier: {}'.format(options.model))
    # Open the model file in a with-block so the handle is closed; the
    # previous pickle.load(open(...)) never closed it.
    # NOTE(review): unpickling executes code from the file -- only load
    # models from a trusted location.
    with open("{}".format(options.model), "rb") as model_file:
        clf = pickle.load(model_file)

    #apply cut-based selection if not optimising BDT score (pred probs are
    #still evaluated for compatibility with the existing CatOptim constructor)
    use_cut_based = len(options.cut_based_str) > 0
    if use_cut_based:
        root_obj.apply_more_cuts(options.cut_based_str)

    sig_weights = root_obj.mc_df_sig['weight'].values
    sig_m_ee = root_obj.mc_df_sig['dielectronMass'].values
    # keep only the second predict_proba column, flattened to 1D
    pred_prob_sig = clf.predict_proba(
        root_obj.mc_df_sig[train_vars].values)[:, 1:].ravel()

    # background comes from whichever sample was loaded above
    bkg_df = root_obj.data_df if options.data_as_bkg else root_obj.mc_df_bkg
    bkg_weights = bkg_df['weight'].values
    bkg_m_ee = bkg_df['dielectronMass'].values
    pred_prob_bkg = clf.predict_proba(
        bkg_df[train_vars].values)[:, 1:].ravel()

    #set up optimiser ranges and no. categories to test if non-cut based
    ranges = [[0.15, 1.]]
    names = ['{} score'.format(output_tag)]  #arbitrary
    print_str = ''
    cats = [1, 2, 3, 4, 5]
    AMS = []

    if use_cut_based:
        #zero categories: the optimiser object is only built to use its class methods
        optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                             bkg_weights, bkg_m_ee, [pred_prob_bkg], 0,
                             ranges, names)
        # NOTE(review): AMS is formatted as a single number below but is then
        # passed to cats_vs_ams alongside the list `cats` -- confirm the
        # plotting helper copes with that.
        AMS = optimiser.cutBasedAMS()
        print('String for cut based optimimastion: {}'.format(
            options.cut_based_str))
        print('Cut-based optimimsation gives AMS = {:1.8f}'.format(AMS))

    else:
        for n_cats in cats:
            optimiser = CatOptim(sig_weights, sig_m_ee, [pred_prob_sig],
                                 bkg_weights, bkg_m_ee, [pred_prob_bkg],
                                 n_cats, ranges, names)
            #set lumi to 1 as already scaled when loading in
            optimiser.optimise(1, options.n_iters)
            print_str += 'Results for {} categories : \n'.format(n_cats)
            print_str += optimiser.getPrintableResult()
            AMS.append(optimiser.bests.totSignif)
        print('\n {}'.format(print_str))

    #make nCat vs AMS plots
    Plotter.cats_vs_ams(cats, AMS, output_tag)

Пример #6

Показать файл

Файл: train_lstm.py Проект: Joe-W-Davies/HToEE

def main(options):
    """Train the LSTM-based DNN, or drive its hyper-parameter search.

    Loads signal MC, background MC and data through ``ROOTHelpers``,
    optionally applies the pT reweighting, prepares the inputs with
    ``LSTM_DNN`` and then does one of four things depending on the options:
    train one hyper-parameter permutation (``hp_perm``), submit the
    hyper-parameter search (``opt_hps``), train with the best recorded
    parameters (``train_best``), or train the default architecture.

    Args:
        options: parsed command-line options. Attributes read here are
            ``config``, ``pt_reweight``, ``reload_samples``, ``train_frac``,
            ``eq_weights``, ``batch_boost``, ``opt_hps``, ``hp_perm`` and
            ``train_best``.

    Raises:
        Exception: if ``hp_perm`` is given together with ``opt_hps``.
    """
    import os  # local import: used for os.remove below

    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects; prefer yaml.safe_load if the config is not trusted.
    # The handle is only needed for parsing, so it is closed straight away
    # instead of being held open for the whole training run.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    object_vars = config['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = config['event_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    #Data handling stuff#

    # With pT reweighting the samples are loaded with the control-region
    # selection; the nominal preselection is handed to apply_pt_rew below.
    if options.pt_reweight:
        cr_selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
        selection = cr_selection
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name,
                           flat_obj_vars + event_vars, vars_to_add,
                           selection)

    #load the dataframes for all years
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:  # for plotting
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    #reweight samples in bins of pT (and maybe Njets), for each year separately.
    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    #LSTM stuff#

    LSTM = LSTM_DNN(root_obj, object_vars, event_vars, options.train_frac,
                    options.eq_weights, options.batch_boost)

    # input preparation is skipped when this process only submits the HP search
    if not options.opt_hps:
        LSTM.var_transform(do_data=True)
        X_tot, y_tot = LSTM.create_X_y(mass_res_reweight=True)
        LSTM.split_X_y(X_tot, y_tot, do_data=True)

        # save=False is passed for HP-permutation runs; otherwise the
        # default of get_X_scaler applies
        if options.hp_perm is not None:
            LSTM.get_X_scaler(LSTM.all_vars_X_train,
                              out_tag=output_tag,
                              save=False)
        else:
            LSTM.get_X_scaler(LSTM.all_vars_X_train, out_tag=output_tag)
        LSTM.X_scale_train_test(do_data=True)
        LSTM.set_low_level_2D_test_train(do_data=True,
                                         ignore_train=options.batch_boost)

    # every branch below reads or writes the same results file
    hp_results_file = '{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag)

    #functions called in subbed job, if options.opt_hps was true
    if options.hp_perm is not None:
        if options.opt_hps and options.train_best:
            raise Exception(
                'Cannot optimise HPs and train best model. Run optimal training after hyper paramter optimisation'
            )
        elif options.opt_hps and options.hp_perm:
            raise Exception(
                'opt_hps option submits scripts with the hp_perm option; Cannot submit a script with both!'
            )
        else:
            LSTM.set_hyper_parameters(options.hp_perm)
            LSTM.model.summary()
            LSTM.train_w_batch_boost(out_tag=output_tag, save=False)
            # the with-statement closes the file; no explicit close() needed
            with open(hp_results_file, 'a+') as val_roc_file:
                LSTM.compare_rocs(val_roc_file, options.hp_perm)

    elif options.opt_hps:
        #FIXME: add warning that many jobs are about to be submitted
        if path.isfile(hp_results_file):
            # os.remove instead of a shell 'rm' string: safe for paths that
            # contain spaces or shell metacharacters
            os.remove(hp_results_file)
            print('deleting: {}/lstm_hp_opt_{}.txt'.format(
                mc_dir, output_tag))
        LSTM.batch_gs_cv(pt_rew=options.pt_reweight)

    elif options.train_best:
        output_tag += '_best'
        # NOTE(review): '_best' is appended before the results file name is
        # built, so this reads lstm_hp_opt_<tag>_best.txt, while the hp_perm
        # branch above writes lstm_hp_opt_<tag>.txt -- confirm this is the
        # intended file.
        with open('{}/lstm_hp_opt_{}.txt'.format(mc_dir, output_tag),
                  'r') as val_roc_file:
            hp_roc = val_roc_file.readlines()
        # parameters are the part of the last line before the first ';'
        best_params = hp_roc[-1].split(';')[0]
        print('Best classifier params are: {}'.format(best_params))
        LSTM.set_hyper_parameters(best_params)
        LSTM.model.summary()
        LSTM.train_w_batch_boost(out_tag=output_tag)
        #compute final roc on test set
        LSTM.compute_roc(batch_size=1024)
        LSTM.plot_roc(output_tag)
        LSTM.plot_output_score(output_tag,
                               batch_size=1024,
                               ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))

    #else train with basic parameters/architecture
    else:
        LSTM.model.summary()
        if options.batch_boost:  #type of model selection so need validation set
            #handles creating validation set and 2D vars and sequential saving
            LSTM.train_w_batch_boost(out_tag=output_tag)
        else:
            LSTM.train_network(epochs=5, batch_size=1024)
            LSTM.save_model(out_tag=output_tag)
        #compute final roc on test set
        LSTM.compute_roc(batch_size=1024)
        LSTM.plot_roc(output_tag)
        LSTM.plot_output_score(output_tag,
                               batch_size=1024,
                               ratio_plot=True,
                               norm_to_data=(not options.pt_reweight))

Пример #7

Показать файл

Файл: plot_inputs_normed.py Проект: Joe-W-Davies/HToEE

def main(options):
    """Draw normalised signal and background histograms for each training variable.

    The analysis YAML at ``options.config`` supplies sample locations, the
    variable lists and the selections. Signal and background simulation are
    loaded through ``ROOTHelpers`` and concatenated; every entry of
    ``train_vars`` is then histogrammed (weighted) for both samples and saved
    as a PDF under ``<cwd>/plotting/plots/<output_tag>/normed/``.

    Args:
        options: parsed command-line options. This function reads
            ``config``, ``pt_reweight`` and ``reload_samples``.
    """
    with open(options.config, 'r') as cfg_handle:
        # NOTE(review): yaml.load is called without an explicit Loader -- confirm the config is trusted.
        cfg = yaml.load(cfg_handle)
        output_tag = cfg['output_tag']

        mc_dir = cfg['mc_file_dir']
        mc_fnames = cfg['mc_file_names']

        # data is not plotted here, but the ROOTHelpers constructor still takes these
        data_dir = cfg['data_file_dir']
        data_fnames = cfg['data_file_names']

        train_vars = cfg['train_vars']
        vars_to_add = cfg['vars_to_add']
        presel = cfg['preselection']

        proc_to_tree_name = cfg['proc_to_tree_name']

        sig_colour = 'forestgreen'
        bkg_colour = 'violet'  # assigned but not used further down

        # ---- data handling ----
        # the reweighting control region replaces the preselection when pt reweighting is requested
        if options.pt_reweight:
            selection = cfg['reweight_cr']
            output_tag += '_pt_reweighted'
        else:
            selection = presel
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, train_vars,
                               vars_to_add, selection)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

        # ---- plotting ----
        # axis ranges per variable
        with open('plotting/var_to_xrange.yaml', 'r') as range_handle:
            range_cfg = yaml.load(range_handle)
            var_to_xrange = range_cfg['var_to_xrange']

        # axis titles per variable
        with open('plotting/var_to_xstring.yaml', 'r') as title_handle:
            title_cfg = yaml.load(title_handle)
            var_to_xstring = title_cfg['var_to_xstring']

        plotter = Plotter(root_obj, train_vars, sig_col=sig_colour, norm_to_data=True)
        for var in train_vars:
            fig = plt.figure(1)
            axes = fig.gca()

            sig_values = root_obj.mc_df_sig[var].values
            sig_w = root_obj.mc_df_sig['weight'].values
            bkg_values = root_obj.mc_df_bkg[var].values
            bkg_w = root_obj.mc_df_bkg['weight'].values

            x_low = var_to_xrange[var][0]
            x_high = var_to_xrange[var][1]
            bins = np.linspace(x_low, x_high, 56)

            # options common to the signal and background histograms
            shared_style = dict(bins=bins, histtype='stepfilled', alpha=0.4, normed=True)
            sig_label = plotter.sig_labels[0]+r' ($\mathrm{H}\rightarrow\mathrm{ee}$)'
            axes.hist(sig_values, label=sig_label, weights=sig_w, color='red', zorder=10, **shared_style)
            axes.hist(bkg_values, label='Simulated background', weights=bkg_w, color='blue', zorder=0, **shared_style)

            axes.set_ylabel('Arbitrary Units', ha='right', y=1, size=13)
            # leave 20% headroom above the tallest bin for the legend
            y_top = axes.get_ylim()[1]
            axes.set_ylim(bottom=0, top=1.2*y_top)
            axes.set_xlim(left=x_low, right=x_high)
            axes.legend(bbox_to_anchor=(0.97,0.97), ncol=1)
            plotter.plot_cms_labels(axes, lumi='')

            axes.set_xlabel('{}'.format(var_to_xstring[var]), ha='right', x=1, size=13)

            out_dir = '{}/plotting/plots/{}/normed/'.format(os.getcwd(), output_tag)
            Utils.check_dir(out_dir)
            out_file = '{0}/plotting/plots/{1}/normed/{1}_{2}_normalised.pdf'.format(os.getcwd(), output_tag, var)
            fig.savefig(out_file)
            plt.close()

Пример #8

Показать файл

def main(options):
    """Plot how variable shapes change across classifier-score windows.

    Loads signal and background simulation via ``ROOTHelpers``, evaluates the
    pickled classifier at ``options.model`` on both samples, and for every
    training variable (plus ``dielectronMass``) overlays one normalised
    signal histogram per score window defined by ``options.boundaries``.
    The background ``dielectronMass`` is plotted the same way to check that
    the score does not sculpt the mass. PDFs are written to
    ``<cwd>/plotting/plots/<output_tag>_sig_bkg_evo/``.

    Args:
        options: parsed command-line options. This function reads
            ``config``, ``pt_reweight``, ``reload_samples``, ``model``,
            ``boundaries`` and ``n_bins``.
    """
    with open(options.config, 'r') as config_file:
        # NOTE(review): yaml.load without an explicit Loader can build arbitrary
        # objects; confirm the config is trusted or switch to a safe loader.
        config            = yaml.load(config_file)
        output_tag        = config['output_tag']

        mc_dir            = config['mc_file_dir']
        mc_fnames         = config['mc_file_names']

        #data not needed yet, could use this for validation later. keep for compat with class
        data_dir          = config['data_file_dir']
        data_fnames       = config['data_file_names']

        train_vars        = config['train_vars']
        vars_to_add       = config['vars_to_add']
        presel            = config['preselection']

        proc_to_tree_name = config['proc_to_tree_name']
        colours           = ['#d7191c', '#fdae61', '#f2f229', '#abdda4', '#2b83ba']

                                           #Data handling stuff#

        #load the mc dataframe for all years
        if options.pt_reweight:
            cr_selection = config['reweight_cr']
            output_tag += '_pt_reweighted'
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, cr_selection)
        else:
            root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir, data_fnames, proc_to_tree_name, train_vars, vars_to_add, presel)

        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True, reload_samples=options.reload_samples)
        root_obj.concat()

        if options.pt_reweight and options.reload_samples:
            for year in root_obj.years:
                root_obj.pt_reweight('DYMC', year, presel)

                                            #Plotter stuff#

        #add model predictions to sig and bkg dataframes
        print('loading classifier: {}'.format(options.model))
        # NOTE(review): unpickling executes arbitrary code; only load trusted model files.
        # The context manager closes the handle that the bare open() used to leak.
        with open("{}".format(options.model), "rb") as model_file:
            clf = pickle.load(model_file)
        sig_df = root_obj.mc_df_sig
        sig_df['bdt_score'] = clf.predict_proba(sig_df[train_vars].values)[:,1:].ravel()
        bkg_df = root_obj.mc_df_bkg
        bkg_df['bdt_score'] = clf.predict_proba(bkg_df[train_vars].values)[:,1:].ravel()

        plotter  = Plotter(root_obj, train_vars)
        #for VBF, good set is: [0.30 0.50 0.70 0.80 0.90 1.0]
        #for ggH, good set is: [0.10 0.30 0.45 0.53 0.60 0.8]
        bdt_bins = np.array(options.boundaries)
        #consecutive (low, high) score edges, one pair per histogram
        score_windows = list(zip(bdt_bins[:-1], bdt_bins[1:]))
        Utils.check_dir('{}/plotting/plots/{}_sig_bkg_evo'.format(os.getcwd(), output_tag))

        def _draw_score_windows(axes, df, var, var_bins):
            #overlay one normalised, weighted histogram of `var` per score window
            for i_window, (low, high) in enumerate(score_windows):
                #build the window mask once and reuse it for values and weights
                in_window = np.logical_and(df['bdt_score'] > low, df['bdt_score'] < high)
                selected = df[in_window]
                #cycle through the palette so more than len(colours) windows cannot raise IndexError
                axes.hist(selected[var], bins=var_bins, label='{:.2f} $<$ MVA $<$ {:.2f}'.format(low, high), weights=selected['weight'], histtype='step', color=colours[i_window % len(colours)], normed=True)

        for var in train_vars+['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            _draw_score_windows(axes, sig_df, var, var_bins)
            annotate_and_save(axes, plotter, var)
            fig.savefig('{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var))
            print('saving: {0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()

        #plot background (check mass is not being sculpted)
        for var in ['dielectronMass']:
            fig  = plt.figure(1)
            axes = fig.gca()
            var_bins = np.linspace(plotter.var_to_xrange[var][0], plotter.var_to_xrange[var][1], options.n_bins)
            _draw_score_windows(axes, bkg_df, var, var_bins)

            annotate_and_save(axes, plotter, var)
            fig.savefig('{0}/plotting/plots/{1}_sig_bkg_evo/{1}_{2}_bkg.pdf'.format(os.getcwd(), output_tag, var))
            plt.close()

Пример #9

Показать файл

def main(options):
    """Produce data-vs-simulation plots, with systematic bands, per variable.

    Reads the analysis YAML at ``options.config``, loads background
    simulation and data through ``ROOTHelpers`` (with ``read_systs=True``),
    hands them to ``DYPlotter`` and draws a two-panel figure for each
    requested variable. When ``options.var_name`` is ``None`` all training
    variables plus the ``<proc>_mva`` score are plotted; otherwise only the
    named variable is. PDFs go to ``<cwd>/plotting/plots/<output_tag>/``.

    Args:
        options: parsed command-line options. This function reads
            ``config``, ``reload_samples``, ``systematics``, ``var_name``,
            ``mva_config`` and ``n_bins``.
    """
    with open(options.config, 'r') as config_file:
        # NOTE(review): yaml.load without an explicit Loader can build arbitrary
        # objects; confirm the config is trusted or switch to a safe loader.
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        #check if dnn (lstm) variables need to be read in
        varrs = config['train_vars']
        all_train_vars = []
        if isinstance(varrs, dict):
            object_vars = varrs['object_vars']
            flat_obj_vars = [
                var for i_object in object_vars for var in i_object
            ]
            event_vars = varrs['event_vars']
            all_train_vars += (flat_obj_vars + event_vars)
        else:
            all_train_vars = varrs

        vars_to_add = config['vars_to_add']
        presel = config['preselection']
        cut_map = config['cut_map']

        #Data handling stuff#

        #get the dataframe for all years. Do not apply any specific preselection to sim samples
        root_obj = ROOTHelpers(output_tag,
                               mc_dir,
                               mc_fnames,
                               data_dir,
                               data_fnames,
                               proc_to_tree_name,
                               all_train_vars,
                               vars_to_add,
                               presel,
                               read_systs=True)

        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
        root_obj.concat()

        #--------------------------------------------------------------------------------------------------

        dy_plotter = DYPlotter(root_obj, cut_map)
        #FIXME: samples read in for the first time without the reload flag are not re-weighted
        if options.reload_samples:
            dy_plotter.pt_reweight()
        #FIXME still need this else dont remove variables again!
        dy_plotter.manage_memory(options.systematics,
                                 save=options.reload_samples)

        #DEBUG
        print('Background columns')
        print(root_obj.mc_df_bkg.columns[:])

        #label fixed: these are the data columns, not the background ones
        print('Data columns')
        print(root_obj.data_df.columns[:])

        #little bit hard coded - be careful if 'mva' not in MVA output name
        if (options.var_name is None) or ('mva' in options.var_name.lower()):
            dy_plotter.eval_mva(options.mva_config, output_tag)
        #--------------------------------------------------------------------------------------------------

        with open('plotting/var_to_xrange.yaml', 'r') as plot_config_file:
            plot_config = yaml.load(plot_config_file)
            var_to_xrange = plot_config['var_to_xrange']

        with open('plotting/var_to_xstring.yaml', 'r') as plot_config_file:
            plot_string_cfg = yaml.load(plot_config_file)
            var_to_xstring = plot_string_cfg['var_to_xstring']

        #both branches must bind the same name; the original else-branch bound
        #`var_to_plot`, so running without --var_name raised NameError below
        if options.var_name is not None:
            vars_to_plot = [options.var_name]
        else:
            vars_to_plot = all_train_vars + [dy_plotter.proc + '_mva']

        for var in vars_to_plot:

            if 'mva' in var:
                var_bins = np.linspace(0, 1, options.n_bins)
            else:
                var_bins = np.linspace(var_to_xrange[var][0],
                                       var_to_xrange[var][1], options.n_bins)
            print('plotting var: {}'.format(var))
            fig, axes = plt.subplots(nrows=2,
                                     ncols=1,
                                     dpi=200,
                                     sharex=True,
                                     gridspec_kw={
                                         'height_ratios': [3, 0.8],
                                         'hspace': 0.08
                                     })

            cut_str = dy_plotter.get_cut_string(var)

            #data stuff
            data_binned, bin_centres, data_stat_down_up = dy_plotter.plot_data(
                cut_str, axes, var, var_bins)
            dy_plotter.plot_bkgs(cut_str, axes, var, var_bins, data_binned,
                                 bin_centres, data_stat_down_up)

            #syst stuff
            #test the variable being drawn, not options.var_name: identical when a
            #single variable is requested, and no TypeError when var_name is None
            dy_plotter.plot_systematics(cut_str,
                                        axes,
                                        var,
                                        var_bins,
                                        options.systematics,
                                        do_mva=('mva' in var))

            axes = dy_plotter.set_canv_style(axes, var, var_bins)
            axes[0].legend(bbox_to_anchor=(0.97, 0.97), ncol=1)
            axes[1].set_xlabel(var_to_xstring[var], size=14, ha='right', x=1)
            Utils.check_dir('{}/plotting/plots/{}'.format(
                os.getcwd(), output_tag))
            #save into the directory just created instead of a hard-coded user path
            fig.savefig('{0}/plotting/plots/{1}/{1}_{2}.pdf'.format(
                os.getcwd(), output_tag, var))
            #release the figure; otherwise one 200-dpi figure per variable stays open
            plt.close(fig)

Пример #10

Показать файл

def main(options):
    """Assign VBF/ggH analysis tags to events and write one ROOT tree per category.

    Steps:
      1. Read the analysis YAML at ``options.config`` and load signal
         simulation plus either data (``options.data_as_bkg``) or background
         simulation through ``ROOTHelpers``.
      2. Evaluate every pickled classifier listed under ``models`` in
         ``options.bdt_config`` and store its score as ``<proc>_bdt``.
      3. For each process in the tag sequence, turn the score boundaries
         from the same config into an integer ``<proc>_analysis_tag``
         (``-999`` when no tag applies).
      4. Resolve events tagged by both processes in favour of VBF
         (``priority_tag``).
      5. Write ``dZ``, ``CMS_hgg_mass`` and ``weight`` for each category to
         ``output_trees/<tree name>.root``.

    Args:
        options: parsed command-line options. This function reads
            ``config``, ``reload_samples``, ``data_as_bkg`` and
            ``bdt_config``.
    """
    with open(options.config, 'r') as config_file:
        # NOTE(review): yaml.load without an explicit Loader can build arbitrary
        # objects; confirm the config is trusted or switch to a safe loader.
        config = yaml.load(config_file)
        output_tag = config['output_tag']

        mc_dir = config['mc_file_dir']
        mc_fnames = config['mc_file_names']

        #data not needed yet, but still specify in the config for compatibility with constructor
        data_dir = config['data_file_dir']
        data_fnames = config['data_file_names']

        proc_to_tree_name = config['proc_to_tree_name']

        proc_to_train_vars = config['train_vars']
        all_train_vars = [
            item for sublist in proc_to_train_vars.values() for item in sublist
        ]

        vars_to_add = config['vars_to_add']

        #Data handling stuff#
        #apply loosest selection (ggh) first, else memory requirements are ridiculous. Fine to do this
        #since all cuts are looser than VBF (not removing events with higher priority)
        loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

        #load the mc dataframe for all years. Do not apply any specific preselection
        root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                               data_fnames, proc_to_tree_name, all_train_vars,
                               vars_to_add, loosest_selection)
        root_obj.no_lumi_scale()
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
        if options.data_as_bkg:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
        else:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        root_obj.concat()

    #Tag sequence stuff#
    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    del root_obj

    #decide sequence of tags and specify preselection for use with numpy.select:
    tag_sequence = ['VBF', 'ggH']
    proc_to_preselection = {
        'VBF': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
            & combined_df['dijetMass'].gt(350)
            & combined_df['leadJetPt'].gt(40)
            & combined_df['subleadJetPt'].gt(30)
        ],
        'ggH': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
        ]
    }

    with open(options.bdt_config, 'r') as bdt_config_file:
        config = yaml.load(bdt_config_file)
        proc_to_model = config['models']
        proc_to_tags = config['boundaries']

    #evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.items():
        print('evaluating classifier: {}'.format(model))
        # NOTE(review): unpickling executes arbitrary code; only load trusted model files.
        # The context manager closes the handle that the bare open() used to leak.
        with open('models/{}'.format(model), "rb") as model_file:
            clf = pickle.load(model_file)
        train_vars = proc_to_train_vars[proc]
        combined_df[proc + '_bdt'] = clf.predict_proba(
            combined_df[train_vars].values)[:, 1:].ravel()

    # TAG NUMBER #

    #decide on tag
    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        #materialise as a list so the bounds can be indexed on any Python version
        # NOTE(review): the order of the bounds follows the mapping's iteration
        # order -- confirm the config yields them from highest to lowest.
        tag_bounds = list(proc_to_tags[proc].values())
        score = combined_df['{}_bdt'.format(proc)]
        tag_masks = []
        for i_bound, lower_edge in enumerate(tag_bounds):
            if i_bound == 0:  #first bound, tag 0: only a lower edge
                tag_masks.append(presel[0] & score.gt(lower_edge))
            else:  #intermediate bound: between the previous bound and this one
                tag_masks.append(presel[0]
                                 & score.lt(tag_bounds[i_bound - 1])
                                 & score.gt(lower_edge))

        mask_key = list(range(len(tag_bounds)))

        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, mask_key, default=-999)

    # PROC PRIORITY #

    # deduce tag priority: if two or more tags satisfied then set final tag to highest priority tag.
    # TODO: make this non hardcoded, i.e. compare each proc to all lower priority positions
    tag_priority_filter = [
        combined_df['VBF_analysis_tag'].ne(-999)
        & combined_df['ggH_analysis_tag'].ne(-999),  # 1) if both filled...
        combined_df['VBF_analysis_tag'].ne(-999)
        & combined_df['ggH_analysis_tag'].eq(
            -999),  # 2) if VBF filled and ggH not, take VBF
        combined_df['VBF_analysis_tag'].eq(-999)
        & combined_df['ggH_analysis_tag'].ne(
            -999),  # 3) if ggH filled and VBF not, take ggH
    ]

    tag_priority_key = [
        'VBF',  #1) take VBF
        'VBF',  #2) take VBF
        'ggH',  #3) take ggH
    ]
    #plain column name: the previous .format(proc) had no placeholder to fill
    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_priority_key,
        default='NOTAG')  # events with no tag at all

    # FILL TREES BASED ON BOTH OF ABOVE
    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = 0.0
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    #get tree names
    branch_names = {}
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:  #for all events that got the proc tag, which tag did they fall into?
            n_tags = len(proc_to_tags[target_proc])
            for i_tag in range(n_tags):
                #compare by value: `is not 'Data'` tested object identity, which
                #only worked through CPython string interning
                if true_proc != 'Data':
                    branch_names[true_proc].append(
                        '{}_125_13TeV_{}cat{}'.format(
                            true_proc.lower(), target_proc.lower(), i_tag))
                else:
                    branch_names[true_proc].append(
                        '{}_13TeV_{}cat{}'.format(true_proc,
                                                  target_proc.lower(),
                                                  i_tag))

    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]
    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print(combined_df[debug_vars + ['tree_name']])

    if not path.isdir('output_trees/'):
        print('making directory: {}'.format('output_trees/'))
        system('mkdir -p %s' % 'output_trees/')

    #have to save individual trees then hadd procs together on the command line.
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print(bn)
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print(branch_selected_df[debug_vars + ['tree_name']].head(20))
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print('')

Пример #11

Показать файл

def main(options):
    """Run the tagging workflow for one year of samples via ``taggerBase``.

    Reads the analysis YAML at ``options.config``, loads signal simulation
    plus either background simulation or data (``options.data_as_bkg``)
    through ``ROOTHelpers``, evaluates the classifiers listed in
    ``options.mva_config`` and lets a ``taggerBase`` object decide tags,
    priorities and tree names before filling the output trees.

    Args:
        options: parsed command-line options. This function reads
            ``config``, ``syst_name``, ``dump_weight_systs``, ``data_only``,
            ``data_as_bkg``, ``reload_samples`` and ``mva_config``.

    Raises:
        IOError: if tree systematics and weight variations are requested
            together, if systematics are requested with ``data_only``, if
            more than one year was read in, or if a model entry in the MVA
            config is neither a dict nor a str.
    """
    with open(options.config, 'r') as cfg_file:
        # NOTE(review): yaml.load is called without an explicit Loader -- confirm the config is trusted.
        cfg = yaml.load(cfg_file)
        output_tag = cfg['output_tag']

        mc_dir = cfg['mc_file_dir']
        mc_fnames = cfg['mc_file_names']

        # data locations are always passed on to the ROOTHelpers constructor
        data_dir = cfg['data_file_dir']
        data_fnames = cfg['data_file_names']

        proc_to_tree_name = cfg['proc_to_tree_name']

        # BDT entries are flat lists; DNN (lstm) entries are dicts with
        # 'object_vars' (nested) and 'event_vars'
        proc_to_train_vars = cfg['train_vars']
        all_train_vars = []
        for proc, varrs in proc_to_train_vars.iteritems():
            if not isinstance(varrs, dict):
                all_train_vars.extend(varrs)
                continue
            nested_obj_vars = proc_to_train_vars[proc]['object_vars']
            flattened = [name for group in nested_obj_vars for name in group]
            per_event = proc_to_train_vars[proc]['event_vars']
            all_train_vars.extend(flattened + per_event)

        vars_to_add = cfg['vars_to_add']

        # a named systematic means tree systematics have to be read
        read_syst = options.syst_name is not None

        if read_syst and options.dump_weight_systs:
            raise IOError(
                'Cannot dump weight variations and tree systematics at the same time. Please run separately for each.'
            )
        if options.data_only and (read_syst or options.dump_weight_systs):
            raise IOError('Cannot read Data and apply sysetmatic shifts')

        # Data handling:
        # only a loose mass cut is applied up front to keep memory down; tighter
        # cuts cannot be applied here because those variables move with systematics.
        # MC is normalised before this cut; for data it is applied on read-in.
        loosest_selection = 'dielectronMass > 100'

        # load the dataframes; no process-specific preselection on simulation
        with_systs = (read_syst or options.dump_weight_systs)
        root_obj = ROOTHelpers(output_tag,
                               mc_dir,
                               mc_fnames,
                               data_dir,
                               data_fnames,
                               proc_to_tree_name,
                               all_train_vars,
                               vars_to_add,
                               loosest_selection,
                               read_systs=with_systs)
        root_obj.no_lumi_scale()
        for sig_obj in root_obj.sig_objects:
            root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)

        if options.data_as_bkg:
            for data_obj in root_obj.data_objects:
                root_obj.load_data(data_obj,
                                   reload_samples=options.reload_samples)
            # data stands in for the background attribute, for compat with DNN class
            root_obj.mc_df_bkg = root_obj.data_df
        else:
            for bkg_obj in root_obj.bkg_objects:
                root_obj.load_mc(bkg_obj,
                                 bkg=True,
                                 reload_samples=options.reload_samples)
        root_obj.concat()

        # tagging is done one year at a time; refuse to continue otherwise
        if len(root_obj.years) != 1:
            raise IOError(
                'Reading in more than one year at a time! Tagging should be split by year'
            )
        year = list(root_obj.years)[0]
        if ("2016" in year) and (not options.data_only):
            # FIXME: check this is actually called
            root_obj.scale_sig_partial_2016()

    # background is always included: the DNN set-up needs a bkg class
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # Tag sequence:
    tag_sequence = ['VBF', 'ggH']  # categories targeted
    if options.data_only:
        # data on its own (for memory really)
        true_procs = ['Data']
    else:
        true_procs = ['VBF', 'ggH', 'ttH']  # procs to run through cats
        if not (read_syst or options.dump_weight_systs):
            # mc and data may be run together in a stat-only config
            true_procs.append('Data')

    tag_obj = taggerBase(tag_sequence,
                         true_procs,
                         combined_df,
                         syst_name=options.syst_name)
    if read_syst:
        # not run if reading weight systematics
        tag_obj.relabel_syst_vars()

    # models and tag boundaries come from the MVA config
    with open(options.mva_config, 'r') as mva_cfg_file:
        mva_cfg = yaml.load(mva_cfg_file)
        proc_to_model = mva_cfg['models']
        tag_boundaries = mva_cfg['boundaries']

        # evaluate MVA scores used in categorisation
        for proc, model in proc_to_model.iteritems():
            # DNN models are described by a dict, BDT models by a file name string
            if isinstance(model, dict):
                object_vars = proc_to_train_vars[proc]['object_vars']
                flat_obj_vars = [
                    name for group in object_vars for name in group
                ]
                event_vars = proc_to_train_vars[proc]['event_vars']

                dnn_loaded = tag_obj.load_dnn(proc, model)
                train_tag = model['architecture'].split('_model')[0]
                tag_obj.eval_lstm(dnn_loaded, train_tag, root_obj, proc,
                                  object_vars, flat_obj_vars, event_vars)
                continue

            if isinstance(model, str):
                tag_obj.eval_bdt(proc, model, proc_to_train_vars[proc])
                continue

            raise IOError(
                'Did not get a classifier models in correct format in config'
            )

    del root_obj

    # done after evaluating the MVAs: the LSTM class used in eval_lstm needs
    # some Data in the df for its constructor
    if (read_syst or options.dump_weight_systs):
        not_data = tag_obj.combined_df.proc != 'Data'
        tag_obj.combined_df = tag_obj.combined_df[not_data].copy()  # avoid copy warnings later
    tag_preselection = tag_obj.get_tag_preselection()

    # set up tag boundaries for each process being targeted
    tag_obj.decide_tag(tag_preselection, tag_boundaries)
    tag_obj.decide_priority()
    branch_names = tag_obj.get_tree_names(tag_boundaries, year)
    tag_obj.set_tree_names(tag_boundaries, options.dump_weight_systs, year)
    tag_obj.fill_trees(branch_names, year, print_yields=not read_syst)
    # tag_obj.plot_matrix(branch_names, output_tag) is disabled for the
    # non-systematics case (struct error?)

Пример #12

Показать файл

def main(options):
    """Plot ``options.mass_var_name`` in each MVA category for MC and data.

    Reads the sample and variable configuration from ``options.config``,
    loads signal MC, background MC and data through ``ROOTHelpers``,
    evaluates the pickled BDT listed for ``options.mva_proc`` in
    ``options.mva_config`` on all three dataframes, and finally calls
    ``Plotter.plot_input`` once per configured category boundary.

    Args:
        options: parsed command-line options. Attributes read here:
            ``config``, ``mva_config``, ``mva_proc``, ``pt_reweight``,
            ``reload_samples``, ``mass_var_name``, ``n_bins`` and
            ``ratio_plot``.

    Raises:
        IOError: if the model entry for ``options.mva_proc`` is not a
            string (only BDTs are handled here; DNN support is still to be
            added).
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects. Only feed it trusted config files, or move
    # to yaml.safe_load once the configs are confirmed to be plain YAML.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    # Everything needed from the config is pulled out up front, so the file
    # does not have to stay open while samples are loaded and plotted.
    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    train_vars = config['train_vars']
    vars_to_add = config['vars_to_add']
    presel = config['preselection']

    proc_to_tree_name = config['proc_to_tree_name']

    sig_colour = 'red'

    # Data handling stuff #

    # With pT reweighting the samples are loaded with the 'reweight_cr'
    # selection from the config instead of the nominal preselection.
    if options.pt_reweight:
        selection = config['reweight_cr']
        output_tag += '_pt_reweighted'
    else:
        selection = presel
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, train_vars,
                           vars_to_add, selection)

    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    for bkg_obj in root_obj.bkg_objects:
        root_obj.load_mc(bkg_obj,
                         bkg=True,
                         reload_samples=options.reload_samples)
    for data_obj in root_obj.data_objects:
        root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    root_obj.concat()

    if options.pt_reweight and options.reload_samples:
        root_obj.apply_pt_rew('DYMC', presel)

    # Load MVA #

    with open(options.mva_config, 'r') as mva_config_file:
        mva_config = yaml.load(mva_config_file)
    model = mva_config['models'][options.mva_proc]
    boundaries = mva_config['boundaries'][options.mva_proc]

    # add DNN later
    if not isinstance(model, str):
        raise IOError(
            'Did not get a classifier models in correct format in config')

    print('evaluating BDT: {}'.format(model))
    # NOTE(review): unpickling executes code from the file; models/ must
    # only contain trusted files.
    with open('models/{}'.format(model), 'rb') as model_file:
        clf = pickle.load(model_file)

    # Store the second column of predict_proba as '<proc>_mva' on each of
    # the signal, background and data dataframes.
    mva_column = options.mva_proc + '_mva'
    for df_name in ('mc_df_sig', 'mc_df_bkg', 'data_df'):
        df = getattr(root_obj, df_name)
        df[mva_column] = clf.predict_proba(
            df[train_vars].values)[:, 1:].ravel()

    # Plotter stuff #

    plotter = Plotter(root_obj,
                      train_vars,
                      sig_col=sig_colour,
                      norm_to_data=True)

    # One plot per boundary: category 0 is everything above 'tag_0',
    # category i (i > 0) lies between 'tag_<i-1>' and 'tag_<i>'.
    for cat_counter, _ in enumerate(boundaries):
        if cat_counter == 0:
            extra_cuts = options.mva_proc + '_mva >' + str(
                boundaries['tag_0'])
        else:
            below_previous = options.mva_proc + '_mva <' + str(
                boundaries['tag_' + str(cat_counter - 1)])
            above_current = options.mva_proc + '_mva >' + str(
                boundaries['tag_' + str(cat_counter)])
            extra_cuts = below_previous + ' and ' + above_current
        plotter.plot_input(options.mass_var_name,
                           options.n_bins,
                           output_tag,
                           options.ratio_plot,
                           norm_to_data=True,
                           extra_cuts=extra_cuts,
                           extra_tag=cat_counter,
                           blind=True)

Пример #13

Показать файл

Файл: make_tag_sequence_w_lstm.py Проект: edjtscott/HToEE

def main(options):
    """Assign analysis tags to events and write one ROOT tree per category.

    Steps, in order:
      1. read the sample and variable configuration from ``options.config``
         and load signal MC plus either background MC or, when
         ``options.data_as_bkg`` is set, data through ``ROOTHelpers``;
      2. evaluate the pickled ggH classifier and the VBF Keras model named
         in ``options.mva_config`` on the concatenated dataframe;
      3. convert the scores into per-process tag numbers using the
         configured boundaries, then resolve overlaps (VBF takes priority
         over ggH);
      4. label each event with a tree name via ``assign_tree`` and write
         each tree to ``output_trees/<tree name>.root``.

    Args:
        options: parsed command-line options. Attributes read here:
            ``config``, ``mva_config``, ``reload_samples`` and
            ``data_as_bkg``.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects. Only feed it trusted config files, or move
    # to yaml.safe_load once the configs are confirmed to be plain YAML.
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    object_vars = proc_to_train_vars['VBF']['object_vars']
    flat_obj_vars = [var for i_object in object_vars for var in i_object]
    event_vars = proc_to_train_vars['VBF']['event_vars']

    # used to check all vars we need for categorisation are in our dfs
    all_train_vars = proc_to_train_vars['ggH'] + flat_obj_vars + event_vars

    vars_to_add = config['vars_to_add']

    # Data handling stuff #
    # Apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do since all cuts are looser than VBF (no events
    # with higher priority are removed).
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific
    # preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if not options.data_as_bkg:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj,
                             bkg=True,
                             reload_samples=options.reload_samples)
    else:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj,
                               reload_samples=options.reload_samples)
        # overwrite background attribute, for compat with DNN class
        root_obj.mc_df_bkg = root_obj.data_df
    root_obj.concat()

    # Tag sequence stuff #
    # NOTE: these must be concatted in the same way they are concatted in
    # LSTM.create_X_y(), else predicts are misaligned
    combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])

    # decide sequence of tags and build the boolean preselection mask of
    # each process for use with numpy.select. The ggH cuts are a subset of
    # the VBF ones, so the shared part is built once.
    tag_sequence = ['VBF', 'ggH']
    dielectron_presel = (combined_df['dielectronMass'].gt(110)
                         & combined_df['dielectronMass'].lt(150)
                         & combined_df['leadElectronPToM'].gt(0.333)
                         & combined_df['subleadElectronPToM'].gt(0.25))
    proc_to_preselection = {
        'VBF': (dielectron_presel
                & combined_df['dijetMass'].gt(350)
                & combined_df['leadJetPt'].gt(40)
                & combined_df['subleadJetPt'].gt(30)),
        'ggH': dielectron_presel,
    }

    # GET MVA SCORES #

    with open(options.mva_config, 'r') as mva_config_file:
        config = yaml.load(mva_config_file)
        proc_to_model = config['models']
        proc_to_tags = config['boundaries']

        # evaluate ggH BDT scores
        print('evaluating ggH classifier: {}'.format(proc_to_model['ggH']))
        # NOTE(review): unpickling executes code from the file; models/
        # must only contain trusted files.
        with open('models/{}'.format(proc_to_model['ggH']),
                  'rb') as clf_file:
            clf = pickle.load(clf_file)
        train_vars = proc_to_train_vars['ggH']
        combined_df['ggH_mva'] = clf.predict_proba(
            combined_df[train_vars].values)[:, 1:].ravel()

        # Evaluate VBF LSTM
        print('loading VBF DNN:')
        with open('models/{}'.format(proc_to_model['VBF']['architecture']),
                  'r') as model_json:
            model_architecture = model_json.read()
        model = keras.models.model_from_json(model_architecture)
        model.load_weights('models/{}'.format(proc_to_model['VBF']['model']))

        LSTM = LSTM_DNN(root_obj, object_vars, event_vars, 1.0, False, True)

        # set up X and y Matrices. Log variables that have GeV units
        LSTM.var_transform(do_data=False)
        X_tot, y_tot = LSTM.create_X_y()
        X_tot = X_tot[flat_obj_vars + event_vars]  # filter unused vars
        print(np.isnan(X_tot).any())

        # scale X_vars to mean=0 and std=1. Use scaler fit during previous
        # dnn training
        LSTM.load_X_scaler(out_tag='VBF_DNN')
        X_tot = LSTM.X_scaler.transform(X_tot)

        # make 2D vars for LSTM layers
        X_tot = pd.DataFrame(X_tot, columns=flat_obj_vars + event_vars)
        X_tot_high_level = X_tot[event_vars].values
        X_tot_low_level = LSTM.join_objects(X_tot[flat_obj_vars])

        # predict probs. These line up with combined_df only because
        # LSTM.create_X_y() is expected to concatenate signal and
        # background in the same order as combined_df above.
        combined_df['VBF_mva'] = model.predict(
            [X_tot_high_level, X_tot_low_level], batch_size=1).flatten()

        # TAG NUMBER #

        # decide on tag
        for proc in tag_sequence:
            presel_mask = proc_to_preselection[proc]
            # NOTE(review): relies on the boundaries mapping iterating from
            # the first (tightest) tag to the last; that order is not
            # guaranteed for a plain dict on Python 2 - TODO confirm.
            tag_bounds = list(proc_to_tags[proc].values())
            mva_score = combined_df['{}_mva'.format(proc)]
            tag_masks = []
            for i_bound, bound in enumerate(tag_bounds):
                if i_bound == 0:  # first bound, tag 0
                    tag_masks.append(presel_mask & mva_score.gt(bound))
                else:  # intermed bound: between previous and this bound
                    tag_masks.append(presel_mask
                                     & mva_score.lt(tag_bounds[i_bound - 1])
                                     & mva_score.gt(bound))

            mask_key = list(range(len(tag_bounds)))

            # events passing none of the masks get the sentinel -999
            combined_df['{}_analysis_tag'.format(proc)] = np.select(
                tag_masks, mask_key, default=-999)

        # PROC PRIORITY #

        # deduce tag priority: if two or more tags are satisfied then set
        # the final tag to the highest priority tag. TODO: make this non
        # hardcoded, i.e. compare the proc in position 1 to all lower
        # priority positions, then the proc in position 2, ...
        vbf_tagged = combined_df['VBF_analysis_tag'].ne(-999)
        ggh_tagged = combined_df['ggH_analysis_tag'].ne(-999)
        tag_priority_filter = [
            vbf_tagged & ggh_tagged,  # 1) if both filled...
            vbf_tagged & combined_df['ggH_analysis_tag'].eq(
                -999),  # 2) if VBF filled and ggH not, take VBF
            combined_df['VBF_analysis_tag'].eq(-999)
            & ggh_tagged,  # 3) if ggH filled and VBF not, take ggH
        ]

        tag_priority_key = [
            'VBF',  # 1) take VBF
            'VBF',  # 2) take VBF
            'ggH',  # 3) take ggH
        ]
        combined_df['priority_tag'] = np.select(
            tag_priority_filter, tag_priority_key,
            default='NOTAG')  # neither tag filled

        # FILL TREES BASED ON BOTH OF ABOVE
        tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
        combined_df['dZ'] = 0.0
        combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

        # get tree names: one per (true process, target process, tag index)
        branch_names = {}
        for true_proc in tag_sequence + ['Data']:
            branch_names[true_proc] = []
            for target_proc in tag_sequence:
                for i_tag in range(len(proc_to_tags[target_proc])):
                    # compare by value: identity checks against a string
                    # literal only work by accident of interning
                    if true_proc != 'Data':
                        branch_names[true_proc].append(
                            '{}_125_13TeV_{}cat{}'.format(
                                true_proc.lower(), target_proc.lower(), i_tag))
                    else:
                        branch_names[true_proc].append(
                            '{}_13TeV_{}cat{}'.format(true_proc,
                                                      target_proc.lower(),
                                                      i_tag))

        debug_vars = [
            'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
        ]
        # FIXME: row-wise apply loops over events in Python; vectorise with
        # numpy eventually
        combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
        print(combined_df[debug_vars + ['tree_name']])

        if not path.isdir('output_trees/'):
            print('making directory: {}'.format('output_trees/'))
            system('mkdir -p %s' % 'output_trees/')

        # have to save individual trees then hadd procs together on the
        # command line.
        for proc in tag_sequence + ['Data']:
            selected_df = combined_df[combined_df.proc == proc]
            for bn in branch_names[proc]:
                print(bn)
                branch_selected_df = selected_df[selected_df.tree_name == bn]
                print(branch_selected_df[debug_vars + ['tree_name']].head(20))
                root_pandas.to_root(branch_selected_df[tree_vars],
                                    'output_trees/{}.root'.format(bn),
                                    key=bn)
                print('')