Пример #1
0
# Script fragment: count signal / background / data events for the
# DoubleHTag_0 trees and load a previously trained classifier from disk.
# `branch_names`, `extra_branches`, `preprocessing`, `os`, `pklfolder` and
# `mass_range` are expected to be defined earlier in the file (not visible
# in this fragment).

# Strip stray whitespace around every branch name.
branch_names = [c.strip() for c in branch_names]
print branch_names

import pandas as pd

import root_pandas as rpd

# no need to shuffle here, we just count events
preprocessing.set_signals_and_backgrounds(
    "tagsDumper/trees/bbggtrees_13TeV_DoubleHTag_0",
    branch_names + extra_branches,
    shuffle=False)
# Features (X), labels (y) and weights for background and signal.
X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.set_variables(
    branch_names)

# Same triple for data, then cleaned by the single-dataset helper.
X_data, y_data, weights_data = preprocessing.set_data(
    "tagsDumper/trees/Data_13TeV_DoubleHTag_0", branch_names)
X_data, y_data, weights_data = preprocessing.clean_signal_events_single_dataset(
    X_data, y_data, weights_data)

# bbggTrees contain both signal and control-region (CR) events by default,
# so make sure they are cleaned here as well.
X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.clean_signal_events(
    X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig)

# load the model from disk
# NOTE(review): joblib.load unpickles the file - only load trusted models.
from sklearn.externals import joblib
###########
## 2016
# The pickle is looked up as <pklfolder>/<mass_range>mass_XGB_training_file.pkl.
loaded_model = joblib.load(
    os.path.expanduser(
        str(pklfolder) + '/' + mass_range + 'mass_XGB_training_file.pkl'))
# Script fragment: count signal / background / data events from
# "bbggSelectionTree" and load a trained classifier (2018 training) from disk.
# `preprocessing` and `os` are expected to be defined earlier in the file
# (not visible in this fragment).

# MVA input variables; entries prefixed with "noexpand:" are expressions
# (here ratios of two branches) rather than plain branch names.
branch_names = 'absCosThetaStar_CS,absCosTheta_bb,absCosTheta_gg,PhoJetMinDr,customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingJet_DeepFlavour,subleadingJet_DeepFlavour,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverM,diphotonCandidatePtOverdiHiggsM,dijetCandidatePtOverdiHiggsM,leadingJet_bRegNNResolution,subleadingJet_bRegNNResolution,sigmaMJets,noexpand:leadingPhoton_pt/CMS_hgg_mass,noexpand:subleadingPhoton_pt/CMS_hgg_mass,noexpand:leadingJet_pt/Mjj,noexpand:subleadingJet_pt/Mjj,PhoJetOtherDr,rho'.split(",")
# Alternative variable set (DeepJet + Mjj), currently disabled:
#branch_names = 'absCosThetaStar_CS,absCosTheta_bb,absCosTheta_gg,PhoJetMinDr,customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingJet_DeepFlavour,subleadingJet_DeepFlavour,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverM,diphotonCandidatePtOverdiHiggsM,dijetCandidatePtOverdiHiggsM,leadingJet_bRegNNResolution,subleadingJet_bRegNNResolution,noexpand:sigmaMJets,Mjj,rho'.split(",")


# Strip stray whitespace around every branch name.
branch_names = [c.strip() for c in branch_names]
print branch_names

import pandas as pd
import root_pandas as rpd

# no need to shuffle here, we just count events
preprocessing.set_signals_and_backgrounds("bbggSelectionTree",branch_names,shuffle=False)
# Features (X), labels (y) and weights for background and signal.
X_bkg,y_bkg,weights_bkg,X_sig,y_sig,weights_sig=preprocessing.set_variables(branch_names)

# Same triple for data, then cleaned by the single-dataset helper.
X_data,y_data,weights_data = preprocessing.set_data("bbggSelectionTree",branch_names)
X_data,y_data,weights_data = preprocessing.clean_signal_events_single_dataset(X_data,y_data,weights_data)

# bbggTrees contain both signal and control-region (CR) events by default,
# so make sure they are cleaned here as well.
X_bkg,y_bkg,weights_bkg,X_sig,y_sig,weights_sig=preprocessing.clean_signal_events(X_bkg,y_bkg,weights_bkg,X_sig,y_sig,weights_sig)


# load the model from disk
# NOTE(review): joblib.load unpickles the file - only load trusted models.
from sklearn.externals import joblib
###########
###########
# 2018 trainings; earlier alternatives (ptMgg + ptMjj + dR, Mjj without the
# Mjj cut) are kept below, commented out, for reference.
#loaded_model = joblib.load(os.path.expanduser('/eos/user/i/ivovtin/HHggbb/HHbbggTraining/Training/output_files/2018/dev_legecy_runII_ptmgg_ptmjj_dR/simlple_Test_binary_st.pkl'))
#loaded_model = joblib.load(os.path.expanduser('/eos/user/i/ivovtin/HHggbb/HHbbggTraining/Training/output_files/2018/dev_legecy_runII_Mjj_woMjjcut_v2/simlple_Test_binary_st.pkl'))
loaded_model = joblib.load(os.path.expanduser('/eos/user/i/ivovtin/HHggbb/HHbbggTraining/Training/output_files/2018/dev_legecy_runII_ext_rho_rew_v3/simlple_Test_binary_st.pkl'))
def _output_path(outTag, sample_path, suffix):
    """Build the output ROOT file path for one input sample.

    The file name is the last "/"-separated component of ``sample_path``
    with "output_" and ".root" removed, followed by ``suffix`` and ".root",
    placed under ``~/HHbbgg_ETH_devel/outfiles/<outTag>/``.
    """
    stem = sample_path.split("/")[-1].replace("output_",
                                              "").replace(".root", "")
    return (os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') + outTag +
            '/' + stem + suffix + ".root")


def _drop_zero_weight_rows(stacked, dictVar):
    """Keep only the rows of ``stacked`` whose 'weight' column is non-zero."""
    return stacked[np.where(stacked[:, dictVar['weight']] != 0), :][0]


def _save_tree(processPath, dictVar, events, scores=None, nameTree=None):
    """Call ``postprocessing.saveTree``, passing only the arguments supplied.

    ``scores`` is appended as a fourth positional argument and ``nameTree``
    is passed as a keyword only when they are not None, so that saveTree's
    own defaults apply otherwise.
    """
    positional = [processPath, dictVar, events]
    if scores is not None:
        positional.append(scores)
    keywords = {}
    if nameTree is not None:
        keywords["nameTree"] = nameTree
    postprocessing.saveTree(*positional, **keywords)


def main(options, args):
    """Write preselection trees for signal, backgrounds and data.

    For every sample two ROOT files are written under
    ``~/HHbbgg_ETH_devel/outfiles/<options.outTag>/``: ``*_preselection.root``
    and ``*_preselection_diffNaming.root`` (the latter with a per-sample tree
    name).  Finally all ``*diffNaming.root`` files are merged with ``hadd``.

    Args:
        options: object providing ``trainingVersion`` (name of the pickled
            model), ``addHHTagger`` (when true, the branch 'HHTagger2017' is
            read from the trees and no classifier output is computed) and
            ``outTag`` (output sub-directory name).
        args: unused.
    """
    addSamples()

    #mva variables, use noexpand for root expressions, it needs this file https://github.com/ibab/root_pandas/blob/master/root_pandas/readwrite.py
    branch_names = 'leadingJet_bDis,subleadingJet_bDis,noexpand:fabs(CosThetaStar_CS),noexpand:fabs(CosTheta_bb),noexpand:fabs(CosTheta_gg)'.split(
        ",")
    branch_names += 'noexpand:diphotonCandidate.Pt()/diHiggsCandidate.M(),noexpand:dijetCandidate.Pt()/diHiggsCandidate.M()'.split(
        ",")
    branch_names += 'customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverMDecorr,PhoJetMinDr'.split(
        ",")
    branch_names = [c.strip() for c in branch_names]
    # Single-argument print(...) behaves identically in Python 2 and 3.
    print("using following variables for MVA: ")
    print(branch_names)

    # no need to shuffle here, we just count events
    preprocessing.set_signals_and_backgrounds("bbggSelectionTree",
                                              branch_names,
                                              shuffle=False)
    X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.set_variables(
        branch_names)

    X_data, y_data, weights_data = preprocessing.set_data(
        "bbggSelectionTree", branch_names)
    X_data, y_data, weights_data = preprocessing.clean_signal_events_single_dataset(
        X_data, y_data, weights_data)

    #bbggTrees have by default signal and CR events, let's be sure that we clean it
    X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.clean_signal_events(
        X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig)

    # load the model from disk
    # NOTE(review): joblib.load unpickles the file - only load trusted models.
    from sklearn.externals import joblib
    loaded_model = joblib.load(
        os.path.expanduser('~/HHbbgg_ETH_devel/Training/output_files/' +
                           options.trainingVersion + '.pkl'))

    # One background process fewer than background samples: gJets is a single
    # process split over two samples (indices 1 and 2).  Process i carries the
    # label -(i + 1) in y_bkg.
    n_bkg_processes = len(utils.IO.backgroundName) - 1
    bkg = [X_bkg[y_bkg == -i - 1] for i in range(n_bkg_processes)]

    #compute the MVA
    if not options.addHHTagger:
        # The classifier output is the probability of the last class.
        last_class = loaded_model.n_classes_ - 1
        Y_pred_sig = loaded_model.predict_proba(
            X_sig)[:, last_class].astype(np.float64)
        Y_pred_bkg = []
        for i in range(n_bkg_processes):
            print(str(i))
            Y_pred_bkg.append(
                loaded_model.predict_proba(
                    bkg[i])[:, last_class].astype(np.float64))

        Y_pred_data = loaded_model.predict_proba(
            X_data)[:, last_class].astype(np.float64)
        print(Y_pred_data)

    #define MVA cut and additional variables needed
    additionalCut_names = 'noexpand:diphotonCandidate.M(),noexpand:dijetCandidate.M(),MX,isSignal'.split(
        ",")
    #mva output
    if options.addHHTagger:
        additionalCut_names += 'HHTagger2017'.split(",")
    outTag = options.outTag
    outDir = os.path.expanduser("~/HHbbgg_ETH_devel/outfiles/" + outTag)
    if not os.path.exists(outDir):
        os.mkdir(outDir)

    ## signal
    signal_file = utils.IO.signalName[0]
    sig_count_df = rpd.read_root(signal_file,
                                 "bbggSelectionTree",
                                 columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(sig_count_df, utils.IO.sigProc[0],
                                        signal_file)

    #nTot is a multidim vector with all additional variables, dictVar is a dictionary associating a name of the variable
    #to a position in the vector
    nTot, dictVar = postprocessing.stackFeatures(
        sig_count_df, branch_names + additionalCut_names)
    #apply isSignal cleaning
    nCleaned = _drop_zero_weight_rows(nTot, dictVar)

    # In HHTagger mode no classifier scores exist, so none are passed on.
    scores = None if options.addHHTagger else Y_pred_sig
    _save_tree(_output_path(outTag, signal_file, "_preselection"), dictVar,
               nCleaned, scores)
    _save_tree(_output_path(outTag, signal_file, "_preselection_diffNaming"),
               dictVar,
               nCleaned,
               scores,
               nameTree="reducedTree_sig")

    ## do gJets not in the loop since they have two samples for one process, to be fixed
    gjets_file = utils.IO.backgroundName[1]
    bkg_1_count_df = rpd.read_root(gjets_file,
                                   "bbggSelectionTree",
                                   columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(bkg_1_count_df, utils.IO.bkgProc[1],
                                        gjets_file)

    # Hard-coded scale factor handed to stackFeatures as SF.
    crazySF_20 = 25
    nTot, dictVar = postprocessing.stackFeatures(bkg_1_count_df,
                                                 branch_names +
                                                 additionalCut_names,
                                                 SF=crazySF_20)

    print(nTot.shape)

    bkg_2_count_df = rpd.read_root(utils.IO.backgroundName[2],
                                   "bbggSelectionTree",
                                   columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(bkg_2_count_df, utils.IO.bkgProc[2],
                                        utils.IO.backgroundName[2])

    # Hard-coded scale factor handed to stackFeatures as SF.
    crazySF_40 = 3
    nTot_2, dictVar = postprocessing.stackFeatures(bkg_2_count_df,
                                                   branch_names +
                                                   additionalCut_names,
                                                   SF=crazySF_40)

    # Both gJets samples are written out as one process.
    nTot_3 = np.concatenate((nTot, nTot_2))

    print(nTot_3.shape)
    nCleaned = _drop_zero_weight_rows(nTot_3, dictVar)
    print("nCleaned")
    print(nCleaned.shape)

    # The merged output is named after sample 1 with "_Pt-20to40" removed.
    scores = None if options.addHHTagger else Y_pred_bkg[1]
    processPath = _output_path(outTag, gjets_file,
                               "_preselection").replace("_Pt-20to40", "")
    _save_tree(processPath, dictVar, nCleaned, scores)

    processPath = _output_path(outTag, gjets_file,
                               "_preselection_diffNaming").replace(
                                   "_Pt-20to40", "")
    _save_tree(processPath,
               dictVar,
               nCleaned,
               scores,
               nameTree="reducedTree_bkg_2")

    for iProcess in range(0, len(utils.IO.backgroundName)):
        ##gJets which are two samples for one process are skipped
        if iProcess == 1 or iProcess == 2:
            continue
        # Samples after gJets are shifted by one w.r.t. the process index
        # used for Y_pred_bkg.
        iSample = iProcess - 1 if iProcess > 2 else iProcess

        print("Processing sample: " + str(iProcess))
        sample_file = utils.IO.backgroundName[iProcess]
        bkg_count_df = rpd.read_root(sample_file,
                                     "bbggSelectionTree",
                                     columns=branch_names +
                                     additionalCut_names)
        preprocessing.define_process_weight(bkg_count_df,
                                            utils.IO.bkgProc[iProcess],
                                            sample_file)

        ##scale diphoton + jets
        crazySF = 1.45 if iProcess == 0 else 1
        nTot, dictVar = postprocessing.stackFeatures(bkg_count_df,
                                                     branch_names +
                                                     additionalCut_names,
                                                     SF=crazySF)

        nCleaned = _drop_zero_weight_rows(nTot, dictVar)
        print("nCleaned")
        print(nCleaned.shape)

        scores = None if options.addHHTagger else Y_pred_bkg[iSample]

        # The file name is taken from this sample's own path.  The previous
        # code located it using the depth of backgroundName[7], which raised
        # IndexError with fewer than 8 samples and picked the wrong path
        # component when path depths differed.
        processPath = _output_path(outTag, sample_file, "_preselection")
        _save_tree(processPath, dictVar, nCleaned, scores)

        processPath = _output_path(outTag, sample_file,
                                   "_preselection_diffNaming")
        if "GluGluToHHTo2B2G_node_" in processPath:
            # NOTE(review): the offset 6 presumably is the index just before
            # the first node sample in backgroundName - confirm.
            treeName = "reducedTree_sig_node_" + str(iProcess - 6)
        else:
            treeName = "reducedTree_bkg_" + str(iProcess)
        _save_tree(processPath, dictVar, nCleaned, scores, nameTree=treeName)

    ##data
    data_file = utils.IO.dataName[0]
    data_count_df = rpd.read_root(data_file,
                                  "bbggSelectionTree",
                                  columns=branch_names + additionalCut_names)

    nTot, dictVar = postprocessing.stackFeatures(data_count_df,
                                                 branch_names +
                                                 additionalCut_names,
                                                 isData=1)

    #apply isSignal cleaning
    nCleaned = _drop_zero_weight_rows(nTot, dictVar)
    print("nCleaned")
    print(nCleaned.shape)

    #save preselection data
    scores = None if options.addHHTagger else Y_pred_data
    _save_tree(_output_path(outTag, data_file, "_preselection"), dictVar,
               nCleaned, scores)
    _save_tree(_output_path(outTag, data_file, "_preselection_diffNaming"),
               dictVar,
               nCleaned,
               scores,
               nameTree="reducedTree_bkg")

    # Merge every *diffNaming.root file into one; the shell expands the glob.
    # NOTE(review): the pattern also matches Total_preselection_diffNaming.root
    # itself if it already exists from an earlier run - confirm reruns behave
    # as intended.
    os.system('hadd ' + os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') +
              outTag + '/' + 'Total_preselection_diffNaming.root ' +
              os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') + outTag +
              '/' + '*diffNaming.root')