Example #1
def createSamples(channel,
                  analysis_dir,
                  total_weight,
                  server,
                  add_data_cut=None,
                  dataset='2017'):
    sample_dict = {}
    # print "creating samples from %s"%(analysis_dir)
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=server,
        channel=channel,
        add_data_cut=add_data_cut,
        dataset=dataset)

    # select here the samples you wish to use
    # working_samples = samples_data_dde
    working_samples = samples_all
    working_samples = setSumWeights(working_samples)
    sample_dict['working_samples'] = working_samples
    print('')

    print('###########################################################')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for sample in working_samples:
        print(
            '{:<20}{:<20}'.format(*[sample.name, ('path: ' + sample.ana_dir)]))
    # for w in working_samples: print('{:<20}{:<20}'.format(*[w.name,('path: '+w.ana_dir+w.dir_name+'/'+tree_prod_name)]))

    return sample_dict
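
A hypothetical driver for this function could look as follows; the helpers (createSampleLists, setSumWeights) are assumed to be importable from the analysis package, the path and server are placeholders, and note that total_weight is accepted but never used in the body:

# hedged usage sketch; all argument values below are placeholders
sample_dict = createSamples(channel='mmm',
                            analysis_dir='/path/to/ntuples',
                            total_weight='weight',
                            server='lxplus',
                            dataset='2017')
working_samples = sample_dict['working_samples']
print('%d samples ready' % len(working_samples))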
Example #2
    def makeDataFrame(self):
        samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
        working_samples = samples_doublefake
        working_samples = setSumWeights(working_samples)
        print('###########################################################')
        print('# measuring doublefake rate...')
        print('# %d samples to be used:'%(len(working_samples)))
        print('###########################################################')
        for w in working_samples:
            print('{:<20}{:<20}'.format(w.name, 'path: ' + w.ana_dir))
        chain = TChain('tree')  # chain all data samples together
        for sample in working_samples:  # iterate over every sample, not just working_samples[0]
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            chain.Add(file_name)
            
        dataframe = RDataFrame(chain)
        weight = 'weight * lhe_weight'
        dataframe = (dataframe.Define('w', weight)
                              .Define('ptCone', self.ptCone())
                              .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')
                              .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')
                              .Define('abs_l1_eta', 'abs(l1_eta)')
                              .Define('abs_l2_eta', 'abs(l2_eta)')
                              .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')
                              .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)'))

        return dataframe
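
Because RDataFrame evaluates lazily, histograms can be booked on the returned dataframe and are only filled on first access; a minimal sketch from within the same class (the cut and binning are illustrative):

# hedged sketch: fill a weighted ptCone histogram from the lazy dataframe
df = self.makeDataFrame()
h = df.Filter('abs_l1_eta < 2.4').Histo1D(('h_ptCone', ';ptCone [GeV];entries', 40, 0., 200.), 'ptCone', 'w')
print('entries: %d' % h.GetEntries())  # this first access triggers the event loop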
Example #3
def makeCfgs(signalDict, channel, dataset, ana_dir, signals):
    if ('mmm' in channel) or ('mem' in channel): ch = 'mu'
    elif ('eee' in channel) or ('eem' in channel): ch = 'e'
    else: raise ValueError('unsupported channel: %s' % channel)  # otherwise ch would be undefined below
    samples = []
    Vs = [  # |V| coupling values; comments give V^2 (e.g. 1em10 = 1e-10)
        '0p00001', # v2 = 1em10
        '0p00001414213562', # v2 = 2em10
        '0p00001732050808', # v2 = 3em10
        '0p00002', # v2 = 4em10
        '0p000022360679774997898', # v2 = 5em10
        '0p00002449489743', # v2 = 6em10
        '0p00002645751311', # v2 = 7em10
        '0p00002828427125', # v2 = 8em10
        '0p00003', # v2 = 9em10

        '0p000031622776601683795', # v2 = 1em09 
        '0p00004472135955', # v2 = 2em09
        '0p00005477225575', # v2 = 3em09
        '0p0000632455532', # v2 = 4em09
        '0p00007071067811865475', # v2 = 5em09
        '0p00007745966692', # v2 = 6em09
        '0p00008366600265', # v2 = 7em09
        '0p0000894427191', # v2 = 8em09
        '0p00009486832981', # v2 = 9em09

        '0p0001', # v2 = 1em08
        '0p0001414213562', # v2 = 2em08
        '0p0001732050808', # v2 = 3em08
        '0p0002', # v2 = 4em08
        '0p00022360679774997898', # v2 = 5em08
        '0p0002449489743', # v2 = 6em08
        '0p0002645751311', # v2 = 7em08
        '0p0002828427125', # v2 = 8em08
        '0p0003', # v2 = 9em08

        '0p00031622776601683795', # v2 = 1em07 
        '0p0004472135955', # v2 = 2em07
        '0p0005477225575', # v2 = 3em07
        '0p000632455532', # v2 = 4em07
        '0p0007071067811865475', # v2 = 5em07
        '0p0007745966692', # v2 = 6em07
        '0p0008366600265', # v2 = 7em07
        '0p000894427191', # v2 = 8em07
        '0p0009486832981', # v2 = 9em07

        '0p001', # v2 = 1em06
        '0p001414213562', # v2 = 2em06
        '0p001732050808', # v2 = 3em06
        '0p002', # v2 = 4em06
        '0p0022360679774997898', # v2 = 5em06
        '0p002449489743', # v2 = 6em06
        '0p002645751311', # v2 = 7em06
        '0p002828427125', # v2 = 8em06
        '0p003', # v2 = 9em06

        '0p0031622776601683795', # v2 = 1em05 
        '0p004472135955', # v2 = 2em05
        '0p005477225575', # v2 = 3em05
        '0p00632455532', # v2 = 4em05
        '0p007071067811865475', # v2 = 5em05
        '0p007745966692', # v2 = 6em05
        '0p008366600265', # v2 = 7em05
        '0p00894427191', # v2 = 8em05
        '0p009486832981', # v2 = 9em05

        '0p01', # v2 = 1em04
        '0p01414213562', # v2 = 2em04
        '0p01732050808', # v2 = 3em04
        '0p02', # v2 = 4em04
        '0p022360679774997898', # v2 = 5em04
        '0p02449489743', # v2 = 6em04
        '0p02645751311', # v2 = 7em04
        '0p02828427125', # v2 = 8em04
        '0p03', # v2 = 9em04

        '0p031622776601683795', # v2 = 1em03 
        '0p04472135955', # v2 = 2em03
        '0p05477225575', # v2 = 3em03
        '0p0632455532', # v2 = 4em03
        '0p07071067811865475', # v2 = 5em03
        '0p07745966692', # v2 = 6em03
        '0p08366600265', # v2 = 7em03
        '0p0894427191', # v2 = 8em03
        '0p09486832981', # v2 = 9em03

        '0p1', # v2 = 1em02
        '0p1414213562', # v2 = 2em02
        '0p1732050808', # v2 = 3em02
        '0p2', # v2 = 4em02
        '0p22360679774997898', # v2 = 5em02
        '0p2449489743', # v2 = 6em02
        '0p2645751311', # v2 = 7em02
        '0p2828427125', # v2 = 8em02
        '0p3', # v2 = 9em02

        '0p31622776601683795', # v2 = 1em01 
        '0p4472135955', # v2 = 2em01
        '0p5477225575', # v2 = 3em01
        '0p632455532', # v2 = 4em01
        '0p7071067811865475', # v2 = 5em01
        '0p7745966692', # v2 = 6em01
        '0p8366600265', # v2 = 7em01
        '0p894427191', # v2 = 8em01
        '0p9486832981', # v2 = 9em01
        ]

    for mass in signalDict:
        if signalDict[mass] == {}: continue
        maxEntries = 0
        maxEntriesSampleKey = None
        for v in signalDict[mass]:
            entries = signalDict[mass][v]['count']
            if entries >= maxEntries:
                maxEntries = entries
                maxEntriesSampleKey = v
        try:
            subdir = signalDict[mass][maxEntriesSampleKey]['name']
        except KeyError:
            set_trace()  # pdb breakpoint: the 'name' field is missing for this mass point
        for newV in Vs:
            name = 'HN3L_M_%s_V_%s_%s_massiveAndCKM_LO_reweighted' % (mass,
                                                                      newV, ch)
            sample = makeSample(name,
                                subdir=subdir,
                                signals=signals,
                                dataset=dataset,
                                channel=channel,
                                analysis_dir=ana_dir)
            samples.append(sample)

    samples = setSumWeights(samples)

    return samples
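
The expected signalDict layout can be read off the lookups above: each mass maps to coupling points carrying an event 'count' and the production subdirectory 'name'. A hypothetical call (all values below are placeholders):

signalDict = {
    '2': {
        '0p01': {'count': 200000, 'name': 'HN3L_M_2_V_0p01_mu_massiveAndCKM_LO'},
        '0p001': {'count': 150000, 'name': 'HN3L_M_2_V_0p001_mu_massiveAndCKM_LO'},
    },
}
samples = makeCfgs(signalDict, channel='mmm', dataset='2017',
                   ana_dir='/path/to/ntuples', signals=[])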
Example #4
def createArrays(features,
                 branches,
                 path_to_NeuralNet,
                 faketype='DoubleFake',
                 channel='mmm',
                 multiprocess=True,
                 dataset='2017',
                 analysis_dir='/home/dehuazhu/SESSD/4_production/'):
    # define basic environment parameters
    hostname = gethostname()

    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=hostname,
        channel=channel,
        dataset=dataset)
    working_samples = samples_data
    # working_samples = samples_nonprompt
    # working_samples = samples_mc

    # necessary if you want to compare data with MC
    working_samples = setSumWeights(working_samples)
    samples_mc = setSumWeights(samples_mc)

    # make a TChain object by combining all necessary data samples
    print('###########################################################')
    if faketype == 'DoubleFake':
        print('# measuring doublefake rate...')
    if faketype == 'SingleFake1':
        print('# measuring singlefake rate for lepton 1...')
    if faketype == 'SingleFake2':
        print('# measuring singlefake rate for lepton 2...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # chain all data samples together
    for i, s in enumerate(working_samples):
        # sample = working_samples[0] #super stupid mistake, I'm keeping it here as a painful reminder
        sample = working_samples[i]
        file_name = '/'.join([
            sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'
        ])
        chain.Add(file_name)

    # define the selections
    if faketype == 'SingleFake1':
        region = Selections.Region('MR_SF1', channel, 'MR_SF1')
        selection_passing = region.data
        selection_failing = region.SF_LT

    if faketype == 'SingleFake2':
        region = Selections.Region('MR_SF2', channel, 'MR_SF2')
        selection_passing = region.data
        selection_failing = region.SF_TL

    if faketype == 'DoubleFake':
        region = Selections.Region('MR_DF', channel, 'MR_DF')
        selection_passing = region.data
        selection_failing = region.DF

    if faketype == 'nonprompt':
        region = Selections.Region('AN_Feb', channel, 'AN_Feb')
        selection_passing = region.data
        selection_failing = region.nonprompt
        # NB: the MC contamination selections are only defined in this branch;
        # the MC subtraction further down assumes faketype == 'nonprompt'
        selection_passing_MC = region.MC_contamination_pass
        selection_failing_MC = region.MC_contamination_fail

    # convert TChain object into numpy arrays for the training
    start = time.time()
    if multiprocess:
        queue = multiprocessing.Queue()
        result = []
        processes = []

        for key in ['pass', 'fail']:
            if key == 'pass': selection = selection_passing
            if key == 'fail': selection = selection_failing
            processes.append(
                multiprocessing.Process(target=tree2array_process,
                                        args=(queue, chain, branches,
                                              selection, key)))

        for p in processes:
            p.start()

        for p in processes:
            result.append(queue.get())
            p.join()

        for r in result:
            if r[0] == 'pass':
                array_pass = r[1]
            if r[0] == 'fail':
                array_fail = r[1]

    if not multiprocess:
        print('converting .root ntuples to numpy arrays... (passed events)')
        array_pass = tree2array(chain,
                                branches=branches,
                                selection=selection_passing)
        print('nevents from array_pass: ' + str(array_pass.size))
        print('converting .root ntuples to numpy arrays... (failed events)')
        array_fail = tree2array(chain,
                                branches=branches,
                                selection=selection_failing)
        print('nevents from array_fail: ' + str(array_fail.size))

    delta = time.time() - start
    print('It took %.2f seconds to create the arrays' % delta)

    df_pass = pd.DataFrame(array_pass)
    df_fail = pd.DataFrame(array_fail)

    # give data a contamination factor of 1, i.e. data itself is not scaled
    for array in [df_pass, df_fail]:
        array['contamination_weight'] = array.weight * array.lhe_weight
        # array['contamination_weight'] = array.weight * array.lhe_weight * lumi *  xsec / sumweights

    # adding MC prompt contamination
    print('###########################################################')
    print('now adding MC prompt contamination to the training')
    print('# %d samples to be used:' % (len(samples_mc)))
    print('###########################################################')
    for w in samples_mc:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    lumi = 41530  # full 2017 dataset (all eras), in pb^-1
    # lumi = 4792 # only era B

    if multiprocess:
        pool = multiprocessing.Pool(len(samples_mc))
        input_array = []

        for i, sample in enumerate(samples_mc):
            for key in ['pass', 'fail']:
                file_in = '/'.join([
                    sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                    'tree.root'
                ])
                if key == 'pass': selection = selection_passing_MC
                if key == 'fail': selection = selection_failing_MC
                entry = [
                    file_in, branches, selection, sample.name, key,
                    sample.xsec, sample.sumweights
                ]
                input_array.append(entry)

        result = pool.map(root2array_PoolProcess, input_array)

        for i, sample in enumerate(result):
            array = sample[1]
            xsec = sample[2]
            sumweights = sample[3]
            try:
                # the factor (-1) makes the prompt-MC contamination enter with
                # negative weight, so it is subtracted from the data-driven sample
                array['contamination_weight'] = (array.weight * array.lhe_weight *
                                                 lumi * (-1) * xsec / sumweights)
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight
            except Exception:
                set_trace()  # pdb breakpoint: a required branch is missing from the array

            if sample[0] == 'pass':
                df_pass = pd.concat([df_pass, array])
                # df_fail = pd.concat([df_fail,array])
                # print ('added pass events to df_pass: %d'%len(array))

            if sample[0] == 'fail':
                # df_pass = pd.concat([df_pass,array])
                df_fail = pd.concat([df_fail, array])
                # print ('added fail events to df_pass: %d'%len(array))

    if not multiprocess:
        for i, s in enumerate(samples_mc):
            sample = samples_mc[i]
            print('computing %s' % sample.name)
            file_in = '/'.join([
                sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                'tree.root'
            ])

            selection_pass = selection_passing_MC
            selection_fail = selection_failing_MC

            passing = pd.DataFrame(
                root2array(file_in,
                           'tree',
                           branches=branches,
                           selection=selection_passing_MC))
            failing = pd.DataFrame(
                root2array(file_in,
                           'tree',
                           branches=branches,
                           selection=selection_failing_MC))

            for array in [passing, failing]:
                array['contamination_weight'] = (array.weight * array.lhe_weight *
                                                 lumi * (-1) * sample.xsec / sample.sumweights)
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi *  sample.xsec / sample.sumweights
            df_pass = pd.concat([df_pass, passing])
            # df_pass = pd.concat([df_fail,failing])
            # df_fail = pd.concat([df_fail,passing])
            df_fail = pd.concat([df_fail, failing])

    print('array size after including MC: %d(pass); %d(fail)' %
          (len(df_pass), len(df_fail)))

    # add the target column
    df_pass['target'] = np.ones(df_pass.shape[0]).astype(int)   # np.int was removed in NumPy 1.24
    df_fail['target'] = np.zeros(df_fail.shape[0]).astype(int)

    # concatenate the events and shuffle
    data = pd.concat([df_pass, df_fail])
    data = data.sample(
        frac=1, replace=False,
        random_state=1986)  # shuffle (and DON'T replace the sample)
    data.index = np.array(range(len(data)))

    data.to_pickle(path_to_NeuralNet + 'training_data.pkl')
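
createArrays returns nothing; its product is the pickled, shuffled frame. Reading it back for training could look like this sketch, assuming features is a subset of the converted branches:

import pandas as pd

data = pd.read_pickle(path_to_NeuralNet + 'training_data.pkl')
X = data[features]                 # network inputs
y = data['target']                 # 1 = passing, 0 = failing
w = data['contamination_weight']   # per-event weight incl. negative MC subtraction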
Example #5
    def measureSFR(self, drawPlot=False):
        samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
        working_samples = samples_singlefake
        working_samples = setSumWeights(working_samples)
        print('###########################################################')
        print('# measuring singlefake rate...')
        print('# %d samples to be used:'%(len(working_samples)))
        print('###########################################################')
        for w in working_samples:
            print('{:<20}{:<20}'.format(w.name, 'path: ' + w.ana_dir))
        chain = TChain('tree')  # chain all data samples together
        for sample in working_samples:  # iterate over every sample, not just working_samples[0]
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            chain.Add(file_name)
            
        dataframe = RDataFrame(chain)
        weight = 'weight * lhe_weight'
        dataframe = (dataframe.Define('w', weight)
                              .Define('ptCone', self.ptCone())
                              .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')
                              .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')
                              .Define('abs_l1_eta', 'abs(l1_eta)')
                              .Define('abs_l2_eta', 'abs(l2_eta)')
                              .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')
                              .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)'))

        # bins_ptCone = np.array([5.,10., 20., 30., 40.,70., 2000])
        # bins_eta    = np.array([0., 0.8, 1.2, 2.4]) 
        bins_ptCone = np.array([5.,10., 20., 30., 40.,70.])
        bins_eta    = np.array([0., 0.8, 1.2, 2.4]) 

        selection_baseline = getSelection(self.channel, 'MR_SF')

        selection_LL_uncorrelated = '(' + ' & '.join([
            selection_baseline,
            getSelection(self.channel, 'L_L_uncorrelated'),
        ]) + ')'
        selection_TT_uncorrelated = '(' + ' & '.join([
            selection_baseline,
            getSelection(self.channel, 'L_L_uncorrelated'),
            getSelection(self.channel, 'T_T'),
        ]) + ')'

        h_LL_uncorrelated = dataframe\
                .Filter(selection_LL_uncorrelated)\
                .Histo2D(('h_LL_uncorrelated','h_LL_uncorrelated',len(bins_ptCone)-1,bins_ptCone, len(bins_eta)-1, bins_eta),'ptCone','abs_hnl_hn_vis_eta','w')
        # set the axis titles; this first access also triggers the lazy RDataFrame event loop
        h_LL_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

        h_TT_uncorrelated = dataframe\
                .Filter(selection_TT_uncorrelated)\
                .Histo2D(('h_TT_uncorrelated','h_TT_uncorrelated',len(bins_ptCone)-1,bins_ptCone, len(bins_eta)-1, bins_eta),'ptCone','abs_hnl_hn_vis_eta','w')
        # set the axis titles; this first access also triggers the lazy RDataFrame event loop
        h_TT_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

        # preparing the histo and save it into a .root file
        sfr_TH2_dir = '/home/dehuazhu/HNL/CMSSW_9_4_6_patch1/src/PlotFactory/DataBkgPlots/modules/DDE_singlefake.root' 
        sfr_hist = h_TT_uncorrelated.Clone()
        # sfr_hist = h_LL_uncorrelated.Clone()
        # sfrhist = h_baseline.Clone()
        # sfr_hist.Divide(h_LL_uncorrelated.Clone())
        # sfr_hist.SaveAs(sfr_TH2_dir) # uncomment this to save the TH2

        # draw the histogram if required
        if drawPlot:
            can = TCanvas('can', '')
            # sfr_hist.Draw('colzTextE')
            # sfr_hist.Draw('colz')
            sfr_hist.Draw()
            pf.showlumi('%d entries'%(sfr_hist.GetEntries()))
            # pf.showlogopreliminary()
            can.Update()
            set_trace()
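
To turn the two histograms into a tight-to-loose fake rate, the TT counts would be divided by the LL counts, as the commented-out Divide call above suggests; a hedged sketch of that final step:

# sketch of the ratio step left commented out in the original
sfr_hist = h_TT_uncorrelated.Clone()
sfr_hist.Divide(h_LL_uncorrelated.Clone())  # per-(ptCone, |eta|) tight-to-loose ratio
sfr_hist.SaveAs(sfr_TH2_dir)                # persist the TH2 for later lookups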