Example #1
    def makeDataFrame(self):
        sample_dict = {}
        samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
        working_samples = samples_doublefake
        working_samples = setSumWeights(working_samples)
        print('###########################################################')
        print('# measuring doublefakerate...')
        print('# %d samples to be used:'%(len(working_samples)))
        print('###########################################################')
        for w in working_samples: print('{:<20}{:<20}'.format(*[w.name,('path: '+w.ana_dir)]))
        chain = TChain('tree') #TChain'ing all data samples together
        for i, s in enumerate(working_samples):
            sample = working_samples[i]  # index with the loop counter; using [0] would chain only the first sample
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            chain.Add(file_name)
            
        dataframe = RDataFrame(chain)
        weight = 'weight * lhe_weight'
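        # 'w' is the per-event weight column; callers can use it when filling weighted histograms (cf. Example #5)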
        dataframe = dataframe.Define('w',weight)\
                            .Define('ptCone',self.ptCone())\
                            .Define('abs_hnl_hn_vis_eta','abs(hnl_hn_vis_eta)')\
                            .Define('abs_hnl_hn_eta','abs(hnl_hn_eta)')\
                            .Define('abs_l1_eta','abs(l1_eta)')\
                            .Define('abs_l2_eta','abs(l2_eta)')\
                            .Define('abs_l1_jet_flavour_parton','abs(l1_jet_flavour_parton)')\
                            .Define('abs_l2_jet_flavour_parton','abs(l2_jet_flavour_parton)')

        return dataframe
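
The RDataFrame returned here is lazy: each Define only books a new column, and nothing is read from disk until a result is requested. A minimal sketch of how the returned dataframe might be consumed, following the histogram pattern of Example #5 (the FakeRateMeasurement instance and the filter string are hypothetical):

fakerate = FakeRateMeasurement()  # hypothetical instance of the class this method belongs to
df = fakerate.makeDataFrame()
# book a weighted histogram of the cone-corrected pT (still lazy at this point)
h_pt = df.Filter('l1_pt > 5').Histo1D(('h_ptCone', 'h_ptCone', 40, 0., 200.), 'ptCone', 'w')
print('%d entries' % h_pt.GetEntries())  # first access triggers the event loop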
Example #2
def createSamples(channel, analysis_dir, total_weight, server, add_data_cut=None, dataset='2017'):
    sample_dict = {}
    # print "creating samples from %s"%(analysis_dir)
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(analysis_dir=analysis_dir, server=server, channel=channel, add_data_cut=add_data_cut, dataset=dataset)

    #select here the samples you wish to use
    # working_samples = samples_data_dde
    working_samples = samples_all
    working_samples = setSumWeights(working_samples)
    sample_dict['working_samples'] = working_samples
    print('')

    print('###########################################################')
    print('# %d samples to be used:'%(len(working_samples)))
    print('###########################################################')
    for sample in working_samples: print('{:<20}{:<20}'.format(*[sample.name,('path: '+sample.ana_dir)]))
    # for w in working_samples: print('{:<20}{:<20}'.format(*[w.name,('path: '+w.ana_dir+w.dir_name+'/'+tree_prod_name)]))

    return sample_dict
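
A minimal sketch of how createSamples might be invoked; the directory and server values are placeholders, and note that total_weight is accepted but never used inside the function body:

sample_dict = createSamples(channel='mmm',
                            analysis_dir='/path/to/production/',  # placeholder
                            total_weight='weight * lhe_weight',
                            server='myserver',                    # placeholder
                            dataset='2017')
working_samples = sample_dict['working_samples']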
Example #3
def make_all_friendtrees(multiprocess,
                         server,
                         analysis_dir,
                         channel,
                         path_to_NeuralNet,
                         overwrite,
                         dataset='2017'):
    print('making friendtrees for all datasamples')
    start = time.time()
    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=server,
        channel=channel,
        dataset=dataset)
    working_samples = samples_nonprompt
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    if multiprocess:
        pool = multiprocessing.Pool(len(working_samples))
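        # one worker per sample; each worker builds one friend tree via makeFriendtree_Process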
        input_array = []

        for sample in working_samples:
            file_name = '/'.join([
                sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                'tree.root'
            ])
            input_array.append([
                file_name,
                sample.name,
                path_to_NeuralNet + 'net.h5',
                path_to_NeuralNet,
                get_branches_nonprompt2(get_features_nonprompt2()),
                get_features_nonprompt2(),
                overwrite,
            ])
        result = pool.map(makeFriendtree_Process, input_array)

    else:
        for sample in working_samples:
            file_name = '/'.join([
                sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                'tree.root'
            ])
            friend_file_name = makeFriendtree(
                tree_file_name=file_name,
                sample_name=sample.name,
                net_name=path_to_NeuralNet + 'net.h5',
                path_to_NeuralNet=path_to_NeuralNet,
                branches=get_branches_nonprompt2(get_features_nonprompt2()),
                features=get_features_nonprompt2(),
                overwrite=overwrite,
            )
    duration = time.time() - start
    print('It took %.2f seconds to make all friendtrees.' % duration)
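
A sketch of a typical call; the directories are placeholders, and path_to_NeuralNet must end with '/' because it is concatenated directly with 'net.h5':

from socket import gethostname

make_all_friendtrees(multiprocess=True,
                     server=gethostname(),                 # as in Example #4
                     analysis_dir='/path/to/production/',  # placeholder
                     channel='mmm',
                     path_to_NeuralNet='/path/to/NN/',     # placeholder, trailing '/' required
                     overwrite=False,
                     dataset='2017')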
Example #4
def createArrays(features,
                 branches,
                 path_to_NeuralNet,
                 faketype='DoubleFake',
                 channel='mmm',
                 multiprocess=True,
                 dataset='2017',
                 analysis_dir='/home/dehuazhu/SESSD/4_production/'):
    # define basic environmental parameters
    hostname = gethostname()
    sample_dict = {}

    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=hostname,
        channel=channel,
        dataset=dataset)
    working_samples = samples_data
    # working_samples = samples_nonprompt
    # working_samples = samples_mc

    # necessary if you want to compare data with MC
    working_samples = setSumWeights(working_samples)
    samples_mc = setSumWeights(samples_mc)

    # make a TChain object by combining all necessary data samples
    print('###########################################################')
    if faketype == 'DoubleFake':
        print('# measuring doublefakerate...')
    if faketype == 'SingleFake1':
        print('# measuring singlefakerate for lepton 1...')
    if faketype == 'SingleFake2':
        print('# measuring singlefakerate for lepton 2...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  #TChain'ing all data samples together
    for i, s in enumerate(working_samples):
        # sample = working_samples[0] #super stupid mistake, I'm keeping it here as a painful reminder
        sample = working_samples[i]
        file_name = '/'.join([
            sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'
        ])
        chain.Add(file_name)

    # define the selections
    if faketype == 'SingleFake1':
        region = Selections.Region('MR_SF1', channel, 'MR_SF1')
        selection_passing = region.data
        selection_failing = region.SF_LT

    if faketype == 'SingleFake2':
        region = Selections.Region('MR_SF2', channel, 'MR_SF2')
        selection_passing = region.data
        selection_failing = region.SF_TL

    if faketype == 'DoubleFake':
        region = Selections.Region('MR_DF', channel, 'MR_DF')
        selection_passing = region.data
        selection_failing = region.DF

    if faketype == 'nonprompt':
        region = Selections.Region('AN_Feb', channel, 'AN_Feb')
        selection_passing = region.data
        selection_failing = region.nonprompt
        selection_passing_MC = region.MC_contamination_pass
        selection_failing_MC = region.MC_contamination_fail

    # convert TChain object into numpy arrays for the training
    start = time.time()
    if multiprocess:
        queue = multiprocessing.Queue()
        result = []
        processes = []

        for key in ['pass', 'fail']:
            if key == 'pass': selection = selection_passing
            if key == 'fail': selection = selection_failing
            processes.append(
                multiprocessing.Process(target=tree2array_process,
                                        args=(queue, chain, branches,
                                              selection, key)))

        for p in processes:
            p.start()

        for p in processes:
            result.append(queue.get())
            p.join()

        for r in result:
            if r[0] == 'pass':
                array_pass = r[1]
            if r[0] == 'fail':
                array_fail = r[1]

    else:
        print('converting .root ntuples to numpy arrays... (passed events)')
        array_pass = tree2array(chain,
                                branches=branches,
                                selection=selection_passing)
        print('nevents from array_pass: ' + str(array_pass.size))
        print('converting .root ntuples to numpy arrays... (failed events)')
        array_fail = tree2array(chain,
                                branches=branches,
                                selection=selection_failing)
        print('nevents from array_fail: ' + str(array_fail.size))

    delta = time.time() - start
    print('It took %.2f seconds to create the arrays' % delta)

    df_pass = pd.DataFrame(array_pass)
    df_fail = pd.DataFrame(array_fail)

    #giving data the contamination weight '1' (i.e. ignore it)
    for array in [df_pass, df_fail]:
        array['contamination_weight'] = array.weight * array.lhe_weight
        # array['contamination_weight'] = array.weight * array.lhe_weight * lumi *  xsec / sumweights

    # adding MC prompt contamination
    print('###########################################################')
    print('now adding MC prompt contamination to the training')
    print('# %d samples to be used:' % (len(samples_mc)))
    print('###########################################################')
    for w in samples_mc:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    lumi = 41530  # all eras
    # lumi = 4792 # only era B
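    # The prompt-MC arrays below get a NEGATIVE contamination_weight
    # (weight * lhe_weight * lumi * (-1) * xsec / sumweights), so concatenating
    # them onto the data frames subtracts the prompt contamination from the training sample.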

    if multiprocess:
        pool = multiprocessing.Pool(len(samples_mc))
        input_array = []

        for i, sample in enumerate(samples_mc):
            for key in ['pass', 'fail']:
                file_in = '/'.join([
                    sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                    'tree.root'
                ])
                if key == 'pass': selection = selection_passing_MC
                if key == 'fail': selection = selection_failing_MC
                entry = [
                    file_in, branches, selection, sample.name, key,
                    sample.xsec, sample.sumweights
                ]
                input_array.append(entry)

        result = pool.map(root2array_PoolProcess, input_array)

        for i, sample in enumerate(result):
            array = sample[1]
            xsec = sample[2]
            sumweights = sample[3]
            try:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi *  xsec /sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight
            except:
                set_trace()

            if sample[0] == 'pass':
                df_pass = pd.concat([df_pass, array])
                # df_fail = pd.concat([df_fail,array])
                # print ('added pass events to df_pass: %d'%len(array))

            if sample[0] == 'fail':
                # df_pass = pd.concat([df_pass,array])
                df_fail = pd.concat([df_fail, array])
                # print ('added fail events to df_pass: %d'%len(array))

    else:
        for sample in samples_mc:
            print('computing %s' % sample.name)
            file_in = '/'.join([
                sample.ana_dir, sample.dir_name, sample.tree_prod_name,
                'tree.root'
            ])

            passing = pd.DataFrame(
                root2array(file_in,
                           'tree',
                           branches=branches,
                           selection=selection_passing_MC))
            failing = pd.DataFrame(
                root2array(file_in,
                           'tree',
                           branches=branches,
                           selection=selection_failing_MC))

            for array in [passing, failing]:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * sample.xsec / sample.sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi *  sample.xsec / sample.sumweights
            df_pass = pd.concat([df_pass, passing])
            # df_pass = pd.concat([df_fail,failing])
            # df_fail = pd.concat([df_fail,passing])
            df_fail = pd.concat([df_fail, failing])

    print('array size after including MC: %d(pass); %d(fail)' %
          (len(df_pass), len(df_fail)))

    # add the target column
    df_pass['target'] = np.ones(df_pass.shape[0]).astype(int)
    df_fail['target'] = np.zeros(df_fail.shape[0]).astype(int)

    # concatenate the events and shuffle
    data = pd.concat([df_pass, df_fail])
    data = data.sample(
        frac=1, replace=False,
        random_state=1986)  # shuffle (and DON'T replace the sample)
    data.index = np.array(range(len(data)))

    data.to_pickle(path_to_NeuralNet + 'training_data.pkl')
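
The pickled dataframe can later be reloaded for training; a minimal sketch, with the path as a placeholder and features being the same list passed to createArrays:

import pandas as pd

data = pd.read_pickle('/path/to/NN/' + 'training_data.pkl')  # placeholder path
X = data[features]                # input features for the network
y = data['target']                # 1 = passing events, 0 = failing events
w = data['contamination_weight']  # per-event training weight (negative for prompt MC)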
Example #5
    def measureSFR(self, drawPlot = False):
        sample_dict = {}
        samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
        working_samples = samples_singlefake
        working_samples = setSumWeights(working_samples)
        print('###########################################################')
        print('# measuring singlefakerate...')
        print('# %d samples to be used:'%(len(working_samples)))
        print('###########################################################')
        for w in working_samples: print('{:<20}{:<20}'.format(*[w.name,('path: '+w.ana_dir)]))
        chain = TChain('tree') #TChain'ing all data samples together
        for i, s in enumerate(working_samples):
            sample = working_samples[i]  # index with the loop counter; using [0] would chain only the first sample
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            chain.Add(file_name)
            
        dataframe = RDataFrame(chain)
        weight = 'weight * lhe_weight'
        dataframe = dataframe.Define('w',weight)\
                            .Define('ptCone',self.ptCone())\
                            .Define('abs_hnl_hn_vis_eta','abs(hnl_hn_vis_eta)')\
                            .Define('abs_hnl_hn_eta','abs(hnl_hn_eta)')\
                            .Define('abs_l1_eta','abs(l1_eta)')\
                            .Define('abs_l2_eta','abs(l2_eta)')\
                            .Define('abs_l1_jet_flavour_parton','abs(l1_jet_flavour_parton)')\
                            .Define('abs_l2_jet_flavour_parton','abs(l2_jet_flavour_parton)')

        # bins_ptCone = np.array([5.,10., 20., 30., 40.,70., 2000])
        # bins_eta    = np.array([0., 0.8, 1.2, 2.4]) 
        bins_ptCone = np.array([5.,10., 20., 30., 40.,70.])
        bins_eta    = np.array([0., 0.8, 1.2, 2.4]) 

        selection_baseline      = getSelection(self.channel,'MR_SF')  

        selection_LL_uncorrelated = '(' + ' & '.join([
                                    selection_baseline,
                                    getSelection(self.channel,'L_L_uncorrelated')
                                    ]) + ')'
        selection_TT_uncorrelated = '(' + ' & '.join([
                                    selection_baseline,
                                    getSelection(self.channel,'L_L_uncorrelated'),
                                    getSelection(self.channel,'T_T')
                                    ]) + ')'

        h_LL_uncorrelated = dataframe\
                .Filter(selection_LL_uncorrelated)\
                .Histo2D(('h_LL_uncorrelated','h_LL_uncorrelated',len(bins_ptCone)-1,bins_ptCone, len(bins_eta)-1, bins_eta),'ptCone','abs_hnl_hn_vis_eta','w')
        # name the axes; first access of the histogram also triggers the lazy dataframe event loop
        h_LL_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

        h_TT_uncorrelated = dataframe\
                .Filter(selection_TT_uncorrelated)\
                .Histo2D(('h_TT_uncorrelated','h_TT_uncorrelated',len(bins_ptCone)-1,bins_ptCone, len(bins_eta)-1, bins_eta),'ptCone','abs_hnl_hn_vis_eta','w')
        # name the axes; first access of the histogram also triggers the lazy dataframe event loop
        h_TT_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

        # preparing the histo and save it into a .root file
        sfr_TH2_dir = '/home/dehuazhu/HNL/CMSSW_9_4_6_patch1/src/PlotFactory/DataBkgPlots/modules/DDE_singlefake.root' 
        sfr_hist = h_TT_uncorrelated.Clone()
        # sfr_hist = h_LL_uncorrelated.Clone()
        # sfrhist = h_baseline.Clone()
        # sfr_hist.Divide(h_LL_uncorrelated.Clone())
        # sfr_hist.SaveAs(sfr_TH2_dir) # uncomment this to save the TH2

        # draw the histo if required 
        if drawPlot:
            can = TCanvas('can', '')
            # sfr_hist.Draw('colzTextE')
            # sfr_hist.Draw('colz')
            sfr_hist.Draw()
            pf.showlumi('%d entries'%(sfr_hist.GetEntries()))
            # pf.showlogopreliminary()
            can.Update()
            set_trace()
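
The commented-out Divide call above indicates how the single-fake rate would be formed from the two histograms; a sketch, assuming both histogram results have been materialized:

# tight-to-loose ratio: tight-tight events over all loose-loose events, bin by bin
sfr_hist = h_TT_uncorrelated.Clone('sfr_hist')
sfr_hist.Divide(h_LL_uncorrelated.Clone())
sfr_hist.SaveAs(sfr_TH2_dir)  # writes the TH2 to the .root path defined above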