        fileSet = {
            k: fileset[k]
            for k in fileset
            if not ('TTGamma' in k or 'TTbar' in k or 'DY' in k or 'ST' in k
                    or 'W1' in k or 'W2' in k or 'W3' in k or 'W4' in k)
        }
        mcType = 'MCOther'

    print(fileSet.keys())

    output = processor.run_uproot_job(
        fileSet,
        treename='Events',
        processor_instance=TTGammaProcessor(mcEventYields=mcEventYields),
        executor=processor.futures_executor,
        executor_args={
            'workers': 5,
            'flatten': True
        },
        chunksize=50000,
        # maxchunks=1,
    )

    elapsed = time.time() - tstart
    print("Total time: %.1f seconds" % elapsed)
    print("Total rate: %.1f events / second" %
          (output['EventCount'].value / elapsed))

    util.save(output, f"output{mcType}_ttgamma_condorFull_4jet.coffea")

if sys.argv[1] == 'Data':
    output = processor.run_uproot_job(
Example #2
def main():

    overwrite = True
    small = True

    # load the config and the cache
    cfg = loadConfig()

    cacheName = 'singleLep_small' if small else 'singleLep'
    
    # Inputs are defined in a dictionary
    # dataset : list of files
    from samples import fileset, fileset_small, fileset_1l
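    # For illustration only (hypothetical paths), such a fileset looks like:
    #   {"TTJets": ["/store/mc/TTJets_1.root", "/store/mc/TTJets_2.root"], ...}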

    # histograms
    histograms = ["MET_pt", "N_b", "N_jet", "MT", "N_spec", "pt_spec_max", "HT", "ST"]
    histograms += ['mbj_max', 'mjj_max', 'mlb_min', 'mlb_max', 'mlj_min', 'mlj_max']
    #histograms += ['FWMT1', 'FWMT2', 'FWMT3', 'FWMT4', 'FWMT5']
    #histograms += ['S', 'S_lep']

    # initialize cache
    cache = dir_archive(os.path.join(os.path.expandvars(cfg['caches']['base']), cfg['caches'][cacheName]), serialized=True)
    if not overwrite:
        cache.load()
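    # The cached output below is reused only if the config and histogram list match
    # what was cached; otherwise the processor is rerun.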

    if cfg == cache.get('cfg') and histograms == cache.get('histograms') and cache.get('simple_output'):
        output = cache.get('simple_output')

    else:
        # Run the processor
        if small:
            fileset = fileset_small
            workers = 1
        else:
            fileset = fileset_1l
            workers = 6
        output = processor.run_uproot_job(fileset,
                                      treename='Events',
                                      processor_instance=exampleProcessor(),
                                      executor=processor.futures_executor,
                                      executor_args={'workers': workers, 'function_args': {'flatten': False}},
                                      chunksize=50000,
                                     )
        cache['fileset']        = fileset
        cache['cfg']            = cfg
        cache['histograms']     = histograms
        cache['simple_output']  = output
        cache.dump()

    # Make a few plots
    outdir = "./tmp_plots"
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for name in histograms:
        print (name)
        histogram = output[name]

        ax = hist.plot1d(histogram, overlay="dataset", stack=True)  # stacked distributions per dataset
        ax.set_yscale('linear')
        ax.figure.savefig(os.path.join(outdir, "{}.pdf".format(name)))
        ax.clear()

    return output
Example #3
import time
tstart = time.time()

workers = Factory("local", manager_host_port="localhost:9123")

workers.max_workers = 1
workers.min_workers = 1
workers.python_package = wq_env_tarball
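# The work_queue Factory above launches workers (here locally) that connect to the
# manager on localhost:9123; wq_env_tarball is assumed to be a packaged Python
# environment (e.g. built with conda-pack) defined earlier in the full script.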
with workers:
    output = processor.run_uproot_job(
        fileset,
        treename='Events',
        processor_instance=MyProcessor(),
        executor=processor.work_queue_executor,
        executor_args=work_queue_executor_args,
        chunksize=100000,

        # Change this to None for a large run:
        maxchunks=4,
    )

elapsed = time.time() - tstart
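# Optionally report the wall-clock time:
print("Total time: %.1f seconds" % elapsed)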

print(output)

# Expected output:
# {'sumw': defaultdict_accumulator(<class 'float'>, {'DoubleMuon': 400224.0}), 'mass': <Hist (dataset,mass) instance at 0x7f4e02708460>}

if output['sumw']['DoubleMuon'] == 400224.0:
    print("Output is correct.")
Example #4
    'savemetrics': 1,
    # 'xrootdconfig': {
    #     'chunkbytes': 1024*128,
    #     'limitbytes': 200 * 1024**2
    # },
    'cachestrategy': 'dask-worker',
    'worker_affinity': True,
}
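# Note: with 'savemetrics' enabled, run_uproot_job returns a (histograms, metrics)
# tuple, which is why the throughput numbers below are read from res[1].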
chunksize = 100000

if True:
    tic = time.time()
    res = processor.run_uproot_job(filelist,
                                   'Events',
                                   NanoTestProcessor(),
                                   processor.dask_executor,
                                   config,
                                   chunksize=chunksize,
                                   maxchunks=None)
    toc = time.time()

    print("Dask client:", client)
    print("Total time: %.0f" % (toc - tic))
    print("Events / s / thread: {:,.0f}".format(res[1]['entries'].value /
                                                res[1]['processtime'].value))
    print("Bytes / s / thread: {:,.0f}".format(res[1]['bytesread'].value /
                                               res[1]['processtime'].value))
    print("Events / s: {:,.0f}".format(res[1]['entries'].value / (toc - tic)))
    print("Bytes / s: {:,.0f}".format(res[1]['bytesread'].value / (toc - tic)))

    from coffea.util import save
Example #5
    from os.path import join, isdir, splitext

    reldir = splitext(__file__)[0].replace('_', '/')
    outdir = join(os.getenv('FH_BASE'), "Imgs", reldir)
    if not isdir(outdir): os.makedirs(outdir)

    histos = {}

    print('[signal]')
    outputs = {}
    for k, ds in sigDS.items():
        outputs[k] = processor.run_uproot_job(
            {k: ds},
            treename='ffNtuplizer/ffNtuple',
            processor_instance=LeptonjetIsoProcessor(dphi_control=False,
                                                     data_type='sig'),
            executor=processor.futures_executor,
            executor_args=dict(workers=12, flatten=True),
            chunksize=500000,
        )
    print("Filling..")
    histos['sig'] = {}
    for k in outputs:
        histos['sig'][k] = root_filling(outputs[k], k)

    print('[background]')
    output = processor.run_uproot_job(
        bkgDS,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LeptonjetIsoProcessor(dphi_control=False,
                                                 data_type='bkg'),
Example #6
else:
    import batch
    dfk = batch.configure(nodes=8, nprocs=8)
    job_kwargs = {
        'executor': processor.parsl_executor,
        'executor_args': {
            'flatten': False,
            'xrootdtimeout': 30
        },
    }
    fileset = utils.ensure_local(fileset)

binning = np.arange(101).astype(float)  # yes, hardcoded binning
output = processor.run_uproot_job(fileset,
                                  treename='metaTree/PUDistribution',
                                  processor_instance=PUHists(
                                      fileset.keys(), binning),
                                  chunksize=500000,
                                  **job_kwargs)
print('DONE!')
print(output)
set_trace()  # TODO: check that this overwrites pu_mc
outf = uproot.recreate(f'inputs/{args.jobid}/pu_mc.root')
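# Normalize each accumulated distribution to unit area and write it out as a
# (values, edges) pair, the np.histogram-style tuple that uproot accepts for TH1s.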
for key, h in output.items():
    vals = h.value
    vals /= vals.sum()
    outf[key] = (vals, binning)


def norm(h):
    scale = h.allvalues.sum()
        for i in range(len(h)):
Example #7
        "container_service_names": "dask",
        "dask_container_port": "8787",
        "should_transfer_files": "YES",
        "when_to_transfer_output": "ON_EXIT",
        "+DaskSchedulerAddress": '"129.93.183.33:8787"',
    })

cluster.scale(jobs=1)

client = Client(cluster)  #, security=sec_dask)

print("Dask client: ", client)

exe_args = {
    'client': client,
    'compression': compression,
}
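# 'compression' (defined earlier in the full script) is forwarded to the dask
# executor; it presumably controls how intermediate accumulators are compressed
# in transit between the workers and the client.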

hists = processor.run_uproot_job(
    filelist,
    treename,
    processor_instance=proc,
    executor=processor.dask_executor,
    #executor=processor.futures_executor,
    executor_args=exe_args)

assert (hists['cutflow']['ZJets_pt'] == 18)
assert (hists['cutflow']['ZJets_mass'] == 6)
assert (hists['cutflow']['Data_pt'] == 84)
assert (hists['cutflow']['Data_mass'] == 66)
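# The hard-coded cutflow counts above serve as a regression check for the input
# fileset; any mismatch raises an AssertionError.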
Example #8
    for this_file in infiles:
        index = this_file.split("_")[1].split(".json")[0]
        print(this_file, index)

        uproot.open.defaults[
            "xrootd_handler"] = uproot.source.xrootd.MultithreadedXRootDSource

        p = HbbProcessor(year=year, tagger='v2')
        args = {'savemetrics': True, 'schema': NanoAODSchema}

        output = processor.run_uproot_job(
            this_file,
            treename="Events",
            processor_instance=p,
            executor=processor.dask_executor,
            executor_args={
                "client": client,
                "skipbadfiles": 1,
                "schema": processor.NanoAODSchema,
                "treereduction": 2,
            },
            chunksize=100000,
            #        maxchunks=args.max,
        )

        outfile = '/uscms/home/cmantill/nobackup/tmp/hbb-cut-based/vbf-category/ggf-vbf-ddb2/outfiles/' + str(
            year) + '_dask_' + index + '.coffea'
        util.save(output, outfile)
        print("saved " + outfile)
Example #9


	print(sample_name)
	samples = {
		sample_name : filelist
	}
	
	
	# Class -> Object
	#JW_Processor_instance = JW_Processor(year,setname,corrections,xsecDY)  <-- under development -->
	JW_Processor_instance = JW_Processor(year,setname,xsecDY,pu,corrections)
	
	
	## -->Multi-node Executor
	result = processor.run_uproot_job(
		samples,  #dataset
		"Events", # Tree name
		JW_Processor_instance, # Class
		executor=processor.futures_executor,
		executor_args={"schema": NanoAODSchema, "workers": 20},
	#maxchunks=4,
	)
	
	outname = data_sample + '.futures'
	#outname = 'DY_test.futures'
	save(result,outname)
	
	elapsed_time = time.time() - start
	print("Time: ",elapsed_time)
Example #10
            output['era_2'] += processor.column_accumulator(
                era[totcut & (channel_ == 2)])

        return output

    def postprocess(self, accumulator):
        return accumulator


if __name__ == "__main__":
    import pandas as pd

    out_ = processor.run_uproot_job(
        dataDS,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LeptonjetEventDrawer(data_type='data'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )

    df_4mu = pd.DataFrame(
        [
            out_['run_2'].value,
            out_['lumi_2'].value,
            out_['event_2'].value,
            out_['era_2'].value,
        ],
        index=['run', 'lumi', 'event', 'era'],
        dtype='Int64',
    ).transpose()
    df_4mu.sort_values(by=['run', 'lumi', 'event'], inplace=True)
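    # df_4mu holds one row per selected event in the 4mu channel (the *_2
    # accumulators are filled for channel == 2 above), sorted by run/lumi/event.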
Example #11
                                     "dask_container_port": "8787",
                                     "should_transfer_files": "YES",
                                     "when_to_transfer_output": "ON_EXIT",
                                     "+DaskSchedulerAddress": '"129.93.183.33:8787"',
                                    })


cluster.scale(jobs=2)

client = Client(cluster)#, security=sec_dask)

#cachestrategy = 'dask-worker'
exe_args = {
        'client': client,
        #'cachestrategy': cachestrategy,
        #'savemetrics': True,
        #'worker_affinity': True if cachestrategy is not None else False,
    }
output = processor.run_uproot_job(fileset,
                                treename = 'Events',
                                processor_instance = METProcessor(),
                                executor = processor.dask_executor,
                                executor_args = exe_args
                                )

# Generate a 1D histogram from the 'MET' key of the output, overlaying the datasets. fill_opts is optional; it shades the histogram (the default is an unfilled line).
hist.plot1d(output['MET'], overlay='dataset', fill_opts={'edgecolor': (0,0,0,0.3), 'alpha': 0.8})

# Print every entry of the cutflow dict; for a single entry, use print(output['cutflow']["KEY_NAME"]).
for key, value in output['cutflow'].items():
    print(key, value)
Example #12
    except yaml.YAMLError as exc:
        print(exc)

checkfile = open('sync_outputs/desy.csv')
checklist = pd.read_csv(checkfile)
print(checklist.shape)

t0 = time()
out = processor.run_uproot_job(
    fileset,
    treename='Events',
    processor_instance=SignalProcessor(sync=True,
                                       categories=['eemt'],
                                       checklist=checklist),
    executor=processor.futures_executor,
    executor_args={
        'workers': 20,
        'flatten': True,
        "nano": True
    },
    #chunksize=1000,
    #maxchunks=50,
)

print(out['cutflow_sync'].items())
print(out['cutflow'].items())

lumi = np.array(out['lumi'].value, dtype=int)
run = np.array(out['run'].value, dtype=int)
evt = np.array(out['evt'].value, dtype=int)
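# A possible follow-up (not shown in this fragment): collect the identifiers into a
# DataFrame for comparison against the DESY checklist loaded above, e.g.
#   sync_df = pd.DataFrame({'run': run, 'lumi': lumi, 'evt': evt})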
Example #13
        return accumulator


if __name__ == "__main__":
    import os
    from os.path import join, isdir, splitext
    from FireHydrant.Analysis.PlottingOptions import *

    reldir = splitext(__file__)[0].replace('_', '/')
    outdir = join(os.getenv('FH_BASE'), "Imgs", reldir)
    if not isdir(outdir): os.makedirs(outdir)

    out_sig2mu2e = processor.run_uproot_job(
        sigDS_2mu2e,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=GenJetProcessor(data_type='sig-2mu2e'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )
    out_sig4mu = processor.run_uproot_job(
        sigDS_4mu,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=GenJetProcessor(data_type='sig-4mu'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )

    import re
    longdecay = re.compile('^.*_lxy-300$')
    sampleSig = re.compile(
Example #14

if __name__ == "__main__":
    import os
    from os.path import join, isdir
    from FireHydrant.Analysis.PlottingOptions import *

    outdir = join(os.getenv('FH_BASE'), "Imgs", __file__.split('.')[0])
    if not isdir(outdir): os.makedirs(outdir)

    outputs = {}
    outputs['bkg'] = processor.run_uproot_job(
        bkgDS,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LeptonjetLeadSubleadProcessor(region='SR',
                                                         data_type='bkg'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=True),
        chunksize=500000,
    )

    outputs['sig-2mu2e'] = processor.run_uproot_job(
        filterSigDS(sigDS_2mu2e),
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LeptonjetLeadSubleadProcessor(
            region='SR', data_type='sig-2mu2e'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=True),
        chunksize=500000,
    )
Example #15
print('[{}] starting submission'.format(datetime.now().strftime("%H:%M:%S")))
final_accumulator, metrics = run_uproot_job(
    dataset,
    'otree',
    load('boostedHbbProcessor.coffea'),
    funcx_executor,
    executor_args={
        # 'local_path': '/hadoop/store/user/awoodard/data',
        # 'stageout_url': 'root://deepthought.crc.nd.edu://store/user/awoodard/data',
        # 'local_path': '/scratch365/awoodard/funcx',
        # 'stageout_url': 'file:///scratch365/awoodard/funcx',
        'local_path': '/scratch/midway2/annawoodard/funcx/results',
        'stageout_url': 'file:///scratch/midway2/annawoodard/funcx/results',
        'endpoints': [midway_uuid],
        'skipbadfiles': True,
        'savemetrics': True,
        'xrootdtimeout': 20,
        # 'poll_period': 5,
        'poll_period': 30,
        # 'tailtimeout': 500,
        # 'tailretry': 90,
        # 'batch_size': 500,
        'funcx_service_address': 'https://dev.funcx.org/api/v1'
        # 'funcx_service_address': 'https://funcx.org/api/v1'
    },
    pre_executor=futures_executor,
    chunksize=args.chunksize,
    metadata_cache=metadata_cache
)
returned = time.time()
print(metrics)
Example #16
File: run_dask.py  Project: smdogra/decaf
fileslice = slice(None)
with open("metadata/" + options.year + ".json") as fin:
    samplefiles = json.load(fin)

filelist = {}
for dataset, info in samplefiles.items():
    if options.dataset and options.dataset not in dataset: continue
    files = []
    for file in info['files'][fileslice]:
        files.append(file)
    filelist[dataset] = files

from distributed import Client
client = Client('coffea-dask.fnal.gov:8786')
tstart = time.time()
output = processor.run_uproot_job(
    filelist,
    treename='Events',
    processor_instance=processor_instance,
    executor=processor.dask_executor,
    executor_args={'client': client},
    chunksize=50000,
)

# Pickle is not very fast or memory efficient, will be replaced by something better soon
#    with lz4f.open("pods/"+options.year+"/"+dataset+".pkl.gz", mode="xb", compression_level=5) as fout:
os.system("mkdir -p hists/" + options.analysis + year)
save(output, 'hists/' + options.analysis + year + '/' + dataset + '.dask')
dt = time.time() - tstart
print(dt)
Example #17
         chunk_size = 15000
     if "QCD_Pt_2400to3200" in samples2process[0]:
         chunk_size = 1500
     if "QCD_Pt_3200toInf" in samples2process[0]:
         chunk_size = 500
     print(
         "Processing QCD samples from which we only need a small amount of events!"
     )
     print("Events to be processed: ", chunk_size)
     output = processor.run_uproot_job(
         subsample_files,
         treename='Events',
         processor_instance=TrijetHistogramMaker(isMC=isMC),
         executor=processor.futures_executor,
         chunksize=chunk_size,
         maxchunks=1,
         executor_args={
             'workers': args.workers,
             'flatten': False,
             'status': not args.condor,
             "schema": HackSchema
         })
     util.save(output,
               f"DataHistograms_{save_tag}_{samples2process[0]}.coffea")
 else:
     output = processor.run_uproot_job(
         subsample_files,
         treename='Events',
         processor_instance=TrijetHistogramMaker(isMC=isMC),
         executor=processor.futures_executor,
         chunksize=250000,
Example #18
if __name__ == '__main__':
    tmFileset = {
        'CMSSW CUETPM81': [
            '/Users/chrispap/QCD/new/Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root',
            '/Users/chrispap/QCD/new/Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root',
            '/Users/chrispap/QCD/new/Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root',
        ],
    }

    tmOut = processor.run_uproot_job(
        tmFileset,
        treename="TreeMaker2/PreSelection",
        processor_instance=TreeMakerProcessor(),
        executor=processor.futures_executor,
        executor_args={
            "schema": TreeMakerSchema,
            "workers": 4
        },
        chunksize=100000
    )

    numerator = tmOut["nTracksHist"].integrate('dataset', 'CMSSW RECO')
    denominator = tmOut["nTracksHist"].integrate('dataset', 'CMSSW GEN')
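    # 'integrate' picks out a single dataset bin; note that 'CMSSW RECO' and
    # 'CMSSW GEN' are not part of the tmFileset fragment shown above, so the full
    # script presumably runs over additional samples before taking this ratio.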

    # make a nice ratio plot, adjusting some font sizes
    plt.rcParams.update({
        'font.size': 14,
        'axes.titlesize': 18,
        'axes.labelsize': 18,
        'xtick.labelsize': 12,
Example #19
    ## SR
    CUTNAMES = dict(
        enumerate([
            'lj0,1 sumq0',
            'dphi>pi/2',
            'EGM0pt>60',
            'Njets<4',
            'NtightB==0',
        ]))

    out_bkg = processor.run_uproot_job(
        bkgDS,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=CutflowProcessor(data_type='bkg',
                                            region='SR',
                                            enforceNeutral=True),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )
    # --- CHANNEL - 2mu2e
    outputs = OrderedDict()
    h_ = out_bkg['count'].integrate('channel', slice(1, 2))
    outputs.update({k[0]: v for k, v in h_.values().items()})
    df_ = pd.DataFrame(outputs)
    for k, n in CUTNAMES.items():
        df_.rename(index={k: n}, inplace=True)
    YieldsDf['2mu2e-SR-OS'] = df_

    # --- CHANNEL - 4mu
    outputs = OrderedDict()
Example #20
    }
    workers = 8

if overwrite:

    # create .h5 file
    df = pd.DataFrame(df_out)
    df.to_hdf('data/data_X.h5', key='df', format='table', mode='w')

    output = processor.run_uproot_job(
        fileset,
        treename='Events',
        processor_instance=WHhadProcessor(),
        executor=processor.futures_executor,
        executor_args={
            'workers': workers,
            'function_args': {
                'flatten': False
            }
        },
        chunksize=500000,
    )
    df_out = pd.DataFrame({
        'met':
        output['met'].value.flatten(),
        'ht':
        output['ht'].value.flatten(),
        'lead_jet_pt':
        output['lead_jet_pt'].value.flatten(),
        'sublead_jet_pt':
        output['sublead_jet_pt'].value.flatten(),
Example #21
    if not overwrite:
        cache.load()

    if cfg == cache.get('cfg') and histograms == cache.get(
            'histograms') and cache.get('simple_output'):
        output = cache.get('simple_output')

    else:
        print("I'm running now")

        output = processor.run_uproot_job(
            fileset,
            "Events",
            charge_flip_ss(year=year,
                           variations=[],
                           accumulator=desired_output),
            exe,
            exe_args,
            chunksize=500000,
        )

        cache['fileset'] = fileset
        cache['cfg'] = cfg
        cache['histograms'] = histograms
        cache['simple_output'] = output
        cache.dump()

    import matplotlib.pyplot as plt
    import mplhep as hep
    plt.style.use(hep.style.CMS)
Example #22
                 'debug-log': 'debug.log',
                 'transactions-log': 'tr.log',
                 'stats-log': 'stats.log',
                 'verbose': False,
                 'port': [9123,9130],
                 'environment-file': topeftenv.get_environment(),
                 'master-name': '{}-workqueue-coffea'.format(os.environ['USER']),
                 'print-stdout': True,
                 'skipbadfiles': False,
                 'schema': NanoAODSchema,
                 'extra-input-files': ["topeft.py"]
}

# Run the processor and get the output
tstart = time.time()
output = processor.run_uproot_job(
    flist,
    treename=treename,
    processor_instance=processor_instance,
    executor=processor.work_queue_executor,
    executor_args=executor_args,
    chunksize=chunksize,
    maxchunks=nchunks,
)
dt = time.time() - tstart

print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))

nbins = sum(sum(arr.size for arr in h._sumw.values()) for h in output.values() if isinstance(h, hist.Hist))
nfilled = sum(sum(np.sum(arr > 0) for arr in h._sumw.values()) for h in output.values() if isinstance(h, hist.Hist))
print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))

# This is taken from the DM photon analysis...                                                                                                                                                             
# Pickle is not very fast or memory efficient, will be replaced by something better soon                                                                                                                   
#    with lz4f.open("pods/"+options.year+"/"+dataset+".pkl.gz", mode="xb", compression_level=5) as fout:                                                                                                   
if not outpath.endswith('/'): outpath += '/'
if not os.path.isdir(outpath): os.system("mkdir -p %s"%outpath)
print('Saving output in %s...'%(outpath + outname + ".pkl.gz"))
with gzip.open(outpath + outname + ".pkl.gz", "wb") as fout:
Example #23
def main():

    overwrite = True

    # load the config and the cache
    cfg = loadConfig()

    # Inputs are defined in a dictionary
    # dataset : list of files
    fileset = {
        'tW_scattering': glob.glob("/hadoop/cms/store/user/dspitzba/nanoAOD/ttw_samples/0p1p2/tW_scattering__nanoAOD/merged/*.root"),
        "TTW":           glob.glob("/hadoop/cms/store/user/dspitzba/nanoAOD/ttw_samples/0p1p2/TTWJetsToLNu_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8__RunIIAutumn18NanoAODv6-Nano25Oct2019_102X_upgrade2018_realistic_v20_ext1-v1/merged/*.root") \
                        + glob.glob("/hadoop/cms/store/user/dspitzba/nanoAOD/ttw_samples/0p1p2/TTWJetsToQQ_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8__RunIIAutumn18NanoAODv6-Nano25Oct2019_102X_upgrade2018_realistic_v20-v1/merged/*.root"),
        #"ttbar":        glob.glob("/hadoop/cms/store/user/dspitzba/nanoAOD/ttw_samples/0p1p2/TTJets_SingleLeptFromT_TuneCP5_13TeV-madgraphMLM-pythia8__RunIIAutumn18NanoAODv6-Nano25Oct2019_102X_upgrade2018_realistic_v20-v1/merged/*.root") # adding this is still surprisingly fast (20GB file!)
    }

    # histograms
    histograms = ["MET_pt", "Jet_pt", "Jet_eta", "Jet_pt_fwd", "W_pt_notFromTop", "GenJet_pt_fwd", "Spectator_pt", "Spectator_eta"]
    histograms+= ["Top_pt", "Top_eta", "Antitop_pt", "Antitop_eta", "W_pt", "W_eta", "N_b", "N_jet"]


    # initialize cache
    cache = dir_archive(os.path.join(os.path.expandvars(cfg['caches']['base']), cfg['caches']['simpleProcessor']), serialized=True)
    if not overwrite:
        cache.load()

    if cfg == cache.get('cfg') and histograms == cache.get('histograms') and fileset == cache.get('fileset') and cache.get('simple_output'):
        output = cache.get('simple_output')

    else:
        # Run the processor
        output = processor.run_uproot_job(fileset,
                                      treename='Events',
                                      processor_instance=exampleProcessor(),
                                      executor=processor.futures_executor,
                                      executor_args={'workers': 12, 'function_args': {'flatten': False}},
                                      chunksize=500000,
                                     )
        cache['fileset']        = fileset
        cache['cfg']            = cfg
        cache['histograms']     = histograms
        cache['simple_output']  = output
        cache.dump()

    # Make a few plots
    outdir = "./tmp_plots"
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for name in histograms:
        print (name)
        histogram = output[name]
        if name == 'MET_pt':
            # rebin
            new_met_bins = hist.Bin('pt', r'$E_T^{miss} \ (GeV)$', 20, 0, 200)
            histogram = histogram.rebin('pt', new_met_bins)
        if name == 'W_pt_notFromTop':
            # rebin
            new_pt_bins = hist.Bin('pt', r'$p_{T}(W) \ (GeV)$', 25, 0, 500)
            histogram = histogram.rebin('pt', new_pt_bins)

        ax = hist.plot1d(histogram, overlay="dataset", density=False, stack=True)  # stacked absolute yields per dataset
        ax.set_yscale('linear') # can be log
        #ax.set_ylim(0,0.1)
        ax.figure.savefig(os.path.join(outdir, "{}.pdf".format(name)))
        ax.clear()

        ax = hist.plot1d(histogram,overlay="dataset", density=True, stack=False) # make density plots because we don't care about x-sec differences
        ax.set_yscale('linear') # can be log
        #ax.set_ylim(0,0.1)
        ax.figure.savefig(os.path.join(outdir, "{}_shape.pdf".format(name)))
        ax.clear()

    return output
Example #24
def main():
    # start run time clock
    tstart = time.time()

    # get options from command line
    parser = OptionParser()
    parser.add_option('-d', '--dataset',   help='dataset',           dest='dataset')
    parser.add_option('-N', '--nFiles',    help='nFiles',            dest='nFiles',    type=int, default=-1)
    parser.add_option('-M', '--startFile', help='startFile',         dest='startFile', type=int, default=0)
    parser.add_option(      '--condor',    help='running on condor', dest='condor',              default=False, action='store_true')
    parser.add_option(      '--dask',      help='run w/ dask', dest='dask',              default=False, action='store_true')
    parser.add_option(      '--port',      help='port for dask status dashboard (localhost:port)', dest='port', type=int, default=8787)
    parser.add_option(      '--mincores',  help='dask waits for min # cores', dest='mincores', type=int, default=4)
    parser.add_option(      '--quiet',     help='suppress status printouts', dest='quiet',              default=False, action='store_true')
    parser.add_option('-w', '--workers',   help='Number of workers to use for multi-worker executors (e.g. futures or condor)', dest='workers', type=int, default=8)
    parser.add_option('-s', '--chunksize', help='Chunk size',        dest='chunksize', type=int, default=10000)
    parser.add_option('-m', '--maxchunks', help='Max number of chunks (for testing)',        dest='maxchunks', type=int, default=None)
    options, args = parser.parse_args()

    # set output root file
    sample = options.dataset
    # getting dictionary of files from a sample collection e.g. "2016_QCD, 2016_WJets, 2016_TTJets, 2016_ZJets"
    fileset = s.getFileset(sample, True, options.startFile, options.nFiles)
    outfile = "MyAnalysis_%s_%d" % (sample, options.startFile) if options.condor or options.dask else "test"

    # get processor args
    exe_args = {'workers': options.workers, 'flatten': False}
    if options.dask:
        exe_args = use_dask(options.condor,options.workers,options.port)
        if options.quiet: exe_args['status'] = False

        client = exe_args['client']
        while len(client.ncores()) < options.mincores:
            print('Waiting for more cores to spin up, currently there are {0} available...'.format(len(client.ncores())))
            print('Dask client info ->', client)
            time.sleep(10)

    sf = s.sfGetter(sample)
    print("scaleFactor = {}".format(sf))

    # run processor
    output = processor.run_uproot_job(
        fileset,
        treename='TreeMaker2/PreSelection',
        processor_instance=MainProcessor(sample,sf),
        executor=processor.dask_executor if options.dask else processor.futures_executor,
        executor_args=exe_args,
        chunksize=options.chunksize,
        maxchunks=options.maxchunks,
    )

    # export the histograms to root files
    ## the loop makes sure we are only saving the histograms that are filled
    values_dict = {}
    branchdict = {}
    for v in output.keys():
        if len(output[v].value) > 0:
            branchdict[v] = uproot.newbranch("f4")
            values_dict[v] = output[v].value
    tree = uproot.newtree(branchdict)
    if values_dict != {}:
        print("saving root files...")
        with uproot.recreate("{}.root".format(outfile)) as f:
            f["tree"] = tree
            f["tree"].extend(values_dict)
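    # Note: uproot.newbranch/newtree is the uproot3 writing API; with uproot4+ one
    # would instead assign a dict of arrays directly, e.g. f["tree"] = values_dict.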
    # print run time in seconds
    dt = time.time() - tstart
    print("run time: %.2f [sec]" % (dt))
Example #25
    from os.path import join, isdir, splitext

    reldir = splitext(__file__)[0].replace('_', '/')
    outdir = join(os.getenv('FH_BASE'), "Imgs", reldir)
    if not isdir(outdir): os.makedirs(outdir)

    import re
    longdecay = re.compile('^.*_lxy-300$')

    # ----------------------------------------------------------
    ## mu cand efficiency, resolution

    output = processor.run_uproot_job(
        sigDS_2mu2e,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=MuEffiResoProcessor(),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    hist.plotratio(num=output['lxy'][longdecay].sum('dataset').integrate(
        'reco', 'true'),
                   denom=output['lxy'][longdecay].sum('dataset').integrate(
                       'reco', 'inclusive'),
                   overflow='over',
                   error_opts={
                       'marker': 'o',
                   },
                   ax=ax,
                   label='PFMu+DSAMu')
Example #26
    #files = {'Charmonium2018AOD': filesets['Charmonium2018AOD'][:]}
    files = {'MonteCarlo2017AOD': filesets['MonteCarlo2017AOD'][1:2]}

    # creating necessary folders into dir output data
    os.system("mkdir -p output/" + args.name)
    os.system("rm -rf output/" + args.name + "/*")

    # If the process is for data or mc
    if (args.data): analysis_type = 'data'    
    if (args.mc): analysis_type = 'mc'      

    if config_yaml['executor'] == 'futures_executor': 
        output = processor.run_uproot_job(files,
                                        treename='Events',
                                        processor_instance=EventSelectorProcessor(args.name, analysis_type),
                                        executor=processor.futures_executor, # Uses python futures to multiprocessing
                                        executor_args={"schema": BaseSchema, 'workers': config_yaml['n_cores']}, # BaseSchema returns a base.nano-events object
                                        chunksize=config_yaml['chunksize'],
                                        )

    elif config_yaml['executor'] == 'iterative_executor':
        output = processor.run_uproot_job(files,
                                        treename='Events',
                                        processor_instance=EventSelectorProcessor(args.name, analysis_type),
                                        executor=processor.iterative_executor,
                                        executor_args={'schema': BaseSchema},
                                        chunksize=config_yaml['chunksize'],
                                        )

    elapsed = round(time.time() - tstart, 2)
    print(f"Process finished in: {elapsed} s")
Example #27
for dataset, info in samplefiles.items():
    filelist = {}
    if options.dataset and options.dataset not in dataset: continue
    print('Processing:', dataset)
    files = []
    for file in info['files'][fileslice]:
        files.append(file)
    filelist[dataset] = files

    tstart = time.time()
    output = processor.run_uproot_job(
        filelist,
        treename='Events',
        processor_instance=processor_instance,
        executor=processor.futures_executor,
        executor_args={
            'nano': True,
            'workers': options.workers
        },
    )

    #nbins = sum(sum(arr.size for arr in h._sumw.values()) for h in output.values() if isinstance(h, hist.Hist))
    #nfilled = sum(sum(np.sum(arr > 0) for arr in h._sumw.values()) for h in output.values() if isinstance(h, hist.Hist))
    #print("Filled %.1fM bins" % (nbins/1e6, ))
    #print("Nonzero bins: %.1f%%" % (100*nfilled/nbins, ))

    os.system("mkdir -p hists/" + options.processor)
    save(output, 'hists/' + options.processor + '/' + dataset + '.futures')
    dt = time.time() - tstart
    nworkers = options.workers
    print("%.2f us*cpu overall" % (1e6 * dt * nworkers, ))
Example #28

if __name__ == "__main__":
    import os
    import re
    from os.path import join, isdir, splitext
    from FireHydrant.Analysis.PlottingOptions import *

    reldir = splitext(__file__)[0].replace('_', '/')
    outdir = join(os.getenv('FH_BASE'), "Imgs", reldir)
    if not isdir(outdir): os.makedirs(outdir)

    output_2mu2e = processor.run_uproot_job(
        sigDS_2mu2e,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LjTkIsoProcessor(data_type='sig-2mu2e'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )

    output_4mu = processor.run_uproot_job(
        sigDS_4mu,
        treename='ffNtuplizer/ffNtuple',
        processor_instance=LjTkIsoProcessor(data_type='sig-4mu'),
        executor=processor.futures_executor,
        executor_args=dict(workers=12, flatten=False),
        chunksize=500000,
    )

    output_bkg = processor.run_uproot_job(
        bkgDS,
Example #29
    if not overwrite:
        cache.load()

    if cfg == cache.get('cfg') and histograms == cache.get(
            'histograms') and cache.get('simple_output'):
        output = cache.get('simple_output')

    else:
        print("I'm running now")

        output = processor.run_uproot_job(
            fileset,
            "Events",
            trilep_analysis(year=year,
                            variations=variations,
                            accumulator=desired_output),
            exe,
            exe_args,
            chunksize=250000,
        )

        cache['fileset'] = fileset
        cache['cfg'] = cfg
        cache['histograms'] = histograms
        cache['simple_output'] = output
        cache.dump()

    lines = ['entry']
    lines += [
        'filter',
        'lepveto',
Example #30
                           pt=zp.p4.pt.flatten())#,
                           #weight=df['genWeight'])
        print(output['hmass'].values())
        return output

    def postprocess(self, accumulator):
        return accumulator

samples = {"default":args.files}
#with open('files_prev.json') as fin:
#    samples = json.load(fin)

output = processor.run_uproot_job(samples,
                                  treename='Events',
                                  processor_instance=GenVisualizer(),
                                  executor=processor.futures_executor,
                                  executor_args={'workers': 4},
                                  chunksize=500000,
                                 )
save(output, 'genstuff.coffea')

output = load("genstuff.coffea")

#hmass = output["hmass"]
#bin_contents = hmass.values()[('ZPrimeToQQ_DMsimp_HT400_M50',)]
#edges = hmass.axis('mass').edges()
#edge_pairs = [(edges[i], edges[i+1]) for i in range(len(edges)-1)]
#histd = zip(edge_pairs, bin_contents)
#for thing in histd:
#    print(thing)