def test_preloaded_nanoevents():
    """Process events built from columns preloaded out of ``nano_dy.root``.

    The listed branches of the ``Events`` tree are read into a dict, wrapped
    in a ``SimplePreloadedColumnSource`` and handed to
    ``NanoEventsFactory.from_preloaded``.  The resulting events are run
    through ``NanoEventsProcessor`` and the two ``ZJets`` cutflow counters
    are checked.  Finally, accessing ``events.Muon.matched_jet`` must raise
    ``AttributeError``.
    """
    columns = [
        'nMuon',
        'Muon_pt',
        'Muon_eta',
        'Muon_phi',
        'Muon_mass',
        'Muon_charge',
        'nJet',
        'Jet_eta',
    ]
    p = NanoEventsProcessor(columns=columns)

    # The context manager guarantees the ROOT file is closed again, even when
    # one of the assertions below fails.
    with uproot.open(os.path.abspath('tests/samples/nano_dy.root')) as rootdir:
        tree = rootdir['Events']
        arrays = tree.arrays(columns, how=dict)
        src = SimplePreloadedColumnSource(
            arrays, rootdir.file.uuid, tree.num_entries, object_path='/Events'
        )
        print(arrays)

        events = NanoEventsFactory.from_preloaded(
            src, metadata={'dataset': 'ZJets'}
        ).events()
        hists = p.process(events)
        print(hists)

        assert hists['cutflow']['ZJets_pt'] == 18
        assert hists['cutflow']['ZJets_mass'] == 6

        # NOTE(review): presumably ``matched_jet`` cannot be resolved because
        # the branch it relies on is not among the preloaded columns - confirm.
        with pytest.raises(AttributeError):
            print(events.Muon.matched_jet)
def test_preloaded_nanoevents():
    """Check ``NanoEventsProcessor`` on events created from preloaded arrays.

    Reads the requested branches from the ``Events`` tree of
    ``tests/samples/nano_dy.root``, wraps them in a
    ``SimplePreloadedColumnSource``, builds events with
    ``NanoEventsFactory.from_preloaded`` and processes them.  The ``ZJets``
    cutflow counters must equal 18 (``pt``) and 6 (``mass``), and accessing
    ``events.Muon.matched_jet`` must raise ``AttributeError``.
    """
    columns = [
        "nMuon",
        "Muon_pt",
        "Muon_eta",
        "Muon_phi",
        "Muon_mass",
        "Muon_charge",
        "nJet",
        "Jet_eta",
    ]
    p = NanoEventsProcessor(columns=columns)

    sample_path = os.path.abspath("tests/samples/nano_dy.root")
    # Use a ``with`` block so the file handle is released on every exit path,
    # including a failing assertion.
    with uproot.open(sample_path) as rootdir:
        tree = rootdir["Events"]
        arrays = tree.arrays(columns, how=dict)
        src = SimplePreloadedColumnSource(
            arrays, rootdir.file.uuid, tree.num_entries, object_path="/Events"
        )
        print(arrays)

        events = NanoEventsFactory.from_preloaded(
            src, metadata={"dataset": "ZJets"}
        ).events()
        hists = p.process(events)
        print(hists)

        assert hists["cutflow"]["ZJets_pt"] == 18
        assert hists["cutflow"]["ZJets_mass"] == 6

        # NOTE(review): the expected AttributeError presumably comes from a
        # column that was not preloaded - verify against the schema.
        with pytest.raises(AttributeError):
            print(events.Muon.matched_jet)
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run ``NanoEventsProcessor`` through the dask executor and verify cutflow.

    Args:
        client: Object passed to ``register_columncache`` and forwarded to the
            executor as its ``"client"`` argument.
        filelist: File set forwarded unchanged to ``processor.run_uproot_job``.
        cachestrategy: Forwarded as the ``"cachestrategy"`` executor argument;
            worker affinity is switched on whenever this is not ``None``.

    Returns:
        The ``"worker"`` entry of the accumulated output.
    """
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    use_affinity = cachestrategy is not None
    exe_args = {
        "client": client,
        "schema": NanoAODSchema,
        "cachestrategy": cachestrategy,
        "savemetrics": True,
        "worker_affinity": use_affinity,
    }

    # NOTE(review): these look like column-cache keys the processor checks
    # for - confirm against NanoEventsProcessor's handling of ``canaries``.
    canary_keys = [
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
    ]
    proc = NanoEventsProcessor(canaries=canary_keys)

    # ``savemetrics`` makes the job return an (output, metrics) pair; the
    # metrics are not inspected here.
    hists, _metrics = processor.run_uproot_job(
        filelist,
        "Events",
        processor_instance=proc,
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    expected_cutflow = {
        "ZJets_pt": 18,
        "ZJets_mass": 6,
        "Data_pt": 84,
        "Data_mass": 66,
    }
    for key, count in expected_cutflow.items():
        assert hists["cutflow"][key] == count

    return hists["worker"]
def do_dask_cached(client, filelist, cachestrategy=None):
    """Process ``filelist`` with the dask executor and check the cutflow.

    Args:
        client: Handed to ``register_columncache`` and passed on as the
            ``'client'`` executor argument.
        filelist: Forwarded as-is to ``processor.run_uproot_job``.
        cachestrategy: Passed on as the ``'cachestrategy'`` executor argument.
            ``'worker_affinity'`` is ``True`` exactly when this is not
            ``None``.

    Returns:
        ``hists['worker']`` from the job output.
    """
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    exe_args = {
        'client': client,
        'schema': NanoAODSchema,
        'cachestrategy': cachestrategy,
        'savemetrics': True,
        'worker_affinity': cachestrategy is not None,
    }

    # NOTE(review): presumably the cache keys the processor expects to find;
    # verify against NanoEventsProcessor.
    canaries = [
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content',
    ]

    # The second element of the result (metrics) is not used by this helper.
    result = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=NanoEventsProcessor(canaries=canaries),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )
    hists, _metrics = result

    cutflow = hists['cutflow']
    assert cutflow['ZJets_pt'] == 18
    assert cutflow['ZJets_mass'] == 6
    assert cutflow['Data_pt'] == 84
    assert cutflow['Data_mass'] == 66
    return hists['worker']
def test_nanoevents_analysis(executor, compression, maxchunks):
    """Run ``NanoEventsProcessor`` over three datasets and check the cutflow.

    One dataset points at a non-existent file; the job is run with
    ``"skipbadfiles": True``.  ``executor``, ``compression`` and ``maxchunks``
    are forwarded to ``processor.run_uproot_job``.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    sample_files = {
        "DummyBad": "tests/samples/non_existent.root",
        "ZJets": "tests/samples/nano_dy.root",
        "Data": "tests/samples/nano_dimuon.root",
    }
    # One single-file list of absolute paths per dataset.
    filelist = {
        dataset: [osp.abspath(path)] for dataset, path in sample_files.items()
    }
    treename = "Events"

    exe_args = dict(
        workers=1,
        skipbadfiles=True,
        schema=processor.NanoAODSchema,
        compression=compression,
    )

    hists = processor.run_uproot_job(
        filelist,
        treename,
        NanoEventsProcessor(),
        executor,
        executor_args=exe_args,
        maxchunks=maxchunks,
    )

    expected_cutflow = {
        "ZJets_pt": 18,
        "ZJets_mass": 6,
        "Data_pt": 84,
        "Data_mass": 66,
    }
    for key, count in expected_cutflow.items():
        assert hists["cutflow"][key] == count
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run the nano test processor on dask with the column cache registered.

    Args:
        client: Passed to ``register_columncache`` and forwarded as the
            ``'client'`` executor argument.
        filelist: Forwarded unchanged to ``processor.run_uproot_job``.
        cachestrategy: Forwarded as ``'cachestrategy'``; ``'worker_affinity'``
            is enabled only when this is not ``None``.

    Returns:
        ``hists['worker']`` from the job output.
    """
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    affinity = cachestrategy is not None
    exe_args = dict(
        client=client,
        nano=True,
        cachestrategy=cachestrategy,
        savemetrics=True,
        worker_affinity=affinity,
    )

    # NOTE(review): presumably a column-cache key the processor looks up -
    # confirm against NanoEventsProcessor.
    canaries = ['0001a210a3f8364811eaa29ff5b55c90beef;Events;0;40;Muon_pt']

    # The metrics half of the returned pair is not examined.
    hists, _metrics = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=NanoEventsProcessor(canaries=canaries),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    expected_cutflow = {
        'ZJets_pt': 18,
        'ZJets_mass': 6,
        'Data_pt': 84,
        'Data_mass': 66,
    }
    for key, count in expected_cutflow.items():
        assert hists['cutflow'][key] == count

    return hists['worker']
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run ``NanoEventsProcessor`` via ``processor.Runner`` on a dask executor.

    Args:
        client: Passed to ``register_columncache`` and to
            ``processor.DaskExecutor``.
        filelist: Forwarded unchanged to the runner call.
        cachestrategy: Forwarded to ``processor.Runner``; the executor's
            ``worker_affinity`` is ``True`` exactly when this is not ``None``.

    Returns:
        The ``"worker"`` entry of the accumulated output.
    """
    from coffea.nanoevents import schemas
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    run = processor.Runner(
        executor=processor.DaskExecutor(
            client=client,
            worker_affinity=cachestrategy is not None,
        ),
        schema=schemas.NanoAODSchema,
        cachestrategy=cachestrategy,
        savemetrics=True,
    )

    # NOTE(review): these look like column-cache keys the processor checks
    # for - confirm against NanoEventsProcessor's handling of ``canaries``.
    canary_keys = [
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
    ]

    # The runner result is unpacked as (output, metrics); the metrics are
    # not inspected.
    hists, _metrics = run(
        filelist,
        "Events",
        processor_instance=NanoEventsProcessor(canaries=canary_keys),
    )

    cutflow = hists["cutflow"]
    assert cutflow["ZJets_pt"] == 18
    assert cutflow["ZJets_mass"] == 6
    assert cutflow["Data_pt"] == 84
    assert cutflow["Data_mass"] == 66
    return hists["worker"]
def test_loadsave():
    """Round-trip a ``NanoEventsProcessor`` through ``save`` and ``load``.

    The reloaded processor must expose a ``'pt'`` accumulator entry that is
    ``compatible`` with the one of the processor that was saved.  The file
    written to the working directory is removed afterwards.
    """
    filename = 'testprocessor.coffea'
    try:
        saved = NanoEventsProcessor()
        save(saved, filename)

        restored = load(filename)
        assert 'pt' in restored.accumulator

        restored_pt = restored.accumulator['pt']
        assert restored_pt.compatible(saved.accumulator['pt'])
    finally:
        # Clean up the output file whether or not the checks passed.
        if os.path.exists(filename):
            os.remove(filename)
def test_preloaded_nanoevents():
    """Process events built with ``NanoEvents.from_arrays`` from read arrays.

    The listed branches are read from the ``Events`` tree of
    ``tests/samples/nano_dy.root`` and turned into events tagged with the
    ``ZJets`` dataset.  After processing, the two ``ZJets`` cutflow counters
    are checked and ``events.Muon.matched_jet`` must raise ``RuntimeError``.
    """
    columns = [
        'nMuon',
        'Muon_pt',
        'Muon_eta',
        'Muon_phi',
        'Muon_mass',
        'Muon_charge',
        'nJet',
        'Jet_eta',
    ]
    proc = NanoEventsProcessor(columns=columns)

    sample_path = os.path.abspath('tests/samples/nano_dy.root')
    tree = uproot.open(sample_path)['Events']
    arrays = tree.arrays(columns, flatten=True, namedecode='ascii')

    # The data frame is constructed but not used afterwards in this test.
    _df = processor.PreloadedDataFrame(tree.numentries, arrays)
    print(arrays)

    events = NanoEvents.from_arrays(arrays, metadata={'dataset': 'ZJets'})
    hists = proc.process(events)
    print(hists)

    cutflow = hists['cutflow']
    assert cutflow['ZJets_pt'] == 18
    assert cutflow['ZJets_mass'] == 6

    # NOTE(review): presumably fails because a branch needed for the match
    # was not loaded - confirm.
    with pytest.raises(RuntimeError):
        print(events.Muon.matched_jet)
def test_nanoevents_analysis(executor, compression, maxchunks, skipbadfiles):
    """Run ``NanoEventsProcessor`` through ``processor.Runner``.

    The file set has three datasets, one of which points at a non-existent
    file.  With ``skipbadfiles`` truthy the job must finish and yield the
    expected cutflow counters; otherwise running it must raise
    ``FileNotFoundError``.

    Args:
        executor: Callable invoked with ``compression=compression`` to build
            the executor instance.
        compression: Forwarded to the executor factory.
        maxchunks: Forwarded to ``processor.Runner``.
        skipbadfiles: Forwarded to ``processor.Runner`` and used to pick the
            expected outcome.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    bad_file = osp.abspath("tests/samples/non_existent.root")
    zjets_file = osp.abspath("tests/samples/nano_dy.root")
    data_file = osp.abspath("tests/samples/nano_dimuon.root")

    filelist = {
        "DummyBad": {
            "treename": "Events",
            "files": [bad_file],
        },
        "ZJets": {
            "treename": "Events",
            "files": [zjets_file],
            "metadata": {"checkusermeta": True, "someusermeta": "hello"},
        },
        "Data": {
            "treename": "Events",
            "files": [data_file],
            "metadata": {"checkusermeta": True, "someusermeta2": "world"},
        },
    }

    executor_instance = executor(compression=compression)
    run = processor.Runner(
        executor=executor_instance,
        skipbadfiles=skipbadfiles,
        schema=processor.NanoAODSchema,
        maxchunks=maxchunks,
    )

    if not skipbadfiles:
        # Without skipping, the missing file must surface as an error.
        with pytest.raises(FileNotFoundError):
            run(filelist, "Events", processor_instance=NanoEventsProcessor())
        return

    hists = run(filelist, "Events", processor_instance=NanoEventsProcessor())
    expected_cutflow = {
        "ZJets_pt": 18,
        "ZJets_mass": 6,
        "Data_pt": 84,
        "Data_mass": 66,
    }
    for key, count in expected_cutflow.items():
        assert hists["cutflow"][key] == count
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run ``NanoEventsProcessor`` on the dask executor and check the cutflow.

    Args:
        client: Forwarded as the ``'client'`` executor argument.
        filelist: Forwarded unchanged to ``processor.run_uproot_job``.
        cachestrategy: Forwarded as the ``'cachestrategy'`` executor argument.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    exe_args = dict(
        client=client,
        nano=True,
        cachestrategy=cachestrategy,
        savemetrics=True,
    )

    # The job result is unpacked as (output, metrics); the metrics are not
    # inspected.
    hists, _metrics = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=NanoEventsProcessor(),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    expected_cutflow = {
        'ZJets_pt': 18,
        'ZJets_mass': 6,
        'Data_pt': 84,
        'Data_mass': 66,
    }
    for key, count in expected_cutflow.items():
        assert hists['cutflow'][key] == count
executor = processor.DaskExecutor(client=client) run = processor.Runner( executor=executor, use_skyhook=True, format="parquet", schema=schemas.NanoAODSchema, ) hists = run( { "ZJets": "/mnt/cephfs/nanoevents/ZJets", "Data": "/mnt/cephfs/nanoevents/Data", }, "Events", processor_instance=NanoEventsProcessor(), ) assert hists["cutflow"]["ZJets_pt"] == 108 assert hists["cutflow"]["ZJets_mass"] == 36 assert hists["cutflow"]["Data_pt"] == 504 assert hists["cutflow"]["Data_mass"] == 396 # now run again on parquet files in cephfs (without any pushdown) executor_args = {"client": client} run = processor.Runner( executor=executor, format="parquet", schema=schemas.NanoAODSchema, )
def test_spark_executor():
    """Run the test processors through the Spark executor on local samples.

    The test is skipped unless ``pyspark`` >= 2.4.1 can be imported.  It
    starts a local Spark session, runs ``NanoTestProcessor`` twice with the
    same arguments and then ``NanoEventsProcessor`` once with
    ``schemas.NanoAODSchema``.  After every run the sum of
    ``spark_executor.counts`` and the four cutflow counters are checked.

    The session is stopped in a ``finally`` clause so that a failing job or
    assertion does not leave it running.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid
    from coffea.processor.spark.detail import _spark_initialize, _spark_stop
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas
    import os
    import os.path as osp

    import pyspark.sql

    def _check_results(hists, counts):
        # Shared expectations for every run in this test.
        assert sum(counts.values()) == 80
        assert hists['cutflow']['ZJets_pt'] == 18
        assert hists['cutflow']['ZJets_mass'] == 6
        assert hists['cutflow']['Data_pt'] == 84
        assert hists['cutflow']['Data_mass'] == 66

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName('spark-executor-test-%s' % guid())
        .master('local[*]')
        .config('spark.sql.execution.arrow.enabled', 'true')
        .config('spark.executor.x509proxyname', 'x509_u12409')
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
    )

    spark = _spark_initialize(
        config=spark_config, log_level='ERROR', spark_progress=False
    )
    try:
        filelist = {
            'ZJets': {
                'files': [
                    'file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')
                ],
                'treename': 'Events',
            },
            'Data': {
                'files': [
                    'file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')
                ],
                'treename': 'Events',
            },
        }

        from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
        from coffea.processor.spark.spark_executor import spark_executor

        columns = [
            'nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge'
        ]

        proc = NanoTestProcessor(columns=columns)
        # The same job is submitted twice and must give the same results.
        # NOTE(review): presumably the repeat exercises reuse of state from
        # the first run - confirm.
        for _ in range(2):
            hists = run_spark_job(
                filelist,
                processor_instance=proc,
                executor=spark_executor,
                spark=spark,
                thread_workers=1,
                executor_args={'file_type': 'root'},
            )
            _check_results(hists, spark_executor.counts)

        proc = NanoEventsProcessor(columns=columns)
        hists = run_spark_job(
            filelist,
            processor_instance=proc,
            executor=spark_executor,
            spark=spark,
            thread_workers=1,
            executor_args={
                'file_type': 'root',
                'schema': schemas.NanoAODSchema,
            },
        )
    finally:
        # Always release the session, including on failure paths.
        _spark_stop(spark)

    # The last run is checked after the session has been stopped.
    _check_results(hists, spark_executor.counts)
def test_spark_executor():
    """Exercise the Spark executor with both test processors.

    Skipped unless ``pyspark`` >= 2.4.1 is importable.  A local Spark session
    bound to ``127.0.0.1`` is started, ``NanoTestProcessor`` is run twice
    with identical arguments, and ``NanoEventsProcessor`` is run once with
    ``schemas.NanoAODSchema``.  Each run must give a ``spark_executor.counts``
    total of 80 and the expected four cutflow counters.

    The session is stopped in a ``finally`` clause so it is released even if
    a job or an assertion fails part-way through.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid
    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas
    import os
    import os.path as osp

    import pyspark.sql

    def _check_results(hists, counts):
        # Shared expectations for every run in this test.
        assert sum(counts.values()) == 80
        assert hists["cutflow"]["ZJets_pt"] == 18
        assert hists["cutflow"]["ZJets_mass"] == 6
        assert hists["cutflow"]["Data_pt"] == 84
        assert hists["cutflow"]["Data_mass"] == 66

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    spark = _spark_initialize(
        config=spark_config, log_level="ERROR", spark_progress=False
    )
    try:
        filelist = {
            "ZJets": {
                "files": [
                    "file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")
                ],
                "treename": "Events",
            },
            "Data": {
                "files": [
                    "file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")
                ],
                "treename": "Events",
            },
        }

        from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
        from coffea.processor.spark.spark_executor import spark_executor

        columns = [
            "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
        ]

        proc = NanoTestProcessor(columns=columns)
        # The same job is submitted twice and must give the same results.
        # NOTE(review): presumably the repeat exercises reuse of state from
        # the first run - confirm.
        for _ in range(2):
            hists = run_spark_job(
                filelist,
                processor_instance=proc,
                executor=spark_executor,
                spark=spark,
                thread_workers=1,
                executor_args={"file_type": "root"},
            )
            _check_results(hists, spark_executor.counts)

        proc = NanoEventsProcessor(columns=columns)
        hists = run_spark_job(
            filelist,
            processor_instance=proc,
            executor=spark_executor,
            spark=spark,
            thread_workers=1,
            executor_args={
                "file_type": "root",
                "schema": schemas.NanoAODSchema,
            },
        )
    finally:
        # Always release the session, including on failure paths.
        _spark_stop(spark)

    # The last run is checked after the session has been stopped.
    _check_results(hists, spark_executor.counts)