Example #1
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid  # newer pyarrow moved guid here; old releases had pyarrow.compat.guid

    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs, _spark_stop)
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.executor.x509proxyname', 'x509_u12409') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config,
                              log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename': 'Events'
        },
        'Data': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')],
            'treename': 'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 20
    assert hists['cutflow']['ZJets_pt'] == 4
    assert hists['cutflow']['ZJets_mass'] == 1
    assert hists['cutflow']['Data_pt'] == 15
    assert hists['cutflow']['Data_mass'] == 5
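
These excerpts are individual functions lifted from a larger test module, so names like pytest resolve at module scope rather than inside each function. A minimal sketch of the header and invocation they assume (the tests/test_spark.py path is an assumption, not taken from the excerpts):

import pytest

# Collect and run only the Spark tests, e.g. from the repository root:
#   python -m pytest -k spark tests/test_spark.py
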
Example #2
def check_spark_functionality():
    # jinja2 supplies the templating pieces used below; the Spark helpers come
    # from coffea. DummyProcessor is assumed to be defined elsewhere in the
    # enclosing test module.
    from jinja2 import Environment, PackageLoader, select_autoescape
    from coffea.processor.spark.detail import _spark_initialize, _spark_stop

    spark = _spark_initialize()

    env = Environment(loader=PackageLoader('coffea.processor',
                                           'templates'),
                      autoescape=select_autoescape(['py'])
                      )

    template_name = 'spark.py.tmpl'
    tmpl = env.get_template(template_name)

    global processor_instance, lz4_clevel, coffea_udf
    processor_instance = DummyProcessor()
    lz4_clevel = 1

    cols = ['dataset']
    output = tmpl.render(cols=cols)
    # Execute the rendered template source; it defines the coffea_udf declared
    # global above.
    exec(output)

    dataset = [{'dataset': 'WJets'}, {'dataset': 'WJets'}, {'dataset': 'WJets'}]
    df = spark.createDataFrame(dataset, schema='dataset: string')
    pd_one = df.toPandas()

    df = df.withColumn('histos', coffea_udf(*cols))
    pd_two = df.toPandas()

    _spark_stop(spark)

    return pd_one['dataset'].count(), pd_two['dataset'].count(), pd_two['histos']
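
check_spark_functionality is a helper rather than a pytest test; a minimal sketch of a wrapper that exercises it (the importorskip guards are an assumption; the expected count of 3 follows from the three 'WJets' rows built inside the helper):

def test_spark_functionality():
    import pytest
    pytest.importorskip("pyspark", minversion="2.4.1")
    pytest.importorskip("jinja2")

    n_before, n_after, histos = check_spark_functionality()
    # Applying the UDF must neither drop nor duplicate rows: both DataFrames
    # should still hold the three 'WJets' records.
    assert n_before == n_after == 3
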
Example #3
def test_spark_imports():
    pytest.importorskip("pyspark", minversion="2.4.1")

    from coffea.processor.spark.spark_executor import spark_executor
    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs, _spark_stop)

    spark = _spark_initialize()
    _spark_stop(spark)
Example #4
def test_spark_imports():
    pytest.importorskip("pyspark", minversion="2.4.1")

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )

    spark = _spark_initialize(bindAddress="127.0.0.1", host="127.0.0.1")
    _spark_stop(spark)
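
Pinning the driver to 127.0.0.1 avoids hostname-resolution failures in CI containers whose hostname does not resolve. A sketch of the same setup with the plain pyspark builder, using the spark.driver.* keys that Example #5 passes explicitly:

import pyspark.sql

session = (
    pyspark.sql.SparkSession.builder
    .master("local[*]")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)
session.stop()
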
Example #5
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    spark = _spark_initialize(config=spark_config,
                              log_level="ERROR",
                              spark_progress=False)

    filelist = {
        "ZJets": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename": "Events",
        },
        "Data": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")],
            "treename": "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = [
        "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
    ]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={
            "file_type": "root",
            "schema": schemas.NanoAODSchema
        },
    )

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
if __name__ == "__main__":
    # Standalone benchmark entry point; it relies on module-level imports of
    # time, os, pyspark.sql, guid, _spark_initialize, spark_executor, and
    # SamplesInfo (the last from an analysis-specific package, not coffea).
    tick = time.time()

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[1]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.executor.memory", "7g")
        .config("spark.executor.cores", "1")
        .config("spark.driver.memory", "16g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 100000)
        .config("spark.cores.max", "1")
    )

    spark = _spark_initialize(
        config=spark_config,
        log_level="ERROR",
        spark_progress=False,
        laurelin_version="1.0.0",
    )
    print("Spark initialized")

    file_name = "vbf_powheg_dipole_NANOV10_2018.root"
    file_path = f"{os.getcwd()}/tests/samples/{file_name}"
    dataset = {"test": file_path}

    samp_info = SamplesInfo(xrootd=False)
    samp_info.paths = dataset
    samp_info.year = "2018"
    samp_info.load("test", use_dask=False)
    samp_info.lumi_weights["test"] = 1.0

    executor = spark_executor