Example no. 1
def test_native_file_TextIOWrapper(tmpdir):
    data = ('foooo\n'
            'barrr\n'
            'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iteration works
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data
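These examples are excerpts from larger test modules, so the shared imports are not repeated in each snippet. A minimal sketch of the preamble they assume (names inferred from the calls in the snippets, so treat the exact layout as an assumption):

import os
from io import TextIOWrapper

import numpy as np
import pytest

import pyarrow as pa
from pyarrow.util import guid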
Example no. 2
def test_native_file_permissions(tmpdir):
    # ARROW-10124: permissions of created files should follow umask
    # Read the current umask by setting a temporary value, then restoring it
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
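The expected bits follow the standard umask rule: the 0o666 mode requested for a new regular file is masked by the process umask. A quick worked example of that arithmetic, assuming a typical umask of 0o022:

cur_umask = 0o022                       # assumed umask, for illustration only
requested = 0o666                       # mode requested for a new regular file
assert requested & ~cur_umask == 0o644  # group/other write bits are cleared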
Example no. 3
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example no. 4
def test_memory_map_close_remove(tmpdir):
    # ARROW-6740: should be able to delete closed memory-mapped file (Windows)
    path = os.path.join(str(tmpdir), guid())
    mmap = pa.create_memory_map(path, 4096)
    mmap.close()
    assert mmap.closed
    os.remove(path)  # Shouldn't fail
Example no. 5
    def test_read_multiple_parquet_files(self):

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        result = self.hdfs.read_parquet(tmpdir)

        _pandas_api.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())
Example no. 6
    def test_read_multiple_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        path = _get_hdfs_uri(tmpdir)
        result = pq.read_table(path)

        _pandas_api.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())
Example no. 7
def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()
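The same mode/capability checks for pa.OSFile could be written table-driven; a minimal sketch with pytest.mark.parametrize (the test name is hypothetical, and the expected values are exactly the ones asserted above):

@pytest.mark.parametrize('mode, expected, readable, writable, seekable', [
    ('r', 'rb', True, False, True),
    ('rb', 'rb', True, False, True),
    ('w', 'wb', False, True, False),
    ('wb', 'wb', False, True, False),
])
def test_os_file_mode_matrix(tmpdir, mode, expected,
                             readable, writable, seekable):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode=mode) as f:
        assert f.mode == expected
        assert f.readable() == readable
        assert f.writable() == writable
        assert f.seekable() == seekable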
Example no. 8
    def test_read_multiple_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        path = _get_hdfs_uri(tmpdir)
        # TODO: for a URI it should not be necessary to pass this argument
        result = pq.read_table(path, use_legacy_dataset=True)

        _pandas_api.assert_frame_equal(
            result.to_pandas().sort_values(by='index').reset_index(drop=True),
            expected.to_pandas())
Example no. 9
def sample_disk_data(request, tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())

    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)

    request.addfinalizer(teardown)
    return path, data
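sample_disk_data is written as a pytest fixture (the @pytest.fixture decorator sits outside this excerpt) and returns the file path together with the bytes written to it. A minimal sketch of a test consuming it (the test name is hypothetical):

def test_sample_disk_data_roundtrip(sample_disk_data):
    path, data = sample_disk_data
    with pa.OSFile(path, mode='rb') as f:
        assert f.read() == data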
Example no. 10
def s3_example_s3fs(s3_connection, s3_server, s3_bucket):
    s3fs = pytest.importorskip('s3fs')

    host, port, access_key, secret_key = s3_connection
    fs = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={'endpoint_url': 'http://{}:{}'.format(host, port)})

    test_path = '{}/{}'.format(s3_bucket, guid())

    fs.mkdir(test_path)
    yield fs, test_path
    try:
        fs.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass
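s3_example_s3fs yields an s3fs filesystem together with a per-test bucket prefix, and removes the prefix afterwards. A minimal sketch of a test consuming the fixture (the test name and object name are hypothetical):

def test_s3fs_roundtrip(s3_example_s3fs):
    fs, test_path = s3_example_s3fs
    target = test_path + '/data.bin'
    with fs.open(target, 'wb') as f:
        f.write(b'some bytes')
    with fs.open(target, 'rb') as f:
        assert f.read() == b'some bytes'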
Example no. 11
def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()
Example no. 12
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        pq.write_table(table, path, filesystem=self.hdfs)

        result = pq.read_table(path, filesystem=self.hdfs).to_pandas()

        _pandas_api.assert_frame_equal(result, df)
Example no. 13
def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)
Example no. 14
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)
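The _metadata file written above carries only the schema; it can be read back with pq.read_schema to confirm what the dataset-level metadata records. A brief sketch, reusing dirpath from the test above (the exact schema contents depend on _test_dataframe, so only the column names are checked):

schema = pq.read_schema(dirpath / '_metadata')
assert 'uint8' in schema.names
assert 'strings' in schema.names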
Example no. 15
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
Example no. 16
    def test_read_common_metadata_files(self):
        tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid())
        self.hdfs.mkdir(tmpdir)
        _test_read_common_metadata_files(self.hdfs, tmpdir)
Example no. 17
from tqdm import tqdm
import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.util import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from ..executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = pyspark.sql.SparkSession.builder \
    .appName('coffea-analysis-%s' % guid()) \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.enabled', 'true') \
    .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if 'spark_progress' in kwargs.keys():
        spark_progress = kwargs['spark_progress']

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config('spark.driver.extraJavaOptions',
                                   '-Dlog4jspark.root.logger=ERROR,console')
    if not spark_progress:
        cfg_actual = cfg_actual.config('spark.ui.showConsoleProgress', 'false')
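Only the beginning of _spark_initialize appears here; the remainder builds and returns a SparkSession from cfg_actual. A minimal usage sketch under that assumption, mirroring the keyword arguments used in the test examples below:

spark = _spark_initialize(config=_default_config,
                          log_level='ERROR',
                          spark_progress=False)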
Example no. 18
def test_memory_map_deref_remove(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    os.remove(path)  # Shouldn't fail
Example no. 19
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (pyspark.sql.SparkSession.builder.appName(
        "spark-executor-test-%s" % guid()).master("local[*]").config(
            "spark.sql.execution.arrow.enabled",
            "true").config("spark.driver.host", "127.0.0.1").config(
                "spark.driver.bindAddress", "127.0.0.1").config(
                    "spark.executor.x509proxyname", "x509_u12409").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        200000))

    spark = _spark_initialize(config=spark_config,
                              log_level="ERROR",
                              spark_progress=False)

    filelist = {
        "ZJets": {
            "files":
            ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename":
            "Events",
        },
        "Data": {
            "files": [
                "file:" +
                osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")
            ],
            "treename":
            "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = [
        "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
    ]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={
            "file_type": "root",
            "schema": schemas.NanoAODSchema
        },
    )

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Example no. 20
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb'):
        pass
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Example no. 21
from tqdm import tqdm
import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.util import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from coffea.processor.executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = (pyspark.sql.SparkSession.builder.appName(
    "coffea-analysis-%s" % guid()).master("local[*]").config(
        "spark.sql.execution.arrow.enabled",
        "true").config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000))


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if "spark_progress" in kwargs.keys():
        spark_progress = kwargs["spark_progress"]

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config("spark.driver.extraJavaOptions",
                                   "-Dlog4jspark.root.logger=ERROR,console")
    if not spark_progress:
        cfg_actual = cfg_actual.config("spark.ui.showConsoleProgress", "false")
Example no. 22
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs, _spark_stop)
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.executor.x509proxyname', 'x509_u12409') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config,
                              log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {
            'files':
            ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename':
            'Events'
        },
        'Data': {
            'files': [
                'file:' +
                osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')
            ],
            'treename':
            'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = [
        'nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge'
    ]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    assert (sum(spark_executor.counts.values()) == 80)
    assert (hists['cutflow']['ZJets_pt'] == 18)
    assert (hists['cutflow']['ZJets_mass'] == 6)
    assert (hists['cutflow']['Data_pt'] == 84)
    assert (hists['cutflow']['Data_mass'] == 66)

    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    assert (sum(spark_executor.counts.values()) == 80)
    assert (hists['cutflow']['ZJets_pt'] == 18)
    assert (hists['cutflow']['ZJets_mass'] == 6)
    assert (hists['cutflow']['Data_pt'] == 84)
    assert (hists['cutflow']['Data_mass'] == 66)

    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1,
                          executor_args={
                              'file_type': 'root',
                              'schema': schemas.NanoAODSchema
                          })

    _spark_stop(spark)

    assert (sum(spark_executor.counts.values()) == 80)
    assert (hists['cutflow']['ZJets_pt'] == 18)
    assert (hists['cutflow']['ZJets_mass'] == 6)
    assert (hists['cutflow']['Data_pt'] == 84)
    assert (hists['cutflow']['Data_mass'] == 66)
Example no. 23
    def test_write_to_dataset_no_partitions(self):
        tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
        self.hdfs.mkdir(tmpdir)
        _test_write_to_dataset_no_partitions(tmpdir, filesystem=self.hdfs)
# are manually changed from 64bit to 32bit (counts2nestedindex_form, counts2offsets_form)
# itemsize changed from 8 to 4, etc.
# Also, the dataset name should be changed from pyarrow.StringScalar to a Python string.

# In addition, a custom Spark branch is needed, otherwise it is too slow:
# https://github.com/lgray/spark/tree/v2.4.4_arrowhacks

# So, a lot of trouble for questionable reward.
# This code will not work out of the box, and even if all hacks are implemented, there
# are unexplained memory problems when processing large datasets.

if __name__ == "__main__":
    tick = time.time()

    spark_config = (pyspark.sql.SparkSession.builder.appName(
        "spark-executor-test-%s" % guid()).master("local[1]").config(
            "spark.sql.execution.arrow.enabled",
            "true").config("spark.executor.memory", "7g").config(
                "spark.executor.cores",
                "1").config("spark.driver.memory", "16g").config(
                    "spark.driver.maxResultSize", "4g").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        100000).config("spark.cores.max", "1"))

    spark = _spark_initialize(
        config=spark_config,
        log_level="ERROR",
        spark_progress=False,
        laurelin_version="1.0.0",
    )
    print("Spark initialized")