def test_native_file_TextIOWrapper(tmpdir):
    data = ('foooo\n'
            'barrr\n'
            'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iteration works
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data
def test_native_file_permissions(tmpdir):
    # ARROW-10124: permissions of created files should follow umask
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0,)), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
def test_memory_map_close_remove(tmpdir):
    # ARROW-6740: should be able to delete closed memory-mapped file (Windows)
    path = os.path.join(str(tmpdir), guid())
    mmap = pa.create_memory_map(path, 4096)
    mmap.close()
    assert mmap.closed
    os.remove(path)  # Shouldn't fail
def test_read_multiple_parquet_files(self):
    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    result = self.hdfs.read_parquet(tmpdir)

    _pandas_api.assert_frame_equal(
        result.to_pandas().sort_values(by='index').reset_index(drop=True),
        expected.to_pandas())
def test_read_multiple_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    path = _get_hdfs_uri(tmpdir)

    result = pq.read_table(path)

    _pandas_api.assert_frame_equal(
        result.to_pandas().sort_values(by='index').reset_index(drop=True),
        expected.to_pandas())
def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()
def test_read_multiple_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

    self.hdfs.mkdir(tmpdir)

    expected = self._write_multiple_hdfs_pq_files(tmpdir)
    path = _get_hdfs_uri(tmpdir)

    # TODO for URI it should not be needed to pass this argument
    result = pq.read_table(path, use_legacy_dataset=True)

    _pandas_api.assert_frame_equal(
        result.to_pandas().sort_values(by='index').reset_index(drop=True),
        expected.to_pandas())
def sample_disk_data(request, tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)

    request.addfinalizer(teardown)
    return path, data
def s3_example_s3fs(s3_connection, s3_server, s3_bucket):
    s3fs = pytest.importorskip('s3fs')

    host, port, access_key, secret_key = s3_connection
    fs = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={
            'endpoint_url': 'http://{}:{}'.format(host, port)
        })

    test_path = '{}/{}'.format(s3_bucket, guid())

    fs.mkdir(test_path)
    yield fs, test_path
    try:
        fs.rm(test_path, recursive=True)
    except FileNotFoundError:
        pass
def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    # Use integer division: create_memory_map expects an integer size
    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()
def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = test_parquet._test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, path, filesystem=self.hdfs)

    result = pq.read_table(path, filesystem=self.hdfs).to_pandas()

    _pandas_api.assert_frame_equal(result, df)
def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)

        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')
    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')
    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
def test_read_common_metadata_files(self):
    tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid())
    self.hdfs.mkdir(tmpdir)
    _test_read_common_metadata_files(self.hdfs, tmpdir)
from tqdm import tqdm

import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.util import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from ..executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = pyspark.sql.SparkSession.builder \
    .appName('coffea-analysis-%s' % guid()) \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.enabled', 'true') \
    .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if 'spark_progress' in kwargs.keys():
        spark_progress = kwargs['spark_progress']

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config('spark.driver.extraJavaOptions',
                                   '-Dlog4jspark.root.logger=ERROR,console')
    if not spark_progress:
        cfg_actual = cfg_actual.config('spark.ui.showConsoleProgress', 'false')
def test_memory_map_deref_remove(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    os.remove(path)  # Shouldn't fail
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (
        pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    spark = _spark_initialize(
        config=spark_config, log_level="ERROR", spark_progress=False
    )

    filelist = {
        "ZJets": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename": "Events",
        },
        "Data": {
            "files": [
                "file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")
            ],
            "treename": "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root", "schema": schemas.NanoAODSchema},
    )

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
from tqdm import tqdm

import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.util import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from coffea.processor.executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = (
    pyspark.sql.SparkSession.builder.appName("coffea-analysis-%s" % guid())
    .master("local[*]")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
)


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if "spark_progress" in kwargs.keys():
        spark_progress = kwargs["spark_progress"]

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config(
        "spark.driver.extraJavaOptions",
        "-Dlog4jspark.root.logger=ERROR,console",
    )
    if not spark_progress:
        cfg_actual = cfg_actual.config("spark.ui.showConsoleProgress", "false")
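# Usage sketch (an added illustration, not part of the original module): this
# mirrors how the tests in this section drive _spark_initialize. The appName
# and master values below are hypothetical; the keyword arguments are the ones
# the visible call sites pass, and the assumption that _spark_initialize
# returns the live SparkSession comes from those call sites, not from the
# truncated function body above.
custom_config = (
    pyspark.sql.SparkSession.builder.appName("my-analysis-%s" % guid())
    .master("local[2]")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
)
spark = _spark_initialize(config=custom_config, log_level="ERROR",
                          spark_progress=False)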
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs,
                                               _spark_stop)
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.executor.x509proxyname', 'x509_u12409') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config, log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {
            'files': ['file:' + osp.join(os.getcwd(),
                                         'tests/samples/nano_dy.root')],
            'treename': 'Events'
        },
        'Data': {
            'files': ['file:' + osp.join(os.getcwd(),
                                         'tests/samples/nano_dimuon.root')],
            'treename': 'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass',
               'Muon_charge']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist, processor_instance=proc,
                          executor=spark_executor,
                          spark=spark, thread_workers=1,
                          executor_args={'file_type': 'root'})

    assert sum(spark_executor.counts.values()) == 80
    assert hists['cutflow']['ZJets_pt'] == 18
    assert hists['cutflow']['ZJets_mass'] == 6
    assert hists['cutflow']['Data_pt'] == 84
    assert hists['cutflow']['Data_mass'] == 66

    hists = run_spark_job(filelist, processor_instance=proc,
                          executor=spark_executor,
                          spark=spark, thread_workers=1,
                          executor_args={'file_type': 'root'})

    assert sum(spark_executor.counts.values()) == 80
    assert hists['cutflow']['ZJets_pt'] == 18
    assert hists['cutflow']['ZJets_mass'] == 6
    assert hists['cutflow']['Data_pt'] == 84
    assert hists['cutflow']['Data_mass'] == 66

    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(filelist, processor_instance=proc,
                          executor=spark_executor,
                          spark=spark, thread_workers=1,
                          executor_args={'file_type': 'root',
                                         'schema': schemas.NanoAODSchema})

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists['cutflow']['ZJets_pt'] == 18
    assert hists['cutflow']['ZJets_mass'] == 6
    assert hists['cutflow']['Data_pt'] == 84
    assert hists['cutflow']['Data_mass'] == 66
def test_write_to_dataset_no_partitions(self):
    tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
    self.hdfs.mkdir(tmpdir)
    _test_write_to_dataset_no_partitions(tmpdir, filesystem=self.hdfs)
# are manually changed from 64bit to 32bit (counts2nestedindex_form,
# counts2offsets_form), itemsize changed from 8 to 4, etc.
# Also, the dataset name has to be converted from a pyarrow.StringScalar to a
# plain Python string. In addition, a custom Spark branch is needed, otherwise
# it is too slow: https://github.com/lgray/spark/tree/v2.4.4_arrowhacks
# So, a lot of trouble for questionable reward. This code will not work out of
# the box, and even if all of the hacks are implemented, there are unexplained
# memory problems when processing large datasets.

if __name__ == "__main__":
    tick = time.time()

    spark_config = (
        pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid())
        .master("local[1]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.executor.memory", "7g")
        .config("spark.executor.cores", "1")
        .config("spark.driver.memory", "16g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 100000)
        .config("spark.cores.max", "1")
    )

    spark = _spark_initialize(
        config=spark_config,
        log_level="ERROR",
        spark_progress=False,
        laurelin_version="1.0.0",
    )
    print("Spark initialized")
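# A hedged sketch of the StringScalar-to-string conversion mentioned in the
# comment block above. The sample value "ZJets" is illustrative; the point is
# that scalar.as_py() is pyarrow's standard way to unwrap a scalar into the
# corresponding Python object.
import pyarrow as pa

scalar = pa.scalar("ZJets")      # a pyarrow.StringScalar
dataset_name = scalar.as_py()    # unwrap to a plain Python str: 'ZJets'
assert isinstance(dataset_name, str)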