Example #1
File: test_io.py  Project: sunchao/arrow
def test_native_file_TextIOWrapper(tmpdir):
    data = (u'foooo\n'
            u'barrr\n'
            u'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iteration works
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data
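
Note: every example in this listing builds unique file names with a guid() helper, imported from pyarrow.compat in several of the source projects. As a reference point, here is a minimal sketch of such a helper, assuming it simply returns a random hex string:

import uuid

def guid():
    # Minimal stand-in for pyarrow's guid() helper: a random 32-character
    # hex string, convenient for collision-free temporary file names.
    return uuid.uuid4().hex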
Example #2
File: test_io.py  Project: sunchao/arrow
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example #3
def test_dataset_read_pandas(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 5
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = np.arange(i * size, (i + 1) * size)
        df.index.name = 'index'

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])

    tm.assert_frame_equal(result, expected)
Example #4
def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
Example #5
    def test_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        paths = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)
            paths.append(path)

        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example #6
    def test_read_multiple_parquet_files(self):

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        result = self.hdfs.read_parquet(tmpdir)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example #7
File: test_io.py  Project: kiril-me/arrow
def sample_disk_data(request):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)
    request.addfinalizer(teardown)
    return path, data
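
The _try_delete cleanup helper referenced by this fixture is not shown in the listing; a minimal sketch of what it presumably does (the name comes from the snippet, the body is an assumption):

import os

def _try_delete(path):
    # Best-effort cleanup: ignore the error if the file was never created
    # or has already been removed.
    try:
        os.remove(path)
    except OSError:
        pass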
Example #8
    def test_read_multiple_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        path = _get_hdfs_uri(tmpdir)
        result = pq.read_table(path)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Example #9
def s3_example():
    access_key = os.environ['PYARROW_TEST_S3_ACCESS_KEY']
    secret_key = os.environ['PYARROW_TEST_S3_SECRET_KEY']
    bucket_name = os.environ['PYARROW_TEST_S3_BUCKET']

    import s3fs
    fs = s3fs.S3FileSystem(key=access_key, secret=secret_key)

    test_dir = guid()

    bucket_uri = 's3://{0}/{1}'.format(bucket_name, test_dir)
    fs.mkdir(bucket_uri)
    yield fs, bucket_uri
    fs.rm(bucket_uri, recursive=True)
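
Assuming the generator above is registered as a pytest fixture (e.g. decorated with @pytest.fixture), a hypothetical test would receive the (fs, bucket_uri) pair by naming the fixture as an argument:

def test_s3_roundtrip(s3_example):
    # Hypothetical consumer of the fixture above; the object key is made up.
    fs, bucket_uri = s3_example
    path = bucket_uri + '/roundtrip.bin'
    with fs.open(path, 'wb') as f:
        f.write(b'hello')
    with fs.open(path, 'rb') as f:
        assert f.read() == b'hello'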
Example #10
def sample_disk_data(request, tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())

    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)

    request.addfinalizer(teardown)
    return path, data
Example #11
    def test_read_multiple_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-uri-' + guid())

        self.hdfs.mkdir(tmpdir)

        expected = self._write_multiple_hdfs_pq_files(tmpdir)
        path = _get_hdfs_uri(tmpdir)
        result = pq.read_table(path)

        _pandas_api.assert_frame_equal(result.to_pandas()
                                       .sort_values(by='index')
                                       .reset_index(drop=True),
                                       expected.to_pandas())
Example #12
def _write_partitioned(
    table, root_path, partition_cols, fs, preserve_index=True, **kwargs
):
    """ Write table to a partitioned dataset with pyarrow.

        Logic copied from pyarrow.parquet.
        (arrow/python/pyarrow/parquet.py::write_to_dataset)

        TODO: Remove this in favor of pyarrow's `write_to_dataset`
              once ARROW-8244 is addressed.
    """
    fs.mkdirs(root_path, exist_ok=True)

    df = table.to_pandas(ignore_metadata=False)
    partition_keys = [df[col] for col in partition_cols]
    data_df = df.drop(partition_cols, axis="columns")
    data_cols = df.columns.drop(partition_cols)
    if len(data_cols) == 0 and not preserve_index:
        raise ValueError("No data left to save outside partition columns")

    subschema = table.schema
    for col in table.schema.names:
        if col in partition_cols:
            subschema = subschema.remove(subschema.get_field_index(col))

    md_list = []
    for keys, subgroup in data_df.groupby(partition_keys):
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = fs.sep.join(
            [
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ]
        )
        subtable = pa.Table.from_pandas(
            subgroup, preserve_index=False, schema=subschema, safe=False
        )
        prefix = fs.sep.join([root_path, subdir])
        fs.mkdir(prefix, exists_ok=True)
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([prefix, outfile])
        with fs.open(full_path, "wb") as f:
            pq.write_table(subtable, f, metadata_collector=md_list, **kwargs)
        md_list[-1].set_file_path(fs.sep.join([subdir, outfile]))

    return md_list
Example #13
def test_dataset_read_pandas_common_metadata(tmpdir):
    # ARROW-1103
    import pyarrow.parquet as pq

    nfiles = 5
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size))
        df.index.name = 'index'

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        df_ex_index = df.reset_index(drop=True)
        df_ex_index['index'] = df.index
        table = pa.Table.from_pandas(df_ex_index,
                                     preserve_index=False)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(df)
    pq.write_metadata(table_for_metadata.schema,
                      pjoin(dirpath, '_metadata'))

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])

    tm.assert_frame_equal(result, expected)
Example #14
File: test_io.py  Project: rok/arrow
def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()
Example #15
File: test_hdfs.py  Project: tlantz/arrow
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        pq.write_table(table, path, filesystem=self.hdfs)

        result = pq.read_table(path, filesystem=self.hdfs).to_pandas()

        _pandas_api.assert_frame_equal(result, df)
Example #16
    def test_read_write_parquet_files_with_uri(self):
        import pyarrow.parquet as pq

        tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
        self.hdfs.mkdir(tmpdir)
        path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

        size = 5
        df = test_parquet._test_dataframe(size, seed=0)
        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)
        table = pa.Table.from_pandas(df, preserve_index=False)

        pq.write_table(table, path)

        result = pq.read_table(path).to_pandas()

        pdt.assert_frame_equal(result, df)
Example #17
def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()
Example #18
File: test_io.py  Project: sunchao/arrow
def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)
Example #19
def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)
Example #20
File: test_io.py  Project: kiril-me/arrow
def test_memory_map_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        f = io.MemoryMappedFile(path, mode='r+w')

        f.seek(10)
        f.write(b'peekaboo')
        assert f.tell() == 18

        f.seek(10)
        assert f.read(8) == b'peekaboo'

        f2 = io.MemoryMappedFile(path, mode='r+w')

        f2.seek(10)
        f2.write(b'booapeak')
        f2.seek(10)

        f.seek(10)
        assert f.read(8) == b'booapeak'

        # Does not truncate file
        f3 = io.MemoryMappedFile(path, mode='w')
        f3.write(b'foo')

        with io.MemoryMappedFile(path) as f4:
            assert f4.size() == SIZE

        with pytest.raises(IOError):
            f3.read(5)

        f.seek(0)
        assert f.read(3) == b'foo'
    finally:
        _try_delete(path)
Example #21
def test_memory_map_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        f = io.MemoryMappedFile(path, mode='r+w')

        f.seek(10)
        f.write(b'peekaboo')
        assert f.tell() == 18

        f.seek(10)
        assert f.read(8) == b'peekaboo'

        f2 = io.MemoryMappedFile(path, mode='r+w')

        f2.seek(10)
        f2.write(b'booapeak')
        f2.seek(10)

        f.seek(10)
        assert f.read(8) == b'booapeak'

        # Does not truncate file
        f3 = io.MemoryMappedFile(path, mode='w')
        f3.write(b'foo')

        with io.MemoryMappedFile(path) as f4:
            assert f4.size() == SIZE

        with pytest.raises(IOError):
            f3.read(5)

        f.seek(0)
        assert f.read(3) == b'foo'
    finally:
        _try_delete(path)
Example #22
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.compat import guid
    
    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs,
                                               _spark_stop)
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled','true') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config,log_level='ERROR',spark_progress=False)

    filelist = {'ZJets': {'files': ['file:'+osp.join(os.getcwd(),'tests/samples/nano_dy.root')], 'treename': 'Events' },
                'Data'  : {'files': ['file:'+osp.join(os.getcwd(),'tests/samples/nano_dimuon.root')], 'treename': 'Events'}
                }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon','Muon_pt','Muon_eta','Muon_phi','Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist, processor_instance=proc, executor=spark_executor, spark=spark, thread_workers=1,
                          executor_args={'file_type': 'root'})

    _spark_stop(spark)

    assert( sum(spark_executor.counts.values()) == 20 )
    assert( hists['cutflow']['ZJets_pt'] == 4 )
    assert( hists['cutflow']['ZJets_mass'] == 1 )
    assert( hists['cutflow']['Data_pt'] == 15 )
    assert( hists['cutflow']['Data_mass'] == 5 )
Example #23
def test_os_file_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        # Truncates file
        f2 = io.OSFile(path, mode='w')
        f2.write(b'foo')

        with io.OSFile(path) as f3:
            assert f3.size() == 3

        with pytest.raises(IOError):
            f2.read(5)
    finally:
        _try_delete(path)
Example #24
File: test_io.py  Project: kiril-me/arrow
def test_os_file_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        # Truncates file
        f2 = io.OSFile(path, mode='w')
        f2.write(b'foo')

        with io.OSFile(path) as f3:
            assert f3.size() == 3

        with pytest.raises(IOError):
            f2.read(5)
    finally:
        _try_delete(path)
Example #25
File: test_io.py  Project: sunchao/arrow
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
Example #26
    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, guid())

                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                with fs.open(file_path, 'wb') as f:
                    _write_table(part_table, f)
                assert fs.exists(file_path)

                _touch(pjoin(level_dir, '_SUCCESS'))
            else:
                _visit_level(level_dir, level + 1, this_part_keys)
                _touch(pjoin(level_dir, '_SUCCESS'))
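
For orientation, a hypothetical partition_spec and DEPTH of the kind the recursive _visit_level helper above closes over (the names come from the snippet, the values are invented):

# Two partition levels, so DEPTH == 2; _visit_level(base_dir, 0, []) would then
# create foo=<v>/bar=<v>/<guid> data files plus a _SUCCESS marker per directory.
partition_spec = [
    ('foo', [0, 1]),
    ('bar', ['a', 'b', 'c']),
]
DEPTH = len(partition_spec)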
Example #27
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0, )), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example #28
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
Example #29
    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, guid())

                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                with fs.open(file_path, 'wb') as f:
                    _write_table(part_table, f)
                assert fs.exists(file_path)

                _touch(pjoin(level_dir, '_SUCCESS'))
            else:
                _visit_level(level_dir, level + 1, this_part_keys)
                _touch(pjoin(level_dir, '_SUCCESS'))
Example #30
def test_ignore_private_directories(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        path = pjoin(dirpath, '{0}.parquet'.format(i))

        test_data.append(_write_table(df, path))
        paths.append(path)

    # private directory
    os.mkdir(pjoin(dirpath, '_impala_staging'))

    dataset = pq.ParquetDataset(dirpath)
    assert set(paths) == set(x.path for x in dataset.pieces)
Example #31
def test_ignore_private_directories(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        path = pjoin(dirpath, '{0}.parquet'.format(i))

        test_data.append(_write_table(df, path))
        paths.append(path)

    # private directory
    os.mkdir(pjoin(dirpath, '_impala_staging'))

    dataset = pq.ParquetDataset(dirpath)
    assert set(paths) == set(x.path for x in dataset.pieces)
Example #32
def upsert_to_dataset(table, root_path, partition_cols=None,
                      filesystem=None, preserve_index=True,
                      temp_folder=None, categories=None, **kwargs):
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)
    if temp_folder:
        if not os.path.exists(temp_folder):
            temp_folder = None

    if partition_cols is not None and len(partition_cols) > 0:
        # df is the data in the new table
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])

            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            existing_files = [f for f in os.listdir(prefix) if f.endswith('.parquet')]
            if len(existing_files) > 1:
                raise ValueError('Unsupported scenario, multiple files found in path %s' % prefix)
            if len(existing_files) == 1:
                outfile = existing_files[0]
                full_path = "/".join([prefix, outfile])
                old_table = read_table(full_path)
                category_cols = _to_category_cols(subgroup, categories)  # get categories before merging
                old_subgroup = old_table.to_pandas()
                # TODO: compare old schema with new
                subgroup = _upsert_dataframes(subgroup, old_subgroup)
                # subgroup = pd.concat([subgroup, old_subgroup[~old_subgroup.index.isin(subgroup.index.values)]])
                for c, v in category_cols.items():
                    subgroup.loc[:, c] = subgroup.loc[:, c].astype('category', categories=v)
            else:
                outfile = compat.guid() + ".parquet"
                full_path = "/".join([prefix, outfile])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index,
                                         schema=subschema)
            write_file = os.path.join(temp_folder, outfile) if temp_folder else full_path
            with fs.open(write_file, 'wb') as f:
                write_table(subtable, f, **kwargs)
            if temp_folder:
                shutil.move(write_file, full_path)
    else:
        existing_files = [f for f in os.listdir(root_path) if f.endswith('.parquet')]
        if len(existing_files) > 1:
            raise ValueError('Unsupported scenario, multiple files found in path %s' % root_path)
        if len(existing_files) == 1:
            # append use case
            outfile = existing_files[0]
            full_path = "/".join([root_path, outfile])
            old_table = read_table(full_path)
            subgroup = table.to_pandas()
            category_cols = _to_category_cols(subgroup, categories)
            old_subgroup = old_table.to_pandas()
            # TODO: compare old schema with new
            subgroup = _upsert_dataframes(subgroup, old_subgroup)
            # subgroup = pd.concat([old_subgroup[~old_subgroup.index.isin(subgroup.index)], subgroup])
            for c, v in category_cols.items():
                subgroup.loc[:, c] = subgroup.loc[:, c].astype('category', categories=v)
            schema = table.schema
            table = Table.from_pandas(
                subgroup,
                preserve_index=preserve_index,
                schema=schema
            )
        else:
            # write use case
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([root_path, outfile])

        write_file = os.path.join(temp_folder, outfile) if temp_folder else full_path
        with fs.open(write_file, 'wb') as f:
            write_table(table, f, **kwargs)
        if temp_folder:
            shutil.move(write_file, full_path)
Example #33
File: test_hdfs.py  Project: tlantz/arrow
    def test_write_to_dataset_no_partitions(self):
        tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
        self.hdfs.mkdir(tmpdir)
        test_parquet._test_write_to_dataset_no_partitions(
            tmpdir, filesystem=self.hdfs)
Example #34
def write_to_dataset(table, root_path, partition_cols=None, filesystem=None,
                     preserve_index=None, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    **kwargs : dict, kwargs for write_table function.
    """
    if preserve_index is not None:
        warnings.warn('preserve_index argument is deprecated as of 0.13.0 and '
                      'has no effect', DeprecationWarning)

    fs, root_path = _get_filesystem_and_path(filesystem, root_path)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas(ignore_metadata=True)
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')

        subschema = table.schema

        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for col in table.schema.names:
            if col in partition_cols:
                subschema = subschema.remove(subschema.get_field_index(col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = '/'.join(
                ['{colname}={value}'.format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = pa.Table.from_pandas(subgroup, preserve_index=False,
                                            schema=subschema, safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
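
A hypothetical call illustrating the interface documented above, using pyarrow.parquet's own write_to_dataset, which shares this signature (the DataFrame contents and output directory are invented):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'group1': ['a', 'a', 'b'],
                   'group2': [1, 2, 1],
                   'value': [0.1, 0.2, 0.3]})
table = pa.Table.from_pandas(df)
# Writes one <uuid>.parquet file per (group1, group2) combination
# under example_dataset/group1=.../group2=.../
pq.write_to_dataset(table, root_path='example_dataset',
                    partition_cols=['group1', 'group2'])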
Example #35
File: parquet.py  Project: sunchao/arrow
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (
        Table,
        compat
    )

    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = "/".join(
                ["{colname}={value}".format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
Example #36
File: test_hdfs.py  Project: tlantz/arrow
    def test_read_common_metadata_files(self):
        tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid())
        self.hdfs.mkdir(tmpdir)
        test_parquet._test_read_common_metadata_files(self.hdfs, tmpdir)
Example #37
    def test_write_to_dataset_no_partitions(self):
        tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
        self.hdfs.mkdir(tmpdir)
        test_parquet._test_write_to_dataset_no_partitions(
            tmpdir, filesystem=self.hdfs)
Example #38
from tqdm import tqdm
import pyspark.sql
import pyspark.sql.functions as fn
from pyarrow.compat import guid

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

from ..executor import _futures_handler

# this is a reasonable local spark configuration
_default_config = pyspark.sql.SparkSession.builder \
    .appName('coffea-analysis-%s' % guid()) \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.enabled', 'true') \
    .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)


def _spark_initialize(config=_default_config, **kwargs):
    spark_progress = False
    if 'spark_progress' in kwargs.keys():
        spark_progress = kwargs['spark_progress']

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config('spark.driver.extraJavaOptions',
                                   '-Dlog4jspark.root.logger=ERROR,console')
    if not spark_progress:
Example #39
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    with pytest.raises(NotImplementedError):
        pq.read_pandas(dirpath)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)
Example #40
def test_memory_map_deref_remove(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    os.remove(path)  # Shouldn't fail
Example #41
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    _touch(pjoin(dirpath, '_SUCCESS.crc'))

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.read_metadata(paths[0])

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(dirpath,
                                     columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.read_metadata(bad_apple_path)

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)
Example #42
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Example #43
def write_to_dataset(table,
                     root_path,
                     partition_cols=None,
                     filesystem=None,
                     preserve_index=True,
                     **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    fs, root_path = _get_filesystem_and_path(filesystem, root_path)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')

        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for col in table.schema.names:
            if (col.startswith('__index_level_') or col in partition_cols):
                subschema = subschema.remove(subschema.get_field_index(col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join([
                '{colname}={value}'.format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup,
                                            preserve_index=preserve_index,
                                            schema=subschema,
                                            safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
Example #44
File: test_io.py  Project: sunchao/arrow
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Example #45
def write_to_dataset(
    df,
    root_path,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    root_dir/
      group=value1
        <uuid>.parquet
      ...
      group=valueN
        <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = _ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:

        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        #  Loop through the partition groups
        for i, sub_df in enumerate(
            _get_partition_groups(
                df, partition_cols, preserve_index=preserve_index
            )
        ):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = fs.sep.join(
                [
                    "{colname}={value}".format(colname=name, value=val)
                    for name, val in zip(partition_cols, keys)
                ]
            )
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            outfile = guid() + ".parquet"
            full_path = fs.sep.join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            if return_metadata:
                metadata.append(
                    write_df.to_parquet(
                        full_path,
                        index=preserve_index,
                        metadata_file_path=fs.sep.join([subdir, outfile]),
                        **kwargs,
                    )
                )
            else:
                write_df.to_parquet(full_path, index=preserve_index, **kwargs)

    else:
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([root_path, outfile])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=outfile,
                    **kwargs,
                )
            )
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (
            merge_parquet_filemetadata(metadata)
            if len(metadata) > 1
            else metadata[0]
        )
Example #46
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(dirpath,
                                     columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)
Example #47
    def test_read_common_metadata_files(self):
        tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid())
        self.hdfs.mkdir(tmpdir)
        test_parquet._test_read_common_metadata_files(self.hdfs, tmpdir)
Example #48
def write_to_dataset(table,
                     root_path,
                     partition_cols=None,
                     filesystem=None,
                     preserve_index=True,
                     **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    from pyarrow import (Table, compat)

    if filesystem is None:
        fs = LocalFileSystem.get_instance()
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = Table.from_pandas(subgroup,
                                         preserve_index=preserve_index)
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = compat.guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = compat.guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
Example #49
def write_to_dataset(df,
                     root_path,
                     partition_cols=None,
                     fs=None,
                     preserve_index=False,
                     **kwargs):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    root_dir/
      group=value1
        <uuid>.parquet
      ...
      group=valueN
        <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs, root_path = pq._get_filesystem_and_path(fs, root_path)
    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:

        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        #  Loop through the partition groups
        for i, sub_df in enumerate(
                _get_partition_groups(df,
                                      partition_cols,
                                      preserve_index=preserve_index)):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = "/".join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            prefix = "/".join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + ".parquet"
            full_path = "/".join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            write_df.to_parquet(full_path, index=preserve_index, **kwargs)
    else:
        outfile = guid() + ".parquet"
        full_path = "/".join([root_path, outfile])
        df.to_parquet(full_path, index=preserve_index, **kwargs)
Example #50
def random_path():
    return 'feather_{}'.format(guid())
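
A hypothetical use of random_path() with Feather, mirroring the write/read-then-delete pattern of the other examples (pyarrow.feather is assumed to be available; cleanup is inlined rather than using _try_delete):

import os
import pandas as pd
import pyarrow.feather as feather

path = random_path()
try:
    feather.write_feather(pd.DataFrame({'a': [1, 2, 3]}), path)
    df = feather.read_feather(path)
    assert list(df['a']) == [1, 2, 3]
finally:
    if os.path.exists(path):
        os.remove(path)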
Example #51
    def pandas2Parquet(self,
                       pandasDF,
                       bucket: str,
                       folder: str,
                       file: str,
                       overwrite: bool = False,
                       engine: str = 'auto',
                       compression: str = 'snappy',
                       use_dictionary: bool = False,
                       coerce_timestamps: str = 'ms',
                       partition_cols: list = None,
                       row_group_size: int = None,
                       **kwargs):
        s3Path = "s3://%s/%s/%s" % (bucket, folder, file) if folder is not None \
                 else "s3://%s/%s" % (bucket, file)
        self.log.info("Writing the Pandas DF to S3 path %s" % (s3Path))

        if overwrite and self.isFolderPresent(s3Path):
            self.deleteObject(s3Path)

        if partition_cols is not None and len(partition_cols) > 0:
            part_keys = [pandasDF[col] for col in partition_cols]
            data_cols = pandasDF.columns.drop(partition_cols)
            if len(data_cols) == 0:
                raise ValueError(
                    'No data left to save outside partition columns')
            table = pa.Table.from_pandas(pandasDF)
            subschema = table.schema
            for col in table.schema.names:
                if (col.startswith('__index_level_') or col in partition_cols):
                    subschema = subschema.remove(
                        subschema.get_field_index(col))

            for keys, subgroup in pandasDF.groupby(part_keys):
                if not isinstance(keys, tuple):
                    keys = (keys, )

                subdir = '/'.join([
                    '{colname}={value}'.format(colname=name, value=val)
                    for name, val in zip(partition_cols, keys)
                ])

                subtable = pa.Table.from_pandas(df=subgroup,
                                                schema=subschema,
                                                preserve_index=False,
                                                safe=False,
                                                nthreads=5)
                prefix = '/'.join([s3Path, subdir])

                if (not overwrite) and self.isFolderPresent(prefix):
                    self.deleteObject(prefix)

                outfile = "pyarow-%s.%s.parquet" % (guid(), compression)
                full_path = '/'.join([prefix, outfile])
                self.log.debug("Creating the file: %s" % (full_path))
                self.mkdir(prefix)
                with self._s3fs.open(full_path, 'wb') as f:
                    pq.write_table(
                        table=subtable,
                        where=f,
                        compression=compression,
                        flavor='spark',  #Enable Spark compatibility
                        coerce_timestamps=
                        coerce_timestamps,  #Limit the timestamp to miliseconds
                        allow_truncated_timestamps=
                        True,  #Don't raise exception during truncation
                        use_dictionary=use_dictionary,
                        row_group_size=row_group_size,
                        version='2.0',
                        **kwargs)
        else:
            outfile = "pyarow-single-%s.%s.parquet" % (guid(), compression)
            full_path = '/'.join([s3Path, outfile])
            self.log.debug("Creating the file: %s" % (full_path))
            with self._s3fs.open(full_path, 'wb') as f:
                pq.write_table(
                    table=pa.Table.from_pandas(df=pandasDF,
                                               preserve_index=False,
                                               nthreads=5),
                    where=f,
                    compression=compression,
                    flavor='spark',  #Enable Spark compatibility
                    coerce_timestamps=
                    coerce_timestamps,  #Limit the timestamp to miliseconds
                    allow_truncated_timestamps=
                    True,  #Don't raise exception during truncation
                    use_dictionary=use_dictionary,
                    row_group_size=row_group_size,
                    version='2.0',
                    **kwargs)
Example #52
def random_path():
    return 'feather_{}'.format(guid())
Example #53
def test_read_multiple_files(tmpdir):
    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)

        test_data.append(table)
        paths.append(path)

    result = pq.read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.ParquetFile(paths[0]).metadata

    result2 = pq.read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[3], result[6]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read)
    assert result.equals(expected)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    pq.write_table(t, bad_apple_path)

    bad_meta = pq.ParquetFile(bad_apple_path).metadata

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        pq.read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        pq.read_multiple_files(mixed_paths)