예제 #1
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_orcfile_readwrite():
    """Round-trip a small table through an in-memory ORC file.

    The table is written twice: once as ``write_table(table, where)`` and
    once with the two arguments swapped, which is expected to emit a
    ``FutureWarning``. After each write the data is read back and compared
    with the input, and the file metadata is checked against the default
    write options.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def check_roundtrip_with_defaults(sink):
        # Re-open what was written, then compare the data and the
        # metadata produced by the default write options.
        orc_file = orc.ORCFile(pa.BufferReader(sink.getvalue()))
        assert table.equals(orc_file.read())
        assert orc_file.compression == 'UNCOMPRESSED'
        assert orc_file.file_version == '0.12'
        assert orc_file.row_index_stride == 10000
        assert orc_file.compression_size == 65536

    sink = pa.BufferOutputStream()
    orc.write_table(table, sink)
    check_roundtrip_with_defaults(sink)

    # deprecated argument order: sink first, table second
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(legacy_sink, table)
    check_roundtrip_with_defaults(legacy_sink)
예제 #2
0
def save_orc_file(dataframe, filepath):
    """Write ``dataframe`` to ``filepath`` as an ORC file.

    The input frame is left untouched: a copy is made, every column whose
    dtype name is ``"category"`` is cast to ``"string[pyarrow]"`` on that
    copy, and the copy is converted to an Arrow table without its index
    before being written.
    """
    from pyarrow import Table, orc

    frame = dataframe.copy()
    categorical = [
        name for name in frame if frame[name].dtype.name == "category"
    ]
    for name in categorical:
        frame[name] = frame[name].astype("string[pyarrow]")

    arrow_table = Table.from_pandas(frame, preserve_index=False)
    orc.write_table(arrow_table, filepath)
예제 #3
0
def test_orc_writer_with_null_arrays(tempdir):
    """Writing a table with an all-``None`` column must be rejected.

    The ``"utf8"`` column is built only from ``None`` values, and the write
    is expected to raise ``pa.ArrowNotImplementedError``.
    """
    import pyarrow as pa
    from pyarrow import orc

    target = str(tempdir / 'test.orc')
    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, None, None, None]),
    })
    with pytest.raises(pa.ArrowNotImplementedError):
        orc.write_table(table, target)
예제 #4
0
파일: test_orc.py 프로젝트: pdet/arrow-1
def test_orcfile_readwrite():
    """Round-trip a small table through an in-memory ORC file."""
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    sink = pa.BufferOutputStream()
    # NOTE(review): the sink is passed first and the table second here;
    # confirm this matches the ``write_table`` signature of the pyarrow
    # version this test targets.
    orc.write_table(sink, table)

    orc_file = orc.ORCFile(pa.BufferReader(sink.getvalue()))
    assert table.equals(orc_file.read())
예제 #5
0
파일: test_orc.py 프로젝트: tallamjr/arrow
def test_bytesio_readwrite():
    """Round-trip a small table through ORC using a plain ``BytesIO``."""
    from io import BytesIO

    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    stream = BytesIO()
    orc.write_table(table, stream)
    # Rewind to the start of the stream before handing it to the reader.
    stream.seek(0)

    roundtripped = orc.ORCFile(stream).read()
    assert table.equals(roundtripped)
예제 #6
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_filesystem_uri(tmpdir):
    """Read an ORC file back through the ``filesystem`` argument.

    The file is written under ``tmpdir/data_dir`` and then read twice: once
    with an explicit ``fs.LocalFileSystem()`` object, and once with the
    value returned by ``util._filesystem_uri(tmpdir)`` together with a
    relative path.
    """
    from pyarrow import orc

    expected = pa.table({"a": [1, 2, 3]})

    data_dir = tmpdir / "data_dir"
    data_dir.mkdir()
    orc_path = data_dir / "data.orc"
    orc.write_table(expected, str(orc_path))

    # filesystem object; the path object itself is passed, not a string
    via_object = orc.read_table(orc_path, filesystem=fs.LocalFileSystem())
    assert via_object.equals(expected)

    # filesystem URI combined with a relative path
    root_uri = util._filesystem_uri(tmpdir)
    via_uri = orc.read_table("data_dir/data.orc", filesystem=root_uri)
    assert via_uri.equals(expected)
예제 #7
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_readwrite(tmpdir):
    """Write a table to disk and read it back whole and by column subset."""
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })
    orc_path = tmpdir.join("test.orc")
    orc.write_table(table, orc_path)

    # full read
    everything = orc.read_table(orc_path)
    assert table.equals(everything)

    # an empty column selection keeps the rows but yields no columns
    no_columns = orc.read_table(orc_path, [])
    assert no_columns.num_rows == 4
    assert no_columns.num_columns == 0

    # a single named column
    only_int64 = orc.read_table(orc_path, columns=["int64"])
    assert only_int64.num_rows == 4
    assert only_int64.num_columns == 1
예제 #8
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_orcfile_readwrite_with_writeoptions():
    """Round-trip a table through ORC with explicit write options.

    Two writes are made with different option sets; the second one swaps the
    first two positional arguments and is expected to emit a
    ``FutureWarning``. After each write the data is read back and the file
    metadata is compared with the options that were passed.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def reopen_and_verify(sink, compression, file_version, stride,
                          block_size):
        # Re-open what was written and compare data and metadata.
        orc_file = orc.ORCFile(pa.BufferReader(sink.getvalue()))
        assert table.equals(orc_file.read())
        assert orc_file.compression == compression
        assert orc_file.file_version == file_version
        assert orc_file.row_index_stride == stride
        assert orc_file.compression_size == block_size

    custom_sink = pa.BufferOutputStream()
    orc.write_table(
        table,
        custom_sink,
        compression='snappy',
        file_version='0.11',
        row_index_stride=5000,
        compression_block_size=32768,
    )
    # the metadata must reflect the modified write options
    reopen_and_verify(custom_sink, 'SNAPPY', '0.11', 5000, 32768)

    # deprecated argument order: sink first, table second
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(
            legacy_sink,
            table,
            compression='uncompressed',
            file_version='0.11',
            row_index_stride=20000,
            compression_block_size=16384,
        )
    # again the metadata must reflect the options passed to this write
    reopen_and_verify(legacy_sink, 'UNCOMPRESSED', '0.11', 20000, 16384)
예제 #9
0
def test_orcfile_readwrite():
    """Round-trip a small table through an in-memory ORC file.

    The write is done once as ``write_table(table, where)`` and once with
    the arguments swapped, which is expected to emit a ``FutureWarning``.
    Both results must read back equal to the input table.
    """
    from pyarrow import orc

    table = pa.table({
        "int64": pa.array([1, None, 3, None]),
        "utf8": pa.array([None, "Arrow", None, "ORC"]),
    })

    def read_back(sink):
        # Wrap the written bytes in a reader and load the whole file.
        return orc.ORCFile(pa.BufferReader(sink.getvalue())).read()

    sink = pa.BufferOutputStream()
    orc.write_table(table, sink)
    assert table.equals(read_back(sink))

    # deprecated argument order: sink first, table second
    legacy_sink = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(legacy_sink, table)
    assert table.equals(read_back(legacy_sink))
예제 #10
0
 def write_partition(cls, df, path, fs, filename, **kwargs):
     """Write ``df`` as an ORC file named ``filename`` under ``path``.

     The frame is converted to an Arrow table, the target location is
     formed by joining ``path`` and ``filename`` with ``fs.sep``, and the
     table is written to a handle opened from ``fs`` in binary write mode.
     ``**kwargs`` is accepted but not used in this body.
     """
     arrow_table = pa.Table.from_pandas(df)
     target = fs.sep.join([path, filename])
     with fs.open(target, "wb") as sink:
         orc.write_table(arrow_table, sink)
예제 #11
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_column_selection(tempdir):
    """Exercise ``ORCFile.read(columns=...)`` on a table with nested types.

    Covers reading everything, selecting by top-level name, selecting
    nested fields with dotted paths, selecting by field index, and the
    errors raised for an unknown name or an out-of-range index.
    """
    from pyarrow import orc

    # Schema mixing plain, list, struct and list-of-struct columns.
    struct_type = pa.struct([
        pa.field('middle', pa.struct([pa.field('inner', pa.int64())])),
        pa.field('inner2', pa.int64()),
    ])
    list_struct_type = pa.list_(
        pa.field(
            'item',
            pa.struct([
                pa.field('inner1', pa.int64()),
                pa.field('inner2', pa.int64()),
            ])))
    schema = pa.schema([
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct', struct_type),
        pa.field('list-struct', list_struct_type),
        pa.field('basic2', pa.int64()),
    ])

    # One row of data, one entry per column, in schema order.
    column_values = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(column_values, schema=schema)

    orc_path = str(tempdir / 'test.orc')
    orc.write_table(table, orc_path)
    orc_file = orc.ORCFile(orc_path)

    # no selection: every column comes back
    assert orc_file.read().equals(table)

    # selection by top-level column name
    basics = orc_file.read(columns=["basic", "basic2"])
    assert basics.equals(table.select(["basic", "basic2"]))

    mixed = orc_file.read(columns=["list", "struct", "basic2"])
    assert mixed.equals(table.select(["list", "struct", "basic2"]))

    # dotted paths pick individual nested fields
    deep_inner = orc_file.read(columns=["struct.middle.inner"])
    assert deep_inner.equals(
        pa.table({"struct": [{"middle": {"inner": 3}}]}))

    struct_inner2 = orc_file.read(columns=["struct.inner2"])
    assert struct_inner2.equals(pa.table({"struct": [{"inner2": 4}]}))

    # naming every child of "struct" yields the complete struct column
    whole_struct = orc_file.read(
        columns=["list", "struct.middle.inner", "struct.inner2"])
    assert whole_struct.equals(table.select(["list", "struct"]))

    list_inner1 = orc_file.read(columns=["list-struct.inner1"])
    assert list_inner1.equals(
        pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]}))

    # selection by (Arrow-based) field index
    first_and_last = orc_file.read(columns=[0, 4])
    assert first_and_last.equals(table.select(["basic", "basic2"]))

    nested_only = orc_file.read(columns=[1, 2, 3])
    assert nested_only.equals(
        table.select(["list", "struct", "list-struct"]))

    # unknown name: liborc returns ParseError, which gets translated into
    # IOError instead of ValueError
    with pytest.raises(IOError):
        orc_file.read(columns=["wrong"])

    # index past the last field
    with pytest.raises(ValueError):
        orc_file.read(columns=[5])
예제 #12
0
파일: test_orc.py 프로젝트: kszucs/arrow
def test_orcfile_readwrite_with_bad_writeoptions():
    """Check that ``orc.write_table`` rejects invalid write options.

    Each case passes exactly one bad keyword option and expects the listed
    exception type. The table and sink are passed as ``(table, where)`` so
    that the cases exercise option validation only.
    """
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    table = pa.table({"int64": pa.array([1, None, 3, None])})

    # (option name, invalid value, expected exception type)
    bad_options = [
        # batch_size must be a positive integer
        ('batch_size', 0, ValueError),
        ('batch_size', -100, ValueError),
        ('batch_size', 1024.23, ValueError),
        # file_version must be 0.11 or 0.12
        ('file_version', 0.13, ValueError),
        ('file_version', '1.1', ValueError),
        # stripe_size must be a positive integer
        ('stripe_size', 0, ValueError),
        ('stripe_size', -400, ValueError),
        ('stripe_size', 4096.73, ValueError),
        # compression must be among the given options
        ('compression', 0, TypeError),
        ('compression', 'none', ValueError),
        ('compression', 'zlid', ValueError),
        # compression_block_size must be a positive integer
        ('compression_block_size', 0, ValueError),
        ('compression_block_size', -200, ValueError),
        ('compression_block_size', 1096.73, ValueError),
        # compression_strategy must be among the given options
        ('compression_strategy', 0, TypeError),
        ('compression_strategy', 'no', ValueError),
        ('compression_strategy', 'large', ValueError),
        # row_index_stride must be a positive integer
        ('row_index_stride', 0, ValueError),
        ('row_index_stride', -800, ValueError),
        ('row_index_stride', 3096.29, ValueError),
        # padding_tolerance must be possible to cast to float
        ('padding_tolerance', 'cat', ValueError),
        # dictionary_key_size_threshold must be possible to cast to
        # float between 0.0 and 1.0
        ('dictionary_key_size_threshold', 'arrow', ValueError),
        ('dictionary_key_size_threshold', 1.2, ValueError),
        ('dictionary_key_size_threshold', -3.2, ValueError),
        # bloom_filter_columns must be convertible to a list containing
        # nonnegative integers
        ('bloom_filter_columns', "string", ValueError),
        ('bloom_filter_columns', [0, 1.4], ValueError),
        ('bloom_filter_columns', {0, 2, -1}, ValueError),
        # bloom_filter_fpp must be convertible to a float between 0.0
        # and 1.0
        ('bloom_filter_fpp', 'arrow', ValueError),
        ('bloom_filter_fpp', 1.1, ValueError),
        ('bloom_filter_fpp', -0.1, ValueError),
    ]

    for option, bad_value, expected_error in bad_options:
        with pytest.raises(expected_error):
            orc.write_table(
                table,
                buffer_output_stream,
                **{option: bad_value},
            )
            # Only reached when the call above did not raise. Failing here
            # names the offending case, which the generic "DID NOT RAISE"
            # message of pytest.raises would not.
            pytest.fail(
                f"write_table accepted {option}={bad_value!r}; "
                f"expected {expected_error.__name__}"
            )
import sys
import os
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.orc as orc
# Convert the Parquet file named on the command line to an ORC file.
#
# To inspect the input's schema instead of converting it:
#     parquet_file = pq.ParquetFile(sys.argv[1])
#     print(parquet_file.schema)

# Exit with a usage message rather than an IndexError traceback when the
# input path is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python <script> <input.parquet>")

parquet_name = sys.argv[1]

# Same location and base name as the input, with the extension replaced
# by ".orc".
orc_name = os.path.splitext(parquet_name)[0] + ".orc"

table = pq.read_table(parquet_name)

print("Writing ", orc_name)
orc.write_table(table, orc_name)