Example #1
def test_file_pathlib(file_fixture, tmpdir):
    import pathlib

    _, batches = file_fixture.write_batches()
    source = file_fixture.get_source()

    path = tmpdir.join('file.arrow').strpath
    with open(path, 'wb') as f:
        f.write(source)

    t1 = pa.open_file(pathlib.Path(path)).read_all()
    t2 = pa.open_file(pa.OSFile(path)).read_all()

    assert t1.equals(t2)
Example #2
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    _, batches = file_fixture.write_batches()
    source = file_fixture.get_source()

    reader1 = pa.open_file(source)
    reader2 = pa.open_file(pa.BufferReader(source))
    reader3 = pa.RecordBatchFileReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)
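Per ARROW-2859, the reader entry points accept any object implementing the buffer protocol, so a plain bytes object works as well. A minimal standalone sketch of the same idea (the table contents are illustrative; pa.ipc.open_file is the current spelling of pa.open_file):

import pyarrow as pa

table = pa.table([pa.array([1, 2, 3])], names=['a'])
sink = pa.BufferOutputStream()
with pa.ipc.new_file(sink, table.schema) as writer:
    writer.write_table(table)

raw = sink.getvalue().to_pybytes()  # plain Python bytes
assert pa.ipc.open_file(raw).read_all().equals(table)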
Example #3
def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write(table)

    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning, match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())
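Both warnings point at the pyarrow.ipc namespace. For the stream format, the non-deprecated round trip looks roughly like this (a minimal sketch; the test above already writes with pa.ipc.new_stream):

import pyarrow as pa

table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write(table)

# pa.ipc.open_stream replaces the deprecated pa.open_stream
result = pa.ipc.open_stream(sink.getvalue()).read_all()
assert result.equals(table)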
Example #4
    def test_read_all(self):
        _, batches = self.write_batches()
        file_contents = pa.BufferReader(self._get_source())

        reader = pa.open_file(file_contents)

        result = reader.read_all()
        expected = pa.Table.from_batches(batches)
        assert result.equals(expected)
Example #5
    def test_read_pandas(self):
        frames, _ = self.write_batches()

        file_contents = pa.BufferReader(self._get_source())
        reader = pa.open_file(file_contents)
        result = reader.read_pandas()

        expected = pd.concat(frames)
        assert_frame_equal(result, expected)
Example #6
def test_file_read_all(sink_factory):
    fixture = FileFormatFixture(sink_factory)

    _, batches = fixture.write_batches()
    file_contents = pa.BufferReader(fixture.get_source())

    reader = pa.open_file(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)
Example #7
    def test_simple_roundtrip(self):
        batches = self.write_batches()
        file_contents = self._get_source()

        reader = pa.open_file(file_contents)

        assert reader.num_record_batches == len(batches)

        for i, written_batch in enumerate(batches):
            read_batch = reader.get_batch(i)
            assert written_batch.equals(read_batch)
Example #8
    def _check_roundtrip(self, as_table=False):
        _, batches = self.write_batches(as_table=as_table)
        file_contents = pa.BufferReader(self._get_source())

        reader = pa.open_file(file_contents)

        assert reader.num_record_batches == len(batches)

        for i, written_batch in enumerate(batches):
            read_batch = reader.get_batch(i)
            assert written_batch.equals(read_batch)
            assert reader.schema.equals(batches[0].schema)
Example #9
    def func():
        # `table` is captured from the enclosing scope
        df = table.to_pandas()

        batch = pa.RecordBatch.from_pandas(df)

        sink = io.BytesIO()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        buf_reader = pa.BufferReader(sink.getvalue())
        reader = pa.open_file(buf_reader)
        reader.read_all()
Example #10
def leak2():
    data = [pa.array(np.concatenate([np.random.randn(100000)] * 10))]
    table = pa.Table.from_arrays(data, ['foo'])
    while True:
        print('calling to_pandas')
        print('memory_usage: {0}'.format(memory_profiler.memory_usage()))
        df = table.to_pandas()

        batch = pa.RecordBatch.from_pandas(df)

        sink = io.BytesIO()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        buf_reader = pa.BufferReader(sink.getvalue())
        reader = pa.open_file(buf_reader)
        reader.read_all()

        gc.collect()
Example #11
def o(fp, fptype=None):
    """
    Opens the file given by the path into a python object.

    :param str fp: The path of the file
    """

    fp = xv(fp)

    import re

    if re.match(r".*\.conllu", fp) is not None:
        import conllu

        sentences = conllu.parse(cat(fp))
        return sentences

    if re.match(r".*\.h5", fp) is not None:
        # /usr/local/lib/python3.6/dist-packages/h5py

        sps("ttyify show_h5 " + q(fp))

        # from show_h5 import print_h5
        # print_h5(fp, section=None, show_attrs=False, show_data=False)

        # import required libraries
        import h5py as h5
        import numpy as np
        import matplotlib.pyplot as plt

        # Read H5 file
        ret = h5.File(fp, "r")

        return ret

    if re.match(r".*\.narrow", fp) is not None:
        import pyarrow as pa

        try:
            ret = pa.open_file(fp)
            return ret
        except Exception:
            pass

    if re.match(r".*\.npy", fp) is not None:
        import numpy as np

        try:
            ret = np.load(fp)
            return ret
        except Exception:
            pass

        try:
            ret = np.load(fp, encoding="latin1")
            return ret
        except Exception:
            pass

        try:
            ret = np.load(fp, encoding="bytes")
            return ret
        except Exception:
            pass

    if re.match(r".*\.xml", fp) is not None:
        import pandas as pd

        ret = read_csv_smart(fp)
        #  sys.stdout.write(str(type(ret)))
        return ret

    if re.match(r".*\.csv", fp) is not None:
        if fptype == "np":
            import numpy as np

            ret = np.genfromtxt(fp, delimiter=",")
        else:
            ret = read_csv_smart(fp)
        return ret

    if re.match(r".*\.xls", fp) is not None:
        import pandas as pd

        ret = pd.read_excel(fp)
        return ret

    if re.match(r".*\.pkl", fp) is not None:
        import pandas as pd

        ret = pd.read_pickle(fp)
        return ret

    if re.match(r".*\.gensim", fp) is not None:
        import gensim

        # Load pre-trained Word2Vec model.
        try:
            ret = gensim.models.LdaModel.load(fp)
        except:
            try:
                ret = gensim.models.Word2Vec.load(fp)
            except:
                pass
        return ret

    if re.match(r".*\.pickle$", fp) is not None or re.match(r".*\.p$",
                                                            fp) is not None:
        import pickle

        with open(fp, "rUb") as f:
            data = f.read()

        ret = pickle.loads(data)
        return ret

    # Just open the file as an unknown binary
    #  Use python-magic
    import magic

    mimestr = magic.from_file(fp)

    # https://stackoverflow.com/q/4980146
    gre = Re()

    if gre.match(r"gzip compressed data", mimestr):
        # do something with gre.last_match
        import gzip

        f = gzip.open(fp)
        return f.read()

    # elif gre.match(r'bar',mimestr):
    #     # do something with gre.last_match
    #     return mimestr

    return mimestr
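A hypothetical usage sketch for o() (file names are made up; helpers such as xv, cat, sps, q, Re and read_csv_smart are assumed to be defined elsewhere in the same module):

df = o("table.csv")           # pandas object via read_csv_smart
arr = o("table.csv", "np")    # numpy array via np.genfromtxt
reader = o("data.narrow")     # pyarrow reader via pa.open_file
blob = o("unknown.bin")       # falls back to python-magic MIME sniffing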
Example #12
    def test_empty_file(self):
        buf = io.BytesIO(b'')
        with pytest.raises(pa.ArrowInvalid):
            pa.open_file(buf)
Example #13
def read_file(source):
    reader = pa.open_file(source)
    return [reader.get_batch(i) for i in range(reader.num_record_batches)]
Example #14
def test_empty_file():
    buf = b''
    with pytest.raises(pa.ArrowInvalid):
        pa.open_file(pa.BufferReader(buf))
Example #15
#!/usr/bin/env python

import os
import pyarrow as pa

times = int(os.getenv('TIMES', 1))

for time in range(times):

    reader = pa.open_file('/tmp/python.arrow')
    schema = reader.schema

    for i in range(reader.num_record_batches):
        record_batch = reader.get_batch(i)
        if times > 1:
            # when benchmarking several iterations, skip the printing below
            break
        print('=' * 48)
        print(f'record-batch[{i}]:')
        for j, field in enumerate(schema):
            values = record_batch.column(j).to_numpy()
            print(f'  {field.name}: {values}')
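The script above assumes /tmp/python.arrow already exists. A companion writer sketch under that assumption (the column name and values are illustrative), using the same RecordBatchFileWriter API seen in the earlier examples:

#!/usr/bin/env python

import pyarrow as pa

# Build a small one-column table and write it in the Arrow file (random-access) format
table = pa.table([pa.array(range(1000))], names=['value'])

with pa.OSFile('/tmp/python.arrow', 'wb') as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)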