Пример #1
0
    def run(self):
        """Write several record batches to ``self.sink`` and read them back.

        Builds ``num_batches`` record batches from a small two-column
        DataFrame, writes each one with ``ipc.ArrowFileWriter``, then opens
        the bytes returned by ``self._get_source()`` with
        ``ipc.ArrowFileReader`` and asserts that the number of batches and
        every individual batch match what was written.
        """
        nrows = 5
        df = pd.DataFrame({
            'one': np.random.randn(nrows),
            'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']
        })

        # A batch built from the template frame supplies the writer's schema.
        template_batch = A.RecordBatch.from_pandas(df)
        writer = ipc.ArrowFileWriter(self.sink, template_batch.schema)

        num_batches = 5
        batches = []
        for _ in range(num_batches):
            # Re-randomize column 'one' so this batch has its own values.
            unique_df = df.copy()
            unique_df['one'] = np.random.randn(nrows)

            batch = A.RecordBatch.from_pandas(unique_df)
            writer.write_record_batch(batch)
            batches.append(batch)

        writer.close()

        file_contents = self._get_source()
        reader = ipc.ArrowFileReader(aio.BufferReader(file_contents))

        assert reader.num_record_batches == num_batches

        # Compare at the record-batch level, in the order they were written.
        for i, expected in enumerate(batches):
            assert expected.equals(reader.get_record_batch(i))
Пример #2
0
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a test DataFrame through Parquet via pyarrow stream objects.

    The frame from ``_test_dataframe(10000)`` is converted to a table,
    written with ``version="2.0"`` to a ``paio.InMemoryOutputStream``, read
    back through a ``paio.BufferReader`` and compared with the original.
    """
    expected = _test_dataframe(10000)
    table = pa.Table.from_pandas(expected)

    # Write side: collect the Parquet bytes in an output stream.
    out_stream = paio.InMemoryOutputStream()
    pq.write_table(table, out_stream, version="2.0")

    # Read side: wrap the resulting buffer and decode it again.
    source = paio.BufferReader(out_stream.get_result())
    actual = pq.read_table(source).to_pandas()

    pdt.assert_frame_equal(expected, actual)
Пример #3
0
def test_ipc_zero_copy_numpy():
    """Round-trip a one-row float DataFrame through ``write_file``/``read_file``.

    The frame is converted to a record batch, written to an
    ``arrow_io.InMemoryOutputStream``, read back through an
    ``arrow_io.BufferReader``, and the first batch read is converted to
    pandas and compared with the original frame.
    """
    expected = pd.DataFrame({'foo': [1.5]})

    record_batch = A.RecordBatch.from_pandas(expected)
    out_stream = arrow_io.InMemoryOutputStream()
    write_file(record_batch, out_stream)

    source = arrow_io.BufferReader(out_stream.get_result())
    first_batch = read_file(source)[0]

    # Wrap the converted result in a DataFrame before comparing.
    actual = pd.DataFrame(first_batch.to_pandas())
    assert_frame_equal(expected, actual)
Пример #4
0
def test_bytes_reader():
    """Check tell/size/read/seek on an ``io.BufferReader`` built from bytes."""
    # Like a BytesIO, but zero-copy underneath for C++ consumers
    payload = b'some sample data'
    reader = io.BufferReader(payload)

    # A fresh reader starts at offset 0 and reports the full length.
    assert reader.tell() == 0
    assert reader.size() == len(payload)

    # Reading advances the position by the number of bytes returned.
    assert reader.read(4) == b'some'
    assert reader.tell() == 4

    # Seeking moves the position to the requested absolute offset.
    reader.seek(0)
    assert reader.tell() == 0
    reader.seek(5)
    assert reader.tell() == 5

    # Asking for more than remains returns only what is left.
    assert reader.read(50) == b'sample data'

    reader.close()
Пример #5
0
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a DataFrame of numeric and bool columns through Parquet.

    One column is built per integer/float width plus a random bool column.
    The frame is converted to an Arrow table, written with
    ``version="2.0"`` to a ``paio.InMemoryOutputStream``, read back through
    a ``paio.BufferReader`` and compared with the original frame.
    """
    size = 10000
    np.random.seed(0)  # fixed seed keeps the random 'bool' column reproducible
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # dtype must match the column name so the int8 path is exercised
        # rather than duplicating the int16 column.
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Пример #6
0
 def get_buffer():
     """Return the result of ``read_buffer(6)`` after seeking to offset 5.

     The reader wraps a bytes payload made of ``b'some sample data'``
     repeated 1000 times.
     """
     payload = b'some sample data' * 1000
     source = io.BufferReader(payload)
     source.seek(5)
     return source.read_buffer(6)
Пример #7
0
def test_bytes_reader_non_bytes():
    """Expect ``ValueError`` when ``io.BufferReader`` is given ``u(...)`` text."""
    with pytest.raises(ValueError):
        text_input = u('some sample data')
        io.BufferReader(text_input)