def test_inmemory_write_after_closed():
    """Writing to an InMemoryOutputStream after it is finalized must raise IOError."""
    stream = pa.InMemoryOutputStream()
    stream.write(b'ok')
    # get_result() finalizes the stream; further writes are invalid.
    stream.get_result()
    with pytest.raises(IOError):
        stream.write(b'not ok')
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a DataFrame through Parquet using in-memory Arrow files."""
    original = _test_dataframe(10000)
    table = pa.Table.from_pandas(original)

    # Write the Parquet file into an in-memory sink rather than to disk.
    sink = pa.InMemoryOutputStream()
    pq.write_table(table, sink, version="2.0")

    # Read it back through a zero-copy buffer reader and compare.
    source = pa.BufferReader(sink.get_result())
    restored = pq.read_table(source).to_pandas()
    tm.assert_frame_equal(original, restored)
def test_nativefile_write_memoryview():
    """NativeFile.write accepts buffer-protocol objects, not only bytes."""
    payload = b'ok'
    stream = pa.InMemoryOutputStream()
    # A numpy array and a bytearray both expose the buffer protocol.
    stream.write(np.frombuffer(payload, dtype='S1'))
    stream.write(bytearray(payload))
    assert stream.get_result().to_pybytes() == payload * 2
def test_ipc_zero_copy_numpy():
    """A record batch written to an IPC file reads back equal to its source frame."""
    expected = pd.DataFrame({'foo': [1.5]})

    # Serialize the batch into an in-memory file.
    sink = pa.InMemoryOutputStream()
    write_file(pa.RecordBatch.from_pandas(expected), sink)

    # Deserialize from the resulting buffer and rebuild a DataFrame.
    reader = pa.BufferReader(sink.get_result())
    batches = read_file(reader)
    result = pd.DataFrame(batches[0].to_pandas())
    assert_frame_equal(expected, result)
def test_memory_output_stream():
    """InMemoryOutputStream accumulates every chunk written to it."""
    chunk = b'dataabcdef'  # 10 bytes
    repeats = 1000

    stream = pa.InMemoryOutputStream()
    for _ in range(repeats):
        stream.write(chunk)

    result = stream.get_result()
    assert len(result) == len(chunk) * repeats
    assert result.to_pybytes() == chunk * repeats
def serialize_pandas(df):
    """Serialize a pandas DataFrame into a buffer protocol compatible object.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    buf : buffer
        An object compatible with the buffer protocol
    """
    batch = pa.RecordBatch.from_pandas(df)
    sink = pa.InMemoryOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    try:
        writer.write_batch(batch)
    finally:
        # Close the writer even if write_batch raises, so the stream
        # footer is written / resources are released deterministically.
        writer.close()
    return sink.get_result()
def _get_sink(self):
    """Return a fresh in-memory output stream to write into."""
    sink = pa.InMemoryOutputStream()
    return sink
# Size constants: 1 << 10 --> 1024, i.e. one kilobyte.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE * KILOBYTE
DATA_SIZE = 1024 * MEGABYTE
NCOLS = 16

# Build a one-megabyte record batch from a generated DataFrame.
df = generate_data(MEGABYTE, NCOLS)
batch = pa.RecordBatch.from_pandas(df)

# Stream repeated copies of the batch into RAM until DATA_SIZE is reached.
sink = pa.InMemoryOutputStream()
stream_writer = pa.StreamWriter(sink, batch.schema)
for _ in range(DATA_SIZE // MEGABYTE):
    stream_writer.write_batch(batch)

# Inspect the accumulated in-memory buffer.
source = sink.get_result()
source.size

# Read the record batches back and collect them into a single table.
reader = pa.StreamReader(source)
table = reader.read_all()
table
def test_inmemory_output_stream():
    """InMemoryOutputStream is a deprecated alias for BufferOutputStream."""
    # Constructing the legacy class must emit a FutureWarning...
    with pytest.warns(FutureWarning):
        legacy = pa.InMemoryOutputStream()
    # ...and the instance is really a BufferOutputStream.
    assert isinstance(legacy, pa.BufferOutputStream)