def run(self):
    """Write several record batches to an Arrow file and read them back.

    Batches are written through ``self.sink``; the written bytes are then
    fetched with ``self._get_source()`` and every batch read back is
    compared against the one that was written.
    """
    nrows = 5
    df = pd.DataFrame({
        'one': np.random.randn(nrows),
        'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']
    })

    # A first batch is built only to obtain the schema for the writer.
    batch = A.RecordBatch.from_pandas(df)
    writer = ipc.ArrowFileWriter(self.sink, batch.schema)

    num_batches = 5
    batches = []
    for _ in range(num_batches):
        # Give each batch distinct values in column 'one' so that the
        # batches can be told apart when read back.
        unique_df = df.copy()
        unique_df['one'] = np.random.randn(nrows)
        batch = A.RecordBatch.from_pandas(unique_df)
        writer.write_record_batch(batch)
        batches.append(batch)

    writer.close()

    file_contents = self._get_source()
    reader = ipc.ArrowFileReader(aio.BufferReader(file_contents))

    assert reader.num_record_batches == num_batches

    # Batches are compared directly; no conversion back to DataFrame.
    for i, expected in enumerate(batches):
        assert expected.equals(reader.get_record_batch(i))
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a DataFrame through Parquet using in-memory streams.

    The frame is converted to a table, written to an in-memory output
    stream, read back through a buffer reader, and compared with the
    original frame.
    """
    expected = _test_dataframe(10000)

    sink = paio.InMemoryOutputStream()
    pq.write_table(pa.Table.from_pandas(expected), sink, version="2.0")

    source = paio.BufferReader(sink.get_result())
    actual = pq.read_table(source).to_pandas()

    pdt.assert_frame_equal(expected, actual)
def test_ipc_zero_copy_numpy():
    """Round-trip a one-row float DataFrame through write_file/read_file.

    The first batch read back is converted to pandas and compared with
    the original frame.
    """
    expected = pd.DataFrame({'foo': [1.5]})
    record_batch = A.RecordBatch.from_pandas(expected)

    out_stream = arrow_io.InMemoryOutputStream()
    write_file(record_batch, out_stream)

    source = arrow_io.BufferReader(out_stream.get_result())
    first_batch = read_file(source)[0]

    actual = pd.DataFrame(first_batch.to_pandas())
    assert_frame_equal(expected, actual)
def test_bytes_reader():
    """Exercise tell/size/read/seek on a BufferReader built from bytes."""
    # Like a BytesIO, but zero-copy underneath for C++ consumers
    payload = b'some sample data'
    reader = io.BufferReader(payload)

    # A fresh reader starts at offset 0 and reports the payload length.
    assert reader.tell() == 0
    assert reader.size() == len(payload)

    # Reading advances the position by the number of bytes read.
    assert reader.read(4) == b'some'
    assert reader.tell() == 4

    reader.seek(0)
    assert reader.tell() == 0

    reader.seek(5)
    assert reader.tell() == 5

    # Asking for more bytes than remain returns only what is left.
    assert reader.read(50) == b'sample data'

    reader.close()
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a multi-dtype DataFrame through Parquet in memory.

    One column is built per integer/float width plus a boolean column;
    the frame is written to an in-memory output stream, read back
    through a buffer reader, and compared with the original.
    """
    size = 10000
    # Fixed seed so the random boolean column is reproducible.
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # Was np.int16, which left int8 uncovered and duplicated the
        # 'int16' column; each column now uses the dtype it is named for.
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    arrow_table = A.from_pandas_dataframe(df)

    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()

    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
def get_buffer():
    """Return ``read_buffer(6)`` taken at offset 5 of a repeated payload.

    The payload is ``b'some sample data'`` repeated 1000 times, wrapped
    in a BufferReader.
    """
    payload = b'some sample data' * 1000
    source = io.BufferReader(payload)
    source.seek(5)
    return source.read_buffer(6)
def test_bytes_reader_non_bytes():
    """Constructing a BufferReader from ``u(...)`` text raises ValueError."""
    with pytest.raises(ValueError):
        text = u('some sample data')
        io.BufferReader(text)