def test_read_record_batch_on_stream_error_message():
    # ARROW-5374
    batch = pa.record_batch([pa.array([b"foo"], type=pa.utf8())],
                            names=['strs'])
    stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(stream, batch.schema) as writer:
        writer.write_batch(batch)
    buf = stream.getvalue()
    with pytest.raises(IOError,
                       match="type record batch but got schema"):
        pa.read_record_batch(buf, batch.schema)
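# A minimal sketch (not from the original source) of the working counterpart
# to the test above: a stream produced by RecordBatchStreamWriter begins with
# a schema message, so it must be opened with pa.ipc.open_stream() rather
# than passed directly to pa.read_record_batch().
import pyarrow as pa

def read_stream_roundtrip():
    batch = pa.record_batch([pa.array(["foo"], type=pa.utf8())],
                            names=['strs'])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, batch.schema) as writer:
        writer.write_batch(batch)
    reader = pa.ipc.open_stream(sink.getvalue())
    return reader.read_all()  # a pa.Table holding the single batch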
def _parse_arrow_message(self, message):
    self._parse_arrow_schema()
    # serialized_record_batch holds the IPC message bytes for a single batch.
    return pyarrow.read_record_batch(
        pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch),
        self._schema,
    )
def decode_pyarrow_records(b64_schema, b64_records):
    """
    Decode a base64-encoded record batch given its similarly encoded schema.

    Returns only the records; the schema travels separately alongside them.
    """
    pa_schema = AthenaSDKUtils.parse_encoded_schema(b64_schema)
    return pa.read_record_batch(base64.b64decode(b64_records), pa_schema)
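# A minimal sketch (hypothetical, not part of the SDK) of the matching
# encoder: the schema and the record batch are serialized to IPC messages and
# base64-encoded separately, which is the shape decode_pyarrow_records expects.
import base64
import pyarrow as pa

def encode_pyarrow_records(batch):
    b64_schema = base64.b64encode(batch.schema.serialize().to_pybytes())
    b64_records = base64.b64encode(batch.serialize().to_pybytes())
    return b64_schema, b64_records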
def write_mutable_tensor(self, session_id, name, payload_type, body):
    import pyarrow
    from ..serialize import dataserializer
    from ..tensor.core import Indexes

    session_uid = SessionActor.gen_uid(session_id)
    session_ref = self.get_actor_ref(session_uid)

    # Body layout: 8-byte index JSON size, index JSON, then the payload.
    index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes
    if payload_type is None:
        value = dataserializer.loads(body[8 + index_json_size:])
    elif payload_type == 'tensor':
        tensor_chunk_offset = 8 + index_json_size
        with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        # Payload layout: 8-byte schema size, serialized schema, then the
        # serialized record batch.
        schema_size = np.frombuffer(
            body[8 + index_json_size:8 + index_json_size + 8],
            dtype=np.int64).item()
        schema_offset = 8 + index_json_size + 8
        with pyarrow.BufferReader(
                body[schema_offset:schema_offset + schema_size]) as reader:
            schema = pyarrow.read_schema(reader)
        record_batch_offset = schema_offset + schema_size
        with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
            record_batch = pyarrow.read_record_batch(reader, schema)
        value = record_batch.to_pandas().to_records(index=False)
    else:
        raise ValueError(f'Unsupported payload type: {payload_type}')
    return session_ref.write_mutable_tensor(name, index, value)
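# A minimal sketch (hypothetical) of how a 'record_batch' payload matching
# the layout parsed above could be assembled from a pyarrow.RecordBatch: an
# 8-byte index JSON size, the index JSON, an 8-byte schema size, the
# serialized schema, then the serialized record batch.
import json

import numpy as np

def build_record_batch_payload(index_json, batch):
    index_bytes = json.dumps(index_json).encode('ascii')
    schema_bytes = batch.schema.serialize().to_pybytes()
    batch_bytes = batch.serialize().to_pybytes()
    return (np.int64(len(index_bytes)).tobytes() + index_bytes +
            np.int64(len(schema_bytes)).tobytes() + schema_bytes +
            batch_bytes)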
def test_schema_batch_serialize_methods():
    nrows = 5
    df = pd.DataFrame({
        'one': np.random.randn(nrows),
        'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']})
    batch = pa.RecordBatch.from_pandas(df)

    s_schema = batch.schema.serialize()
    s_batch = batch.serialize()

    recons_schema = pa.read_schema(s_schema)
    recons_batch = pa.read_record_batch(s_batch, recons_schema)
    assert recons_batch.equals(batch)
def test_batch_serialize():
    batch = make_recordbatch(10)
    hbuf = batch.serialize()
    cbuf = cuda.serialize_record_batch(batch, global_context)
    # test that read_record_batch works properly:
    cuda.read_record_batch(batch.schema, cbuf)
    buf = cbuf.copy_to_host()
    assert hbuf.equals(buf)
    batch2 = pa.read_record_batch(buf, batch.schema)
    assert hbuf.equals(batch2.serialize())
    assert batch.num_columns == batch2.num_columns
    assert batch.num_rows == batch2.num_rows
    assert batch.column(0).equals(batch2.column(0))
    assert batch.equals(batch2)
def test_batch_serialize():
    batch = make_recordbatch(10)
    hbuf = batch.serialize()
    cbuf = cuda.serialize_record_batch(batch, global_context)
    # test that read_record_batch works properly:
    cuda.read_record_batch(cbuf, batch.schema)
    buf = cbuf.copy_to_host()
    assert hbuf.equals(buf)
    batch2 = pa.read_record_batch(buf, batch.schema)
    assert hbuf.equals(batch2.serialize())
    assert batch.num_columns == batch2.num_columns
    assert batch.num_rows == batch2.num_rows
    assert batch.column(0).equals(batch2.column(0))
    assert batch.equals(batch2)
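# A hypothetical stand-in for the make_recordbatch() helper the two CUDA
# tests above assume; any single-column batch of the requested length would
# do, so the column name and dtype here are illustrative only.
import numpy as np
import pyarrow as pa

def make_recordbatch(length):
    return pa.RecordBatch.from_arrays(
        [pa.array(np.random.randn(length))], names=['f0'])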
def _load_data(buf, schema):
    """
    Load a `pandas.DataFrame` from a buffer written to shared memory

    Parameters
    ----------
    buf : pyarrow.Buffer
    schema : pyarrow.Schema

    Returns
    -------
    df : pandas.DataFrame
    """
    import pyarrow as pa

    message = pa.read_message(buf)
    rb = pa.read_record_batch(message, schema)
    return rb.to_pandas()
def _load_data(buf, schema, tdf=None):
    """
    Load a `pandas.DataFrame` from a buffer written to shared memory

    Parameters
    ----------
    buf : pyarrow.Buffer
    schema : pyarrow.Schema
    tdf : TDataFrame, optional

    Returns
    -------
    df : pandas.DataFrame
    """
    message = pa.read_message(buf)
    rb = pa.read_record_batch(message, schema)
    df = rb.to_pandas()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)
    df.set_tdf(tdf)
    return df
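# A minimal sketch (hypothetical) of the producer side both _load_data
# variants assume: the buffer must contain exactly one IPC record-batch
# message, which is what RecordBatch.serialize() produces, with the schema
# passed out of band.
import pyarrow as pa

def _dump_data(df):
    batch = pa.RecordBatch.from_pandas(df)
    return batch.serialize(), batch.schema  # buf consumable by pa.read_message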
def test_read_record_batch(self):
    batches, messages = self._get_example_messages()

    for batch, message in zip(batches, messages[1:]):
        read_batch = pa.read_record_batch(message, batch.schema)
        assert read_batch.equals(batch)
def test_message_read_record_batch(example_messages):
    batches, messages = example_messages
    for batch, message in zip(batches, messages[1:]):
        read_batch = pa.read_record_batch(message, batch.schema)
        assert read_batch.equals(batch)
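# A minimal sketch (hypothetical) of the message fixture shape both tests
# above rely on: writing batches to a stream yields one schema message
# followed by one message per batch, which is why the tests skip messages[0].
import pyarrow as pa

def build_example_messages(batches):
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, batches[0].schema) as writer:
        for batch in batches:
            writer.write_batch(batch)
    reader = pa.MessageReader.open_stream(pa.BufferReader(sink.getvalue()))
    messages = []
    while True:
        try:
            messages.append(reader.read_next_message())
        except StopIteration:  # raised at end of stream
            break
    return messages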
def get_dfs_arrow(object_ids):
    buffers = client.get_buffers(object_ids)
    return [
        pa.read_record_batch(pa.BufferReader(buf), test_schema()).to_pandas()
        for buf in buffers
    ]
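# A hypothetical stand-in for the test_schema() helper assumed above; the
# real schema depends on what the writer stored. Each buffer fetched from the
# object store must hold a single serialized record-batch message matching it.
import pyarrow as pa

def test_schema():
    return pa.schema([('a', pa.int64()), ('b', pa.float64())])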