def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())
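# For reference, a minimal sketch (not from the original test suite) of the
# same round-trip through the non-deprecated pyarrow.ipc namespace that the
# FutureWarnings above point to. Assumes a pyarrow version where pa.table
# and the pa.ipc module are available.
def roundtrip_via_ipc_namespace():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write(table)
    # pa.ipc.open_stream is the supported replacement for pa.open_stream
    reader = pa.ipc.open_stream(sink.getvalue())
    return reader.read_all()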
def test_open_stream_from_buffer(stream_fixture):
    # ARROW-2859
    _, batches = stream_fixture.write_batches()
    source = stream_fixture.get_source()

    reader1 = pa.open_stream(source)
    reader2 = pa.open_stream(pa.BufferReader(source))
    reader3 = pa.RecordBatchStreamReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)
def test_read_all(self):
    _, batches = self.write_batches()
    file_contents = pa.BufferReader(self._get_source())

    reader = pa.open_stream(file_contents)

    result = reader.read_all()
    expected = pa.Table.from_batches(batches)
    assert result.equals(expected)
def test_read_pandas(self):
    frames, _ = self.write_batches()
    file_contents = pa.BufferReader(self._get_source())
    reader = pa.open_stream(file_contents)
    result = reader.read_pandas()
    expected = pd.concat(frames)
    assert_frame_equal(result, expected)
def load_stream(self, stream):
    import pyarrow as pa
    if LooseVersion(pa.__version__) >= "0.12.0":
        reader = pa.ipc.open_stream(stream)
    else:
        reader = pa.open_stream(stream)
    for batch in reader:
        yield batch
def test_stream_read_pandas(stream_fixture):
    frames, _ = stream_fixture.write_batches()
    file_contents = stream_fixture.get_source()
    reader = pa.open_stream(file_contents)
    result = reader.read_pandas()
    expected = pd.concat(frames)
    assert_frame_equal(result, expected)
def load_stream(self, stream):
    """
    Deserialize ArrowRecordBatches to an Arrow table and return as a
    list of pandas.Series.
    """
    import pyarrow as pa
    reader = pa.open_stream(stream)
    for batch in reader:
        table = pa.Table.from_batches([batch])
        yield [c.to_pandas() for c in table.itercolumns()]
def load_stream(self, stream):
    """
    Deserialize ArrowRecordBatches to an Arrow table and return as a
    list of pandas.Series.
    """
    from pyspark.sql.types import _check_dataframe_localize_timestamps
    import pyarrow as pa
    reader = pa.open_stream(stream)
    for batch in reader:
        # NOTE: changed from pa.Columns.to_pandas, timezone issue in
        # conversion fixed in 0.7.1
        pdf = _check_dataframe_localize_timestamps(batch.to_pandas())
        yield [c for _, c in pdf.iteritems()]
def load_stream(self, stream):
    """
    Deserialize ArrowRecordBatches to an Arrow table and return as a
    list of pandas.Series.
    """
    from pyspark.sql.types import _check_dataframe_localize_timestamps
    import pyarrow as pa
    reader = pa.open_stream(stream)
    for batch in reader:
        # NOTE: changed from pa.Columns.to_pandas, timezone issue in
        # conversion fixed in 0.7.1
        pdf = _check_dataframe_localize_timestamps(batch.to_pandas(),
                                                   self._timezone)
        yield [c for _, c in pdf.iteritems()]
def run(self):
    connection, client_address = self._sock.accept()
    try:
        source = connection.makefile(mode='rb')
        reader = pa.open_stream(source)
        self._schema = reader.schema
        if self._do_read_all:
            self._table = reader.read_all()
        else:
            for i, batch in enumerate(reader):
                self._batches.append(batch)
    finally:
        connection.close()
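# A possible client-side counterpart to the reader thread above, writing an
# Arrow stream over a socket. A hedged sketch: the function name and the
# socket handling are illustrative assumptions, not taken from the original
# source; only the stream format matches what run() reads.
import socket

def stream_table_to_socket(table, host, port):
    sock = socket.create_connection((host, port))
    try:
        sink = sock.makefile(mode='wb')
        writer = pa.RecordBatchStreamWriter(sink, table.schema)
        for batch in table.to_batches():
            writer.write_batch(batch)
        writer.close()  # emits the end-of-stream marker
        sink.flush()
    finally:
        sock.close()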
def load_stream(self, stream):
    """
    Deserialize ArrowRecordBatches to an Arrow table and return as a
    list of pandas.Series.
    """
    from pyspark.sql.types import from_arrow_schema, _check_dataframe_convert_date, \
        _check_dataframe_localize_timestamps
    import pyarrow as pa
    reader = pa.open_stream(stream)
    schema = from_arrow_schema(reader.schema)
    for batch in reader:
        pdf = batch.to_pandas()
        pdf = _check_dataframe_convert_date(pdf, schema)
        pdf = _check_dataframe_localize_timestamps(pdf, self._timezone)
        yield [c for _, c in pdf.iteritems()]
def load_stream(self, stream):
    import pyarrow as pa
    if LooseVersion(pa.__version__) >= "0.12.0":
        reader = pa.ipc.open_stream(stream)
    else:
        reader = pa.open_stream(stream)
    for batch in reader:
        yield [
            self.arrow_to_pandas(c)
            for c in pa.Table.from_batches([batch]).itercolumns()
        ]
def test_categorical_roundtrip(self):
    df = pd.DataFrame({
        'one': np.random.randn(5),
        'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
                              categories=['foo', 'bar'],
                              ordered=True)
    })
    batch = pa.RecordBatch.from_pandas(df)
    writer = self._get_writer(self.sink, batch.schema)
    writer.write_batch(pa.RecordBatch.from_pandas(df))
    writer.close()

    table = (pa.open_stream(pa.BufferReader(self._get_source()))
             .read_all())
    assert_frame_equal(table.to_pandas(), df)
def test_simple_roundtrip(self):
    _, batches = self.write_batches()
    file_contents = pa.BufferReader(self._get_source())
    reader = pa.open_stream(file_contents)

    assert reader.schema.equals(batches[0].schema)

    total = 0
    for i, next_batch in enumerate(reader):
        assert next_batch.equals(batches[i])
        total += 1

    assert total == len(batches)

    with pytest.raises(StopIteration):
        reader.get_next_batch()
def test_ipc_stream_no_batches():
    # ARROW-2307
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4]),
                                  pa.array(['foo', 'bar', 'baz', 'qux'])],
                                 names=['a', 'b'])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, table.schema)
    writer.close()

    source = sink.get_result()
    reader = pa.open_stream(source)
    result = reader.read_all()

    assert result.schema.equals(table.schema)
    assert len(result) == 0
def test_ipc_stream_no_batches():
    # ARROW-2307
    table = pa.Table.from_arrays(
        [pa.array([1, 2, 3, 4]),
         pa.array(['foo', 'bar', 'baz', 'qux'])],
        names=['a', 'b'])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, table.schema)
    writer.close()

    source = sink.getvalue()
    reader = pa.open_stream(source)
    result = reader.read_all()

    assert result.schema.equals(table.schema)
    assert len(result) == 0
def test_arrow_chunk(scidb_con, url):
    prefix = 'arrow_chunk'
    url = '{}/{}'.format(url, prefix)
    schema = '<v:int64> [i=0:999:0:1000]'

    # Store
    # if url.startswith('s3://'):
    scidb_con.iquery("""
xsave(
  build({}, i),
  '{}')""".format(schema, url))

    # Re-write one SciDB Chunk file to use multiple Arrow Chunks
    if url.startswith('s3://'):
        s3_key = '{}/{}/chunks/c_0'.format(base_prefix, prefix)
        obj = s3_con.get_object(Bucket=s3_bucket, Key=s3_key)
        reader = pyarrow.ipc.open_stream(obj['Body'].read())
    elif url.startswith('file://'):
        fn = '{}/{}/chunks/c_0'.format(fs_base, prefix)
        reader = pyarrow.open_stream(pyarrow.OSFile(fn))
    tbl = reader.read_all()

    if url.startswith('s3://'):
        sink = pyarrow.BufferOutputStream()
        writer = pyarrow.ipc.RecordBatchStreamWriter(sink, tbl.schema)
    elif url.startswith('file://'):
        writer = pyarrow.ipc.RecordBatchStreamWriter(fn, tbl.schema)
    batches = tbl.to_batches(max_chunksize=200)  # 1000 / 200 = 5 chunks
    writer.write_table(pyarrow.Table.from_batches(batches))
    writer.close()
    if url.startswith('s3://'):
        s3_con.put_object(Body=sink.getvalue().to_pybytes(),
                          Bucket=s3_bucket,
                          Key=s3_key)

    # Input
    que = "xinput('{}')".format(url)
    with pytest.raises(requests.exceptions.HTTPError):
        array = scidb_con.iquery(que, fetch=True)
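# The chunk-splitting step from the test above, in isolation: read a
# single-batch Arrow stream and rewrite it as several smaller record
# batches. A minimal sketch assuming in-memory buffers instead of S3 or
# the local filesystem; the rechunk_stream name is illustrative.
def rechunk_stream(buf, max_chunksize):
    tbl = pyarrow.ipc.open_stream(buf).read_all()
    sink = pyarrow.BufferOutputStream()
    writer = pyarrow.ipc.RecordBatchStreamWriter(sink, tbl.schema)
    writer.write_table(pyarrow.Table.from_batches(
        tbl.to_batches(max_chunksize=max_chunksize)))
    writer.close()
    return sink.getvalue()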
def test_stream_write_dispatch(self):
    # ARROW-1616
    df = pd.DataFrame({
        'one': np.random.randn(5),
        'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
                              categories=['foo', 'bar'],
                              ordered=True)
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    writer = self._get_writer(self.sink, table.schema)
    writer.write(table)
    writer.write(batch)
    writer.close()

    table = (pa.open_stream(pa.BufferReader(self._get_source()))
             .read_all())
    assert_frame_equal(table.to_pandas(),
                       pd.concat([df, df], ignore_index=True))
def test_stream_write_table_batches(stream_fixture):
    # ARROW-504
    df = pd.DataFrame({
        'one': np.random.randn(20),
    })

    b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
    b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)

    table = pa.Table.from_batches([b1, b2, b1])

    writer = stream_fixture._get_writer(stream_fixture.sink, table.schema)
    writer.write_table(table, chunksize=15)
    writer.close()

    batches = list(pa.open_stream(stream_fixture.get_source()))

    assert list(map(len, batches)) == [10, 15, 5, 10]
    result_table = pa.Table.from_batches(batches)
    assert_frame_equal(result_table.to_pandas(),
                       pd.concat([df[:10], df, df[:10]],
                                 ignore_index=True))
def test_stream_write_table_batches(self):
    # ARROW-504
    df = pd.DataFrame({
        'one': np.random.randn(20),
    })

    b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
    b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)

    table = pa.Table.from_batches([b1, b2, b1])

    writer = self._get_writer(self.sink, table.schema)
    writer.write_table(table, chunksize=15)
    writer.close()

    batches = list(pa.open_stream(pa.BufferReader(self._get_source())))

    assert list(map(len, batches)) == [10, 15, 5, 10]
    result_table = pa.Table.from_batches(batches)
    assert_frame_equal(result_table.to_pandas(),
                       pd.concat([df[:10], df, df[:10]],
                                 ignore_index=True))
def _load(self):
    source = pa.memory_map(self.path)
    reader = pa.open_stream(source)
    table = pa.Table.from_batches([b for b in reader])
    self._load_table(table)
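# A plausible counterpart for the loader above: persist a table to
# self.path as an Arrow stream so that _load() can memory-map it back.
# The _save name is hypothetical; only the stream format matches _load().
def _save(self, table):
    with pa.OSFile(self.path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, table.schema)
        writer.write_table(table)
        writer.close()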
def deserialize(self, serialized_rows):
    reader = pa.open_stream(serialized_rows)
    table = reader.read_all()
    return table
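# The inverse operation, sketched for symmetry: produce the bytes that
# deserialize() above consumes. The serialize name and the use of
# pa.ipc.new_stream are assumptions, not part of the original class.
def serialize(self, table):
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue()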
def load_stream(self, stream):
    import pyarrow as pa
    reader = pa.open_stream(stream)
    for batch in reader:
        yield batch
def test_empty_stream(self):
    buf = io.BytesIO(b'')
    with pytest.raises(pa.ArrowInvalid):
        pa.open_stream(buf)
from io import BytesIO

from pyarrow import open_stream


def to_arrow_stream_reader(arrow_bytes):
    """
    Wrap raw Arrow stream bytes in a pyarrow.RecordBatchStreamReader.
    """
    # The Arrow IPC stream format is binary, so BytesIO (not StringIO)
    # is the correct wrapper for the serialized bytes.
    io_bytes = BytesIO(arrow_bytes)
    arrow_stream_reader = open_stream(io_bytes)
    return arrow_stream_reader
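# Example usage, assuming arrow_bytes was produced by a
# RecordBatchStreamWriter:
#   reader = to_arrow_stream_reader(arrow_bytes)
#   table = reader.read_all()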