def test_parquet(tmpdir, registered_period_type):
    """Roundtrip a primitive extension type through a Parquet file.

    The column is stored with its storage type (int64); the extension
    information travels in the serialized Arrow schema kept in the Parquet
    key-value metadata, so the extension type is reconstructed on read only
    when it is registered.
    """
    # parquet support for extension types
    period_type = PeriodType('D')
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq
    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    # Use the pa.ipc namespace (consistent with the other parquet extension
    # test in this file) rather than the top-level read_schema alias.
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    assert schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'}

    # when reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.column("ext").type == period_type

    # when the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.column("ext").type == pa.int64()
def registered_period_type():
    """Fixture: register ``PeriodType('D')``, then always unregister it.

    Teardown tolerates the case where the test body has already removed
    the registration itself.
    """
    # setup
    pa.register_extension_type(PeriodType('D'))
    yield
    # teardown — a KeyError means the test already unregistered the type
    try:
        pa.unregister_extension_type('pandas.period')
    except KeyError:
        pass
def registered_period_type(request):
    """Parametrized fixture: register the requested extension type.

    Yields ``(extension_type, extension_array_class)`` and unregisters
    ``'test.period'`` on teardown, tolerating tests that already removed
    the registration themselves.
    """
    # setup
    ext_type = request.param
    ext_array_class = ext_type.__arrow_ext_class__()
    pa.register_extension_type(ext_type)
    yield ext_type, ext_array_class
    # teardown — a KeyError means the test already unregistered the type
    try:
        pa.unregister_extension_type('test.period')
    except KeyError:
        pass
def test_parquet_period(tmpdir, registered_period_type):
    """Roundtrip a primitive (period) extension type through Parquet.

    Checks both directions: with the type registered, the extension type
    and its exact array class come back; without it, the column falls
    back to the storage type while keeping the extension metadata.
    """
    period_type, period_class = registered_period_type
    arr = pa.ExtensionArray.from_storage(
        period_type, pa.array([1, 2, 3, 4], pa.int64()))
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq
    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # On disk the column uses the storage type; the extension information
    # travels in the serialized Arrow schema inside the Parquet metadata.
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    serialized = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(serialized))
    # Since the type could be reconstructed, the extension type metadata is
    # absent from the serialized schema field.
    assert schema.field("ext").metadata == {}

    # With the type registered, reading restores the extension type ...
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}
    # ... and produces the exact array class defined by the registered type.
    assert type(result.column("ext").chunk(0)) is period_class

    # Without the registration, the column is read as its storage type,
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # and the extension metadata is preserved for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1'}
def test_generic_ext_type_ipc_unknown(registered_period_type):
    """Reading an IPC batch whose extension type is no longer registered.

    The column comes back as a plain storage-type array, while the
    extension metadata stays attached to the schema field.
    """
    storage = pa.array([1, 2, 3, 4], pa.int64())
    batch = pa.RecordBatch.from_arrays(
        [pa.ExtensionArray.from_storage(PeriodType('D'), storage)], ["ext"])
    buf = ipc_write_batch(batch)
    del batch

    # unregister type before loading again => reading unknown extension type
    # as plain array (but metadata in schema's field are preserved)
    pa.unregister_extension_type('pandas.period')
    batch = ipc_read_batch(buf)

    assert isinstance(batch.column(0), pa.Int64Array)
    ext_field = batch.schema.field('ext')
    assert ext_field.metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'}