def test_write_compliant_nested_type_disable(tempdir, use_legacy_dataset, test_data):
    """With the compliant-nested-type flag left at its default (disabled),
    nested list columns keep the legacy ``item`` field name on disk."""
    # Build the input frame from the fixture-provided data.
    frame = pd.DataFrame(data=test_data)

    # Default behaviour (flag disabled) must roundtrip through pandas cleanly.
    _roundtrip_pandas_dataframe(frame, write_kwargs={},
                                use_legacy_dataset=use_legacy_dataset)

    # Write the table out without enabling the compliant nested type.
    table = pa.Table.from_pandas(frame, preserve_index=False)
    out_path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(out_path, table.schema, version='2.6') as writer:
        writer.write_table(table)

    reread = _read_table(out_path)

    # The first column must be a non-compliant list, i.e. shaped like
    # list<item: struct<name: string, value: string>> — the inner field
    # is named 'item' rather than the Parquet-compliant 'element'.
    assert isinstance(reread.schema.types[0], pa.ListType)
    assert reread.schema.types[0].value_field.name == 'item'

    # The re-read table itself must also roundtrip with the flag disabled.
    _check_roundtrip(reread, use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=False)
def test_write_compliant_nested_type_enable(tempdir, use_legacy_dataset, test_data):
    """With ``use_compliant_nested_type=True``, nested list columns are
    written with the Parquet-compliant ``element`` inner field name.

    Mirror of ``test_write_compliant_nested_type_disable``; the only
    intended difference is the flag (and the resulting field name).
    """
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write pandas df with new flag
    _roundtrip_pandas_dataframe(
        df, write_kwargs={'use_compliant_nested_type': True},
        use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file with compliant nested type.
    # NOTE: use format version '2.6' to match the companion disable-test;
    # '2.0' is a deprecated alias in pyarrow's ParquetWriter.
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema,
                          use_compliant_nested_type=True,
                          version='2.6') as writer:
        writer.write_table(table)

    # Read back as a table
    new_table = _read_table(path)
    # Validate that "items" columns compliant to Parquet nested format
    # Should be like this: list<element: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table, use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=True)
def test_datetime_timezone_tzinfo(use_legacy_dataset):
    """A tz-aware datetime carrying stdlib ``datetime.timezone.utc``
    survives the pandas -> parquet -> pandas roundtrip."""
    aware_ts = datetime.datetime(2018, 1, 1, 1, 23, 45,
                                 tzinfo=datetime.timezone.utc)
    frame = pd.DataFrame({'foo': [aware_ts]})

    _roundtrip_pandas_dataframe(
        frame, write_kwargs={}, use_legacy_dataset=use_legacy_dataset)
def test_spark_flavor_preserves_pandas_metadata():
    """Writing with ``flavor='spark'`` must not lose the pandas index
    metadata: the roundtripped frame compares equal, index included."""
    original = _test_dataframe(size=100)
    # Give the frame a named, non-default integer index so that losing
    # the pandas metadata would be detected by the equality check.
    original.index = np.arange(0, 10 * len(original), 10)
    original.index.name = 'foo'

    restored = _roundtrip_pandas_dataframe(
        original, {'version': '2.0', 'flavor': 'spark'})
    tm.assert_frame_equal(restored, original)
def test_list_of_datetime_time_roundtrip():
    # ARROW-4135: a cell holding an array of datetime.time objects must
    # roundtrip through parquet unchanged.
    parsed = pd.to_datetime(
        ['09:00', '09:30', '10:00', '10:30', '11:00', '11:30', '12:00'])
    frame = pd.DataFrame({'time': [parsed.time]})

    _roundtrip_pandas_dataframe(frame, write_kwargs={})