Example #1
def test_write_compliant_nested_type_disable(tempdir, use_legacy_dataset,
                                             test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write with new flag disabled (default behaviour)
    _roundtrip_pandas_dataframe(df,
                                write_kwargs={},
                                use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file while disabling compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema,
                          use_compliant_nested_type=False,
                          version='2.6') as writer:
        writer.write_table(table)
    new_table = _read_table(path)

    # Validate that "items" columns is not compliant to Parquet nested format
    # Should be like this: list<item: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'item'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=False)
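
For reference, here is a minimal self-contained sketch of the same check, assuming pyarrow >= 4.0 (where use_compliant_nested_type was introduced) and with the suite's helpers (_roundtrip_pandas_dataframe, _read_table, the tempdir fixture) replaced by direct pyarrow calls; the 'items' column and the file path are illustrative:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'items': [[{'name': 'a', 'value': '1'}]]})
table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, 'data.parquet', use_compliant_nested_type=False)

schema = pq.read_table('data.parquet').schema
# With the flag disabled the list element keeps Arrow's default name:
# list<item: struct<name: string, value: string>>
assert isinstance(schema.types[0], pa.ListType)
assert schema.types[0].value_field.name == 'item'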
Example #2
def test_write_compliant_nested_type_enable(tempdir, use_legacy_dataset,
                                            test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write the pandas df with the new flag enabled
    _roundtrip_pandas_dataframe(
        df,
        write_kwargs={'use_compliant_nested_type': True},
        use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file with compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path,
                          table.schema,
                          use_compliant_nested_type=True,
                          version='2.6') as writer:
        writer.write_table(table)
    # Read back as a table
    new_table = _read_table(path)
    # Validate that "items" columns compliant to Parquet nested format
    # Should be like this: list<element: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=True)
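
The enabled case can be sketched the same way; under the same assumptions as above, the only differences are the flag value and the element name mandated by the Parquet spec:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'items': [[{'name': 'a', 'value': '1'}]]})
table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, 'data.parquet',
               use_compliant_nested_type=True, version='2.6')

schema = pq.read_table('data.parquet').schema
# With the flag enabled the element is named per the Parquet spec:
# list<element: struct<name: string, value: string>>
assert schema.types[0].value_field.name == 'element'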
Example #3
def test_datetime_timezone_tzinfo(use_legacy_dataset):
    value = datetime.datetime(2018, 1, 1, 1, 23, 45,
                              tzinfo=datetime.timezone.utc)
    df = pd.DataFrame({'foo': [value]})

    _roundtrip_pandas_dataframe(
        df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset)
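
A standalone version of this roundtrip, without the _roundtrip_pandas_dataframe helper (the file path is illustrative), would look roughly like this:

import datetime
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

value = datetime.datetime(2018, 1, 1, 1, 23, 45,
                          tzinfo=datetime.timezone.utc)
df = pd.DataFrame({'foo': [value]})

table = pa.Table.from_pandas(df)
pq.write_table(table, 'tz.parquet')
result = pq.read_table('tz.parquet').to_pandas()
# The UTC timezone survives the write/read roundtrip.
assert result['foo'].dt.tz is not None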
Example #4
def test_spark_flavor_preserves_pandas_metadata():
    df = _test_dataframe(size=100)
    df.index = np.arange(0, 10 * len(df), 10)
    df.index.name = 'foo'

    result = _roundtrip_pandas_dataframe(df, {'version': '2.6',
                                              'flavor': 'spark'})
    tm.assert_frame_equal(result, df)
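
Sketching this without the suite's _test_dataframe helper (the frame below is a stand-in), the point is that flavor='spark' adjusts the schema for Spark compatibility while the stored pandas metadata still restores the named index on read:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'a': np.arange(5)})
df.index = np.arange(0, 10 * len(df), 10)
df.index.name = 'foo'

table = pa.Table.from_pandas(df)
pq.write_table(table, 'spark.parquet', version='2.6', flavor='spark')
result = pq.read_table('spark.parquet').to_pandas()
# The named index comes back from the stored pandas metadata.
assert result.index.name == 'foo'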
Example #5
def test_list_of_datetime_time_roundtrip():
    # ARROW-4135
    times = pd.to_datetime(
        ['09:00', '09:30', '10:00', '10:30', '11:00', '11:30', '12:00'])
    df = pd.DataFrame({'time': [times.time]})
    _roundtrip_pandas_dataframe(df, write_kwargs={})
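
A self-contained equivalent of this regression check (file path illustrative) shows the list-of-time column surviving the roundtrip:

import datetime
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

times = pd.to_datetime(['09:00', '09:30', '10:00'])
df = pd.DataFrame({'time': [times.time]})

table = pa.Table.from_pandas(df)
pq.write_table(table, 'times.parquet')
result = pq.read_table('times.parquet').to_pandas()
# Each cell comes back as a list of datetime.time values.
assert all(isinstance(t, datetime.time) for t in result['time'][0])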