# NOTE: these tests assume the module-level context of the PyArrow Parquet
# test suite: the imports below, plus helpers defined elsewhere in the suite
# (_read_table, _write_table, _test_dataframe, _check_roundtrip,
# _roundtrip_pandas_dataframe, alltypes_sample, dataframe_with_arrays,
# dataframe_with_lists, util, and the legacy FileSystem base class).
import datetime
import decimal
import io

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

import pyarrow as pa
import pyarrow.parquet as pq


def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(
        filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(
        filename, columns=['uint8', 'uint8'],
        use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        _write_table(arrow_table, filename, version='2.0',
                     use_dictionary=use_dictionary)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for write_statistics in [True, False]:
        _write_table(arrow_table, filename, version='2.0',
                     write_statistics=write_statistics)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
        if (compression != 'NONE' and
                not pa.lib.Codec.is_available(compression)):
            continue
        _write_table(arrow_table, filename, version='2.0',
                     compression=compression)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)
def test_fspath(tempdir, use_legacy_dataset):
    # ARROW-12472 support __fspath__ objects without using str()
    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(fs_protocol_obj,
                         use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())
def test_write_compliant_nested_type_disable(tempdir, use_legacy_dataset,
                                             test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write with the new flag disabled
    # (default behaviour)
    _roundtrip_pandas_dataframe(df, write_kwargs={},
                                use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file while disabling compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
        writer.write_table(table)

    new_table = _read_table(path)
    # Validate that the "items" column is not compliant with the Parquet
    # nested format, i.e. it looks like:
    # list<item: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'item'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=False)
def test_write_compliant_nested_type_enable(tempdir, use_legacy_dataset,
                                            test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write the pandas df with the new flag
    _roundtrip_pandas_dataframe(
        df, write_kwargs={'use_compliant_nested_type': True},
        use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file with compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema,
                          use_compliant_nested_type=True,
                          version='2.0') as writer:
        writer.write_table(table)

    # Read back as a table
    new_table = _read_table(path)
    # Validate that the "items" column is compliant with the Parquet
    # nested format, i.e. it looks like:
    # list<element: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=True)
def test_multiple_path_types(tempdir, use_legacy_dataset):
    # Test compatibility with PEP 519 path-like objects
    path = tempdir / 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)

    # Test compatibility with plain string paths
    path = str(tempdir) + 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_coerce_timestamps_truncated(tempdir):
    """
    ARROW-2555: Test that we can truncate timestamps when coercing if
    explicitly allowed.
    """
    dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1, microsecond=1)
    dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1)

    fields_us = [pa.field('datetime64', pa.timestamp('us'))]
    arrays_us = {'datetime64': [dt_us, dt_ms]}

    df_us = pd.DataFrame(arrays_us)
    schema_us = pa.schema(fields_us)

    filename = tempdir / 'pandas_truncated.parquet'
    table_us = pa.Table.from_pandas(df_us, schema=schema_us)

    _write_table(table_us, filename, version="2.0", coerce_timestamps='ms',
                 allow_truncated_timestamps=True)
    table_ms = _read_table(filename)
    df_ms = table_ms.to_pandas()

    arrays_expected = {'datetime64': [dt_ms, dt_ms]}
    df_expected = pd.DataFrame(arrays_expected)
    tm.assert_frame_equal(df_expected, df_ms)
def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename, version='1.0')
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    tm.assert_frame_equal(df, df_read)
def test_backwards_compatible_index_multi_level_some_named(
    datadir, use_legacy_dataset
):
    # Column alignment below uses 2+ spaces between fields to match the
    # sep=r'\s{2,}' regex passed to read_csv.
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}',
        index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()
    expected.index = expected.index.set_names(['cut', None, 'clarity'])

    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)
def test_parquet_incremental_file_build(tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.0')

    frames = []
    for i in range(10):
        df['unique_id'] = i
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        writer.write_table(arrow_table)

        frames.append(df.copy())

    writer.close()

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def test_parquet_writer_context_obj_with_exception(
    tempdir, use_legacy_dataset
):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out, arrow_table.schema,
                              version='2.0') as writer:
            frames = []
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def test_multithreaded_read(use_legacy_dataset):
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(table, buf, compression='SNAPPY', version='2.6')

    buf.seek(0)
    table1 = _read_table(
        buf, use_threads=True, use_legacy_dataset=use_legacy_dataset)

    buf.seek(0)
    table2 = _read_table(
        buf, use_threads=False, use_legacy_dataset=use_legacy_dataset)

    assert table1.equals(table2)
def test_special_chars_filename(tempdir, use_legacy_dataset):
    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
    filename = "foo # bar"
    path = tempdir / filename
    assert not path.exists()
    _write_table(table, str(path))
    assert path.exists()

    table_read = _read_table(str(path),
                             use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
def test_decimal_roundtrip_negative_scale(tempdir):
    expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
    filename = tempdir / 'decimals.parquet'
    string_filename = str(filename)
    t = pa.Table.from_pandas(expected)
    _write_table(t, string_filename)
    result_table = _read_table(string_filename)
    result = result_table.to_pandas()
    tm.assert_frame_equal(result, expected)
def test_column_of_arrays(tempdir):
    df, schema = dataframe_with_arrays()

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version="2.0",
                 coerce_timestamps='ms')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_column_of_lists(tempdir):
    df, schema = dataframe_with_lists(parquet_compatible=True)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version='2.0')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_parquet_writer_write_wrappers(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    path_table = str(tempdir / 'data_table.parquet')
    path_batch = str(tempdir / 'data_batch.parquet')

    with pq.ParquetWriter(path_table, table.schema, filesystem=filesystem,
                          version='2.6') as writer:
        writer.write_table(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(path_batch, table.schema, filesystem=filesystem,
                          version='2.6') as writer:
        writer.write_batch(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(path_table, table.schema, filesystem=filesystem,
                          version='2.6') as writer:
        writer.write(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(path_batch, table.schema, filesystem=filesystem,
                          version='2.6') as writer:
        writer.write(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(uri, table.schema, version='2.0') as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)
def test_backwards_compatible_column_metadata_handling(
    datadir, use_legacy_dataset
):
    expected = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [.1, .2, .3],
         'c': pd.date_range("2017-01-01", periods=3,
                            tz='Europe/Brussels')})
    expected.index = pd.MultiIndex.from_arrays(
        [['a', 'b', 'c'],
         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
        names=['index', None])

    path = datadir / 'v0.7.1.column-metadata-handling.parquet'
    table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)

    table = _read_table(
        path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(
        path, table.schema, filesystem=filesystem, version='2.0'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, directory = s3_example_s3fs
    path = directory + "/test.parquet"

    with pq.ParquetWriter(path, table.schema, filesystem=fs,
                          version='2.6') as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)
def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
    filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
    data = [pa.array(list(map(dtype, range(5))))]
    table = pa.Table.from_arrays(data, names=['a'])
    _write_table(table, filename)
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    for i in range(table.num_columns):
        col_written = table[i]
        col_read = table_read[i]
        assert table.field(i).name == table_read.field(i).name
        assert col_read.num_chunks == 1
        data_written = col_written.chunk(0)
        data_read = col_read.chunk(0)
        assert data_written.equals(data_read)
def test_min_chunksize(use_legacy_dataset):
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    _write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = _read_table(buf, use_legacy_dataset=use_legacy_dataset)

    assert result.equals(table)

    with pytest.raises(ValueError):
        _write_table(table, buf, chunk_size=0)
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
    num_rows = 3
    numbers = list(range(num_rows))
    index = pd.MultiIndex.from_arrays(
        [['foo', 'foo', 'bar'], numbers],
        names=['foobar', 'some_numbers'],
    )

    df = pd.DataFrame({'numbers': numbers}, index=index)
    table = pa.Table.from_pandas(df)

    filename = tempdir / 'dup_multi_index_levels.parquet'

    _write_table(table, filename)
    result_table = _read_table(filename,
                               use_legacy_dataset=use_legacy_dataset)
    assert table.equals(result_table)

    result_df = result_table.to_pandas()
    tm.assert_frame_equal(result_df, df)
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
    data = {
        'close': {
            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
        },
        'time': {
            pd.Timestamp('2017-06-30 01:31:00'):
                pd.Timestamp('2017-06-30 01:31:00'),
            pd.Timestamp('2017-06-30 01:32:00'):
                pd.Timestamp('2017-06-30 01:32:00'),
        }
    }
    path = str(tempdir / 'data.parquet')
    dfx = pd.DataFrame(data).set_index('time', drop=False)
    tdfx = pa.Table.from_pandas(dfx)
    _write_table(tdfx, path)
    arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result_df = arrow_table.to_pandas()
    tm.assert_frame_equal(result_df, dfx)
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
    filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with filename.open('wb') as f:
        _write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(filename.read_bytes())

    table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_coerce_timestamps(tempdir):
    from collections import OrderedDict

    # ARROW-622
    arrays = OrderedDict()
    fields = [pa.field('datetime64', pa.list_(pa.timestamp('ms')))]
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)

    _write_table(arrow_table, filename, version='2.6',
                 coerce_timestamps='us')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()

    df_expected = df.copy()
    for i, x in enumerate(df_expected['datetime64']):
        if isinstance(x, np.ndarray):
            df_expected['datetime64'][i] = x.astype('M8[us]')

    tm.assert_frame_equal(df_expected, df_read)

    with pytest.raises(ValueError):
        _write_table(arrow_table, filename, version='2.6',
                     coerce_timestamps='unknown')
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
    out = pa.BufferOutputStream()

    class CustomFS(FileSystem):
        def __init__(self):
            self.path = None
            self.mode = None

        def open(self, path, mode='rb'):
            self.path = path
            self.mode = mode
            return out

    fs = CustomFS()
    fname = 'expected_fname.parquet'
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    with pq.ParquetWriter(fname, table.schema, filesystem=fs,
                          version='2.0') as writer:
        writer.write_table(table)

    assert fs.path == fname
    assert fs.mode == 'wb'
    assert out.closed

    buf = out.getvalue()
    table_read = _read_table(pa.BufferReader(buf),
                             use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df_read, df)

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema,
                         filesystem=fs)
    expected_msg = ("filesystem passed but where is file-like, so"
                    " there is nothing to open with filesystem.")
    # Compare against the raised exception's message, not the ExceptionInfo
    # object itself.
    assert str(err_info.value) == expected_msg
def test_decimal_roundtrip(tempdir, use_legacy_dataset):
    num_values = 10

    columns = {}
    for precision in range(1, 39):
        for scale in range(0, precision + 1):
            with util.random_seed(0):
                random_decimal_values = [
                    util.randdecimal(precision, scale)
                    for _ in range(num_values)
                ]
            column_name = ('dec_precision_{:d}_scale_{:d}'
                           .format(precision, scale))
            columns[column_name] = random_decimal_values

    expected = pd.DataFrame(columns)
    filename = tempdir / 'decimals.parquet'
    string_filename = str(filename)
    table = pa.Table.from_pandas(expected)
    _write_table(table, string_filename)
    result_table = _read_table(string_filename,
                               use_legacy_dataset=use_legacy_dataset)
    result = result_table.to_pandas()
    tm.assert_frame_equal(result, expected)