def test_parquet_incremental_file_build(tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.0')

    frames = []
    for i in range(10):
        df['unique_id'] = i
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        writer.write_table(arrow_table)

        frames.append(df.copy())

    writer.close()

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def test_parquet_writer_context_obj_with_exception(
        tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out,
                              arrow_table.schema,
                              version='2.0') as writer:
            frames = []
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    except Exception as e:
        assert str(e) == error_text

    # Even though the block raised, the context manager should have closed
    # the writer, so the tables written before the error remain readable.
    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def test_set_write_batch_size(use_legacy_dataset):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    _check_roundtrip(
        table, data_page_size=10, write_batch_size=1, version='2.4'
    )
def test_spark_flavor_preserves_pandas_metadata():
    df = _test_dataframe(size=100)
    df.index = np.arange(0, 10 * len(df), 10)
    df.index.name = 'foo'

    result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
                                              'flavor': 'spark'})
    tm.assert_frame_equal(result, df)
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)
def test_parquet_writer_filesystem_buffer_raises():
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    filesystem = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError, match="specified path is file-like"):
        pq.ParquetWriter(
            pa.BufferOutputStream(), table.schema, filesystem=filesystem
        )
def test_set_dictionary_pagesize_limit(use_legacy_dataset):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    _check_roundtrip(table, dictionary_pagesize_limit=1,
                     data_page_size=10, version='2.4')

    with pytest.raises(TypeError):
        _check_roundtrip(table, dictionary_pagesize_limit="a",
                         data_page_size=10, version='2.4')
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(uri, table.schema, version='2.0') as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(
        path, table.schema, filesystem=filesystem, version='2.0'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, directory = s3_example_s3fs
    path = directory + "/test.parquet"

    with pq.ParquetWriter(
        path, table.schema, filesystem=fs, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)
def test_parquet_writer_write_wrappers(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    path_table = str(tempdir / 'data_table.parquet')
    path_batch = str(tempdir / 'data_batch.parquet')

    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_batch(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)

    # write() should dispatch to write_table() or write_batch()
    # based on the type of its argument
    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)
def test_read_write_parquet_files_with_uri(self):
    import pyarrow.parquet as pq

    tmpdir = pjoin(self.tmp_path, 'uri-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)
    path = _get_hdfs_uri(pjoin(tmpdir, 'test.parquet'))

    size = 5
    df = _test_dataframe(size, seed=0)
    # Hack so that we don't have a dtype cast in v1 files
    df['uint32'] = df['uint32'].astype(np.int64)
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, path, filesystem=self.hdfs)

    result = pq.read_table(
        path, filesystem=self.hdfs, use_legacy_dataset=True
    ).to_pandas()

    _pandas_api.assert_frame_equal(result, df)
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size),
                            name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)

        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
    out = pa.BufferOutputStream()

    class CustomFS(FileSystem):
        def __init__(self):
            self.path = None
            self.mode = None

        def open(self, path, mode='rb'):
            self.path = path
            self.mode = mode
            return out

    fs = CustomFS()
    fname = 'expected_fname.parquet'
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.0') \
            as writer:
        writer.write_table(table)

    assert fs.path == fname
    assert fs.mode == 'wb'
    # Closing the writer should also close the stream it opened
    assert out.closed

    buf = out.getvalue()
    table_read = _read_table(pa.BufferReader(buf),
                             use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df_read, df)

    # Should raise ValueError when filesystem is passed with file-like object
    expected_msg = ("filesystem passed but where is file-like, so"
                    " there is nothing to open with filesystem.")
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema,
                         filesystem=fs)
    # Compare against the raised exception's message, not the
    # ExceptionInfo object itself
    assert str(err_info.value) == expected_msg
def _write_multiple_hdfs_pq_files(self, tmpdir):
    import pyarrow.parquet as pq
    nfiles = 10
    size = 5
    test_data = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)

    expected = pa.concat_tables(test_data)
    return expected