def test_write_metadata(tempdir):
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    parquet_meta = pq.read_metadata(path)
    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
    assert schema_as_arrow.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if schema_as_arrow.metadata:
        assert b'ARROW:schema' not in schema_as_arrow.metadata

    # pass through writer keyword arguments
    for version in ["1.0", "2.0"]:
        pq.write_metadata(schema, path, version=version)
        parquet_meta = pq.read_metadata(path)
        assert parquet_meta.format_version == version

    # metadata_collector: list of FileMetaData objects
    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
    pq.write_table(table, tempdir / "data.parquet")
    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
    pq.write_metadata(
        schema, path, metadata_collector=[parquet_meta, parquet_meta])
    parquet_meta_mult = pq.read_metadata(path)
    assert parquet_meta_mult.num_row_groups == 2

    # append metadata with different schema raises an error
    with pytest.raises(RuntimeError, match="requires equal schemas"):
        pq.write_metadata(
            pa.schema([("a", "int32"), ("b", "null")]),
            path, metadata_collector=[parquet_meta, parquet_meta])
def test_parquet_write_disable_statistics(tempdir):
    table = pa.Table.from_pydict(
        OrderedDict([
            ('a', pa.array([1, 2, 3])),
            ('b', pa.array(['a', 'b', 'c']))
        ]))
    _write_table(table, tempdir / 'data.parquet')
    meta = pq.read_metadata(tempdir / 'data.parquet')
    for col in [0, 1]:
        cc = meta.row_group(0).column(col)
        assert cc.is_stats_set is True
        assert cc.statistics is not None

    _write_table(table, tempdir / 'data2.parquet', write_statistics=False)
    meta = pq.read_metadata(tempdir / 'data2.parquet')
    for col in [0, 1]:
        cc = meta.row_group(0).column(col)
        assert cc.is_stats_set is False
        assert cc.statistics is None

    _write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
    meta = pq.read_metadata(tempdir / 'data3.parquet')
    cc_a = meta.row_group(0).column(0)
    cc_b = meta.row_group(0).column(1)
    assert cc_a.is_stats_set is True
    assert cc_b.is_stats_set is False
    assert cc_a.statistics is not None
    assert cc_b.statistics is None
def test_encrypted_parquet_read_metadata_no_decryption_config(
        tempdir, data_table):
    """Write an encrypted parquet, verify it's encrypted,
    but then try to read its metadata without decryption properties."""
    test_encrypted_parquet_write_read(tempdir, data_table)

    # Read metadata without decryption properties
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_metadata(tempdir / PARQUET_NAME)
def test_read_common_metadata_files(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    base_path = str(tmpdir)
    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    metadata_path = pjoin(base_path, '_metadata')
    pq.write_metadata(table.schema, metadata_path)

    dataset = pq.ParquetDataset(base_path)
    assert dataset.metadata_path == metadata_path

    common_schema = pq.read_metadata(data_path).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path])
    assert dataset2.schema.equals(dataset.schema)
def _test_read_common_metadata_files(fs, base_path):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)

    with fs.open(data_path, 'wb') as f:
        _write_table(table, f)

    metadata_path = pjoin(base_path, '_metadata')
    with fs.open(metadata_path, 'wb') as f:
        pq.write_metadata(table.schema, f)

    dataset = pq.ParquetDataset(base_path, filesystem=fs)
    assert dataset.metadata_path == metadata_path

    with fs.open(data_path) as f:
        common_schema = pq.read_metadata(f).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path], filesystem=fs)
    assert dataset2.schema.equals(dataset.schema)
def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row groups statistics of all files
    # Gives following error:
    #   File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #   RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than partitioned files when partitioning by one column
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata', metadata_collector=metadata_collector)

    # Read from partitioned dataset
    # use the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year), ('value', '>', value)])
    #                     filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
def get_row_group_info(path):
    fs = filesystem_factory()
    relative_path = os.path.relpath(path, base_path)
    pq_file = fs.open(path)
    num_row_groups = pq.read_metadata(pq_file).num_row_groups
    pq_file.close()
    return relative_path, num_row_groups
def test_parquet(tmpdir, registered_period_type):
    # parquet support for extension types
    period_type = PeriodType('D')
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.read_schema(pa.BufferReader(decoded_schema))
    assert schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'
    }

    # when reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.column("ext").type == period_type

    # when the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.column("ext").type == pa.int64()
def from_parquet(message_bytes, **kwargs):
    metadata = pq.read_metadata(message_bytes)
    cls = SCHEMA_TO_TYPE[metadata.metadata[b"type"]]
    table = pq.read_table(message_bytes, **kwargs)
    data = table.to_pydict()
    values = list(
        map(dict, zip(*([(key, val) for val in data[key]]
                        for key in data.keys()))))
    return _deserialize(name=cls, chunk=values)
def test_parquet_metadata_empty_to_dict(tempdir):
    # https://issues.apache.org/jira/browse/ARROW-10146
    table = pa.table({"a": pa.array([], type="int64")})
    pq.write_table(table, tempdir / "data.parquet")
    metadata = pq.read_metadata(tempdir / "data.parquet")
    # ensure this doesn't error / statistics set to None
    metadata_dict = metadata.to_dict()
    assert len(metadata_dict["row_groups"]) == 1
    assert len(metadata_dict["row_groups"][0]["columns"]) == 1
    assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
def read_metadata(filename: str):
    """Return only the metadata of a Parquet file written by PySpark.

    Args:
        filename (str): Path to the Parquet file.

    Returns:
        pyarrow.parquet.FileMetaData: The file-level Parquet metadata.
    """
    return pq.read_metadata(filename)
def test_add_pandas_engine_metadata_to_parquet_file(tmp_path) -> None:
    """Pandas engine metadata added.

    Args:
        tmp_path (Path): Pytest temporary path plugin
    """
    filepath = tmp_path / "test_meta.parquet"
    pd.DataFrame().to_parquet(filepath)
    add_file_engine_metadata_to_parquet_file(filepath, "pandas")
    assert pq.read_metadata(filepath).metadata[b"file_engine"] == b"pandas"
def test_add_geopandas_engine_metadata_to_parquet_file(tmp_path: Path) -> None:
    """Geopandas engine metadata added.

    Args:
        tmp_path (Path): Pytest temporary path plugin
    """
    filepath = tmp_path / "test_meta.parquet"
    gpd.GeoDataFrame().to_parquet(filepath)
    add_file_engine_metadata_to_parquet_file(filepath, "geopandas")
    assert pq.read_metadata(filepath).metadata[b"file_engine"] == b"geopandas"
def test_sink_transform_multiple_row_group(self):
    with TemporaryDirectory() as tmp_dirname:
        path = os.path.join(tmp_dirname, "tmp_filename")
        with TestPipeline() as p:
            # writing 623200 bytes of data
            _ = p \
                | Create(self.RECORDS * 4000) \
                | WriteToParquet(
                    path, self.SCHEMA, num_shards=1, codec='none',
                    shard_name_template='', row_group_buffer_size=250000)
        self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
def genes(self) -> Tuple[str]:
    # noinspection PyTypeChecker
    metadata = pq.read_metadata(self._fname)
    assert metadata.num_row_groups == 1, \
        "Parquet database {0:s} has more than one row group.".format(self._fname)
    metadata_row_group = metadata.row_group(0)
    # Get all gene names (exclude "features" column).
    return tuple(
        metadata_row_group.column(idx).path_in_schema
        for idx in range(0, metadata.num_columns)
        if metadata_row_group.column(idx).path_in_schema != INDEX_NAME)
def test_sink_transform_multiple_row_group(self):
    with tempfile.NamedTemporaryFile() as dst:
        path = dst.name
        with TestPipeline() as p:
            # writing 623200 bytes of data
            _ = p \
                | Create(self.RECORDS * 4000) \
                | WriteToParquet(
                    path, self.SCHEMA, num_shards=1, codec='none',
                    shard_name_template='', row_group_buffer_size=250000)
        self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
def test_metadata_exceeds_message_size():
    # ARROW-13655: Thrift may enable a default message size that limits
    # the size of Parquet metadata that can be written.
    NCOLS = 1000
    NREPEATS = 4000

    table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})

    with pa.BufferOutputStream() as out:
        pq.write_table(table, out)
        buf = out.getvalue()

    original_metadata = pq.read_metadata(pa.BufferReader(buf))
    metadata = pq.read_metadata(pa.BufferReader(buf))
    for i in range(NREPEATS):
        metadata.append_row_groups(original_metadata)

    with pa.BufferOutputStream() as out:
        metadata.write_metadata_file(out)
        buf = out.getvalue()

    metadata = pq.read_metadata(pa.BufferReader(buf))
def extract_parquet(file_):
    """
    parse and extract key metadata from parquet files

    Args:
        file_ - file-like object opened in binary mode (+b)

    Returns:
        dict
            html - html summary of main contents (if applicable)
            info - metadata for user consumption
    """
    # TODO: generalize to datasets, multipart files
    # As written, only works for single files, and metadata
    # is slanted towards the first row_group
    # local import reduces amortized latency, saves memory
    import pyarrow.parquet as pq

    meta = pq.read_metadata(file_)

    info = {}
    info['created_by'] = meta.created_by
    info['format_version'] = meta.format_version
    info['metadata'] = {
        # seems silly but sets up a simple json.dumps(info) below
        k.decode(): json.loads(meta.metadata[k])
        for k in meta.metadata
    } if meta.metadata is not None else {}
    info['num_row_groups'] = meta.num_row_groups
    info['schema'] = {
        name: {
            'logical_type': meta.schema.column(i).logical_type,
            'max_definition_level': meta.schema.column(i).max_definition_level,
            'max_repetition_level': meta.schema.column(i).max_repetition_level,
            'path': meta.schema.column(i).path,
            'physical_type': meta.schema.column(i).physical_type,
        }
        for i, name in enumerate(meta.schema.names)
    }
    info['serialized_size'] = meta.serialized_size
    info['shape'] = [meta.num_rows, meta.num_columns]

    file_.seek(0)
    # TODO: make this faster with n_threads > 1?
    row_group = pq.ParquetFile(file_).read_row_group(0)
    # convert to str since FileMetaData is not JSON.dumps'able (below)
    html = row_group.to_pandas()._repr_html_()  # pylint: disable=protected-access

    return html, info
def run_test(input_file: str, output_dir: str, filters: list, use_pandas: bool):
    print('Using pyarrow')
    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    pq_file = pq.ParquetFile(input_file)
    row_group_0_metadata = pq_file.metadata.row_group(0)
    print('Parquet min for column 0, row group 0: '
          + str(row_group_0_metadata.column(0).statistics.min))
    print('Parquet max for column 0, row group 0: '
          + str(row_group_0_metadata.column(0).statistics.max))

    if use_pandas:
        unfiltered_pandas_data = pq.read_table(source=input_file).to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of UN-filtered pandas DataFrame in memory: ' + str(size)
              + ' bytes (' + str(size / 1000000) + ' MB)')

    with timeblock('pyarrow read and filter'):
        data = pq.read_table(source=input_file, filters=filters)

    size = sys.getsizeof(data)
    print('Size of filtered pyarrow table in memory: ' + str(size)
          + ' bytes (' + str(size / 1000000) + ' MB)')

    if use_pandas:
        unfiltered_pandas_data = data.to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of filtered pandas DataFrame in memory: ' + str(size)
              + ' bytes (' + str(size / 1000000) + ' MB)')
        # print(pandas_data.head(10))

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)

    with timeblock('pyarrow write_table()'):
        pq.write_table(data, output_file)

    print('Parquet metadata of output: ' + str(pq.read_metadata(output_file)))
    print('Parquet schema of output: ' + pq.read_schema(output_file).to_string())
    print('Size of output file on disk: ' + str(os.path.getsize(output_file))
          + ' bytes (' + str(os.path.getsize(output_file) / 1000000) + ' MB)')
def read_encrypted_parquet(path, decryption_config, kms_connection_config,
                           crypto_factory):
    file_decryption_properties = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    assert file_decryption_properties is not None

    meta = pq.read_metadata(
        path, decryption_properties=file_decryption_properties)
    assert meta.num_columns == 3

    schema = pq.read_schema(
        path, decryption_properties=file_decryption_properties)
    assert len(schema.names) == 3

    result = pq.ParquetFile(
        path, decryption_properties=file_decryption_properties)
    return result.read(use_threads=True)
def test_pandas_parquet_custom_metadata(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0")

    md = pq.read_metadata(filename.strpath).metadata
    assert b'pandas' in md

    js = json.loads(md[b'pandas'].decode('utf8'))
    assert js['index_columns'] == ['__index_level_0__']
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())
def test_pandas_parquet_custom_metadata(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')

    md = pq.read_metadata(filename.strpath).metadata
    assert b'pandas' in md

    js = json.loads(md[b'pandas'].decode('utf8'))
    assert js['index_columns'] == ['__index_level_0__']
def get_schema(self, uri: str):
    fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
    selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

    first_parquet = None
    for finfo in fs.get_file_info(selector):
        if finfo.path.endswith(".parquet"):
            first_parquet = finfo.path
            break
    metadata_file = fs.open_input_file(first_parquet)
    metadata = pq.read_metadata(metadata_file)
    kv_metadata = metadata.metadata
    try:
        return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
    except KeyError as exp:
        raise ValueError(
            f"Parquet dataset {uri} is not created via Spark") from exp
def test_splitted_parquet_writer():
    if skip:
        return
    os.system("wget "
              "https://raw.githubusercontent.com/"
              "JayjeetAtGithub/zips/main/largefile.parquet")
    chunksize = 4 * 1024 * 1024  # 4MB
    writer = SplittedParquetWriter("largefile.parquet", 'mydataset', chunksize)
    writer.write()
    assert len(os.listdir('mydataset')) == 8

    original_file_rows = pq.read_table('largefile.parquet').num_rows
    splitted_files_rows = 0
    files = os.listdir('mydataset')
    for file in files:
        splitted_files_rows += pq.read_metadata(f"mydataset/{file}").num_rows
    assert splitted_files_rows == original_file_rows
def test_pass_separate_metadata():
    import pyarrow.parquet as pq

    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())
def test_parquet_period(tmpdir, registered_period_type):
    # Parquet support for primitive extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # Stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    # Since the type could be reconstructed, the extension type metadata is
    # absent.
    assert schema.field("ext").metadata == {}

    # When reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}
    # Get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) is period_class

    # When the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # The extension metadata is present for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1',
    }
def merge(ctx, src_contact_tables, dest_contact_table, fofn):
    import pyarrow.parquet as pq
    from pyarrow import dataset

    if fofn:
        assert len(src_contact_tables) == 1, \
            "If using --fofn you can only pass a single source file"
        src_fofn = src_contact_tables[0]
        src_contact_tables = []
        errors = []
        for file_path in open(src_fofn):
            input_file = Path(file_path.strip())
            if not input_file.resolve().exists():
                errors.append(f"Input file missing: {input_file}")
            src_contact_tables.append(input_file)
        if errors:
            for e in errors:
                logger.error(e)
            raise OSError("Missing input files")

    parts = []
    for i in src_contact_tables:
        md = pq.read_metadata(i)
        if md.num_rows == 0:
            logger.warning(
                f"The following contact file has no entries, removing from merge: {i}"
            )
            continue
        parts.append(i)

    ds = dataset.dataset(parts, format="parquet")
    df = dd.read_parquet(parts, engine=PQ_ENGINE, version=PQ_VERSION, index=False)
    df.to_parquet(dest_contact_table, engine=PQ_ENGINE, version=PQ_VERSION,
                  schema=ds.schema, write_index=False)
def __init__(self, root_path, num_samples=None, target_name='delinquency_12',
             shuffle_files=False):
    self.parquet_files = glob.glob(os.path.join(root_path, "*.parquet"))
    if shuffle_files:
        self.parquet_files = list(np.random.permutation(self.parquet_files))
    self.target_name = target_name
    self.metadata = [pq.read_metadata(p) for p in self.parquet_files]
    self.cumsum_rows = np.cumsum([m.num_rows for m in self.metadata])
    self.times_through = 0
    if num_samples is not None:
        self.num_samples = min(num_samples, self.cumsum_rows[-1])
    else:
        self.num_samples = self.cumsum_rows[-1]
    self.loaded_tensors = None
def run_id_filter_test(input_file: str, input_id_file: str):
    # converting ids to pandas will be a "zero copy conversion" as unit_id column is int64 when:
    # - ids are not nulls
    # - a single ChunkedArray
    # TODO check if that is the case
    # https://arrow.apache.org/docs/python/pandas.html#zero-copy-series-conversions
    filter_ids = pq.read_table(source=input_id_file)
    filter_ids_as_pandas: DataFrame = filter_ids.to_pandas()
    # filter_ids_as_list = filter_ids_as_pandas['unit_id'].tolist()
    filter_ids_as_set = set(filter_ids_as_pandas['unit_id'])

    print('Parquet metadata: ' + str(pq.read_metadata(input_id_file)))
    print('Parquet schema: ' + pq.read_schema(input_id_file).to_string())
    print('Using filter ids: ' + str(filter_ids.to_pandas()))

    table = pq.read_table(source=input_file, filters=[
        # ('unit_id', 'in', filter_ids_as_list)
        ('unit_id', 'in', filter_ids_as_set)
    ])
    print(table.to_pandas())
def test_multi_dataset_metadata(tempdir):
    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
    metapath = str(tempdir / "_metadata")

    # create a test dataset
    df = pd.DataFrame({
        'one': [1, 2, 3],
        'two': [-1, -2, -3],
        'three': [[1, 2], [2, 3], [3, 4]],
    })
    table = pa.Table.from_pandas(df)

    # write dataset twice and collect/merge metadata
    _meta = None
    for filename in filenames:
        meta = []
        pq.write_table(table, str(tempdir / filename),
                       metadata_collector=meta)
        meta[0].set_file_path(filename)
        if _meta is None:
            _meta = meta[0]
        else:
            _meta.append_row_groups(meta[0])

    # Write merged metadata-only file
    with open(metapath, "wb") as f:
        _meta.write_metadata_file(f)

    # Read back the metadata
    meta = pq.read_metadata(metapath)
    md = meta.to_dict()
    _md = _meta.to_dict()
    for key in _md:
        if key != 'serialized_size':
            assert _md[key] == md[key]
    assert _md['num_columns'] == 3
    assert _md['num_rows'] == 6
    assert _md['num_row_groups'] == 2
    assert _md['serialized_size'] == 0
    assert md['serialized_size'] > 0
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.read_metadata(paths[0])

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]
    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.read_metadata(bad_apple_path)

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)