Example #1
def test_write_metadata(tempdir):
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    parquet_meta = pq.read_metadata(path)
    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
    assert schema_as_arrow.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if schema_as_arrow.metadata:
        assert b'ARROW:schema' not in schema_as_arrow.metadata

    # pass through writer keyword arguments
    for version in ["1.0", "2.0"]:
        pq.write_metadata(schema, path, version=version)
        parquet_meta = pq.read_metadata(path)
        assert parquet_meta.format_version == version

    # metadata_collector: list of FileMetaData objects
    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
    pq.write_table(table, tempdir / "data.parquet")
    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
    pq.write_metadata(schema,
                      path,
                      metadata_collector=[parquet_meta, parquet_meta])
    parquet_meta_mult = pq.read_metadata(path)
    assert parquet_meta_mult.num_row_groups == 2

    # append metadata with different schema raises an error
    with pytest.raises(RuntimeError, match="requires equal schemas"):
        pq.write_metadata(pa.schema([("a", "int32"), ("b", "null")]),
                          path,
                          metadata_collector=[parquet_meta, parquet_meta])
Example #2
def test_parquet_write_disable_statistics(tempdir):
    table = pa.Table.from_pydict(
        OrderedDict([('a', pa.array([1, 2, 3])),
                     ('b', pa.array(['a', 'b', 'c']))]))
    _write_table(table, tempdir / 'data.parquet')
    meta = pq.read_metadata(tempdir / 'data.parquet')
    for col in [0, 1]:
        cc = meta.row_group(0).column(col)
        assert cc.is_stats_set is True
        assert cc.statistics is not None

    _write_table(table, tempdir / 'data2.parquet', write_statistics=False)
    meta = pq.read_metadata(tempdir / 'data2.parquet')
    for col in [0, 1]:
        cc = meta.row_group(0).column(col)
        assert cc.is_stats_set is False
        assert cc.statistics is None

    _write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
    meta = pq.read_metadata(tempdir / 'data3.parquet')
    cc_a = meta.row_group(0).column(0)
    cc_b = meta.row_group(0).column(1)
    assert cc_a.is_stats_set is True
    assert cc_b.is_stats_set is False
    assert cc_a.statistics is not None
    assert cc_b.statistics is None
Example #3
def test_encrypted_parquet_read_metadata_no_decryption_config(
        tempdir, data_table):
    """Write an encrypted parquet, verify it's encrypted,
    but then try to read its metadata without decryption properties."""
    test_encrypted_parquet_write_read(tempdir, data_table)
    # Read metadata without decryption properties
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_metadata(tempdir / PARQUET_NAME)
Example #4
def test_read_common_metadata_files(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    },
                      columns=['index', 'values'])

    base_path = str(tmpdir)
    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    metadata_path = pjoin(base_path, '_metadata')
    pq.write_metadata(table.schema, metadata_path)

    dataset = pq.ParquetDataset(base_path)
    assert dataset.metadata_path == metadata_path

    common_schema = pq.read_metadata(data_path).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path])
    assert dataset2.schema.equals(dataset.schema)
Example #5
def _test_read_common_metadata_files(fs, base_path):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)

    with fs.open(data_path, 'wb') as f:
        _write_table(table, f)

    metadata_path = pjoin(base_path, '_metadata')
    with fs.open(metadata_path, 'wb') as f:
        pq.write_metadata(table.schema, f)

    dataset = pq.ParquetDataset(base_path, filesystem=fs)
    assert dataset.metadata_path == metadata_path

    with fs.open(data_path) as f:
        common_schema = pq.read_metadata(f).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path], filesystem=fs)
    assert dataset2.schema.equals(dataset.schema)
Example #6
def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row groups statistics of all files
    # Gives following error:
    #       File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #       RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than partitioned files when partitioning by one column
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata', metadata_collector=metadata_collector)
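    # Hedged sketch (not part of the original): one possible workaround is to
    # drop the partition column from the schema before writing ``_metadata``,
    # so it matches the schema of the partitioned data files:
    # schema_without_partition = data.schema.remove(
    #     data.schema.get_field_index('start_year'))
    # pq.write_metadata(schema_without_partition, root_path + '/_metadata',
    #                   metadata_collector=metadata_collector)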

    # Read from partitioned dataset
    # use the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year), ('value', '>', value)])
                          # filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
Example #7
def _test_read_common_metadata_files(fs, base_path):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    },
                      columns=['index', 'values'])

    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)

    with fs.open(data_path, 'wb') as f:
        _write_table(table, f)

    metadata_path = pjoin(base_path, '_metadata')
    with fs.open(metadata_path, 'wb') as f:
        pq.write_metadata(table.schema, f)

    dataset = pq.ParquetDataset(base_path, filesystem=fs)
    assert dataset.metadata_path == metadata_path

    with fs.open(data_path) as f:
        common_schema = pq.read_metadata(f).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path], filesystem=fs)
    assert dataset2.schema.equals(dataset.schema)
Example #8
 def get_row_group_info(path):
     fs = filesystem_factory()
     relative_path = os.path.relpath(path, base_path)
     pq_file = fs.open(path)
     num_row_groups = pq.read_metadata(pq_file).num_row_groups
     pq_file.close()
     return relative_path, num_row_groups
Example #9
def test_parquet(tmpdir, registered_period_type):
    # parquet support for extension types
    period_type = PeriodType('D')
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.read_schema(pa.BufferReader(decoded_schema))
    assert schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'
    }

    # when reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.column("ext").type == period_type

    # when the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.column("ext").type == pa.int64()
Example #10
def from_parquet(message_bytes, **kwargs):
    metadata = pq.read_metadata(message_bytes)
    cls = SCHEMA_TO_TYPE[metadata.metadata[b"type"]]
    table = pq.read_table(message_bytes, **kwargs)
    data = table.to_pydict()
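    # Transpose the column-oriented dict into a list of row dicts (one per record).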
    values = list(
        map(dict,
            zip(*([(key, val) for val in data[key]] for key in data.keys()))))
    return _deserialize(name=cls, chunk=values)
Example #11
def test_parquet_metadata_empty_to_dict(tempdir):
    # https://issues.apache.org/jira/browse/ARROW-10146
    table = pa.table({"a": pa.array([], type="int64")})
    pq.write_table(table, tempdir / "data.parquet")
    metadata = pq.read_metadata(tempdir / "data.parquet")
    # ensure this doesn't error / statistics set to None
    metadata_dict = metadata.to_dict()
    assert len(metadata_dict["row_groups"]) == 1
    assert len(metadata_dict["row_groups"][0]["columns"]) == 1
    assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
Example #12
def read_metadata(filename: str):
    """read_metadata return only the metadata from a parquet pyspark file 

    Args:
        filename (str): [description]

    Returns:
        [type]: [description]
    """
    return pq.read_metadata(filename)
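A brief usage sketch of the wrapper above; the file name is hypothetical, and the
printed attributes are standard pyarrow FileMetaData fields:

meta = read_metadata("example.parquet")      # hypothetical path
print(meta.num_rows, meta.num_row_groups)    # row count and number of row groups
print(meta.schema.to_arrow_schema())         # Arrow schema derived from the Parquet schema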
Example #13
def test_add_pandas_engine_metadata_to_parquet_file(tmp_path) -> None:
    """Pandas engine metadata added.

    Args:
        tmp_path ([type]): Pytest temporary path plugin
    """
    filepath = tmp_path / "test_meta.parquet"
    pd.DataFrame().to_parquet(filepath)
    add_file_engine_metadata_to_parquet_file(filepath, "pandas")

    assert pq.read_metadata(filepath).metadata[b"file_engine"] == b"pandas"
Example #14
def test_add_geopandas_engine_metadata_to_parquet_file(tmp_path: Path) -> None:
    """Geopandas engine metadata added.

    Args:
        tmp_path (Path): Pytest temporary path plugin
    """
    filepath = tmp_path / "test_meta.parquet"
    gpd.GeoDataFrame().to_parquet(filepath)
    add_file_engine_metadata_to_parquet_file(filepath, "geopandas")

    assert pq.read_metadata(filepath).metadata[b"file_engine"] == b"geopandas"
Example #15
 def test_sink_transform_multiple_row_group(self):
     with TemporaryDirectory() as tmp_dirname:
         path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example #16
 def genes(self) -> Tuple[str]:
     # noinspection PyTypeChecker
     metadata = pq.read_metadata(self._fname)
     assert metadata.num_row_groups == 1, \
         "Parquet database {0:s} has more than one row group.".format(self._fname)
     metadata_row_group = metadata.row_group(0)
     # Get all gene names (exclude "features" column).
     return tuple(
         metadata_row_group.column(idx).path_in_schema
         for idx in range(0, metadata.num_columns)
         if metadata_row_group.column(idx).path_in_schema != INDEX_NAME)
Example #17
 def test_sink_transform_multiple_row_group(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example #18
def test_metadata_exceeds_message_size():
    # ARROW-13655: Thrift may enable a default message size that limits
    # the size of Parquet metadata that can be written.
    NCOLS = 1000
    NREPEATS = 4000

    table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})

    with pa.BufferOutputStream() as out:
        pq.write_table(table, out)
        buf = out.getvalue()

    original_metadata = pq.read_metadata(pa.BufferReader(buf))
    metadata = pq.read_metadata(pa.BufferReader(buf))
    for i in range(NREPEATS):
        metadata.append_row_groups(original_metadata)

    with pa.BufferOutputStream() as out:
        metadata.write_metadata_file(out)
        buf = out.getvalue()

    metadata = pq.read_metadata(pa.BufferReader(buf))
Example #19
def extract_parquet(file_):
    """
    parse and extract key metadata from parquet files

    Args:
        file_ - file-like object opened in binary mode (+b)

    Returns:
        dict
            html - html summary of main contents (if applicable)
            info - metadata for user consumption
    """
    # TODO: generalize to datasets, multipart files
    # As written, only works for single files, and metadata
    # is slanted towards the first row_group

    # local import reduces amortized latency, saves memory
    import pyarrow.parquet as pq

    meta = pq.read_metadata(file_)

    info = {}
    info['created_by'] = meta.created_by
    info['format_version'] = meta.format_version
    info['metadata'] = {
        # seems silly but sets up a simple json.dumps(info) below
        k.decode(): json.loads(meta.metadata[k])
        for k in meta.metadata
    } if meta.metadata is not None else {}
    info['num_row_groups'] = meta.num_row_groups
    info['schema'] = {
        name: {
            'logical_type': meta.schema.column(i).logical_type,
            'max_definition_level': meta.schema.column(i).max_definition_level,
            'max_repetition_level': meta.schema.column(i).max_repetition_level,
            'path': meta.schema.column(i).path,
            'physical_type': meta.schema.column(i).physical_type,
        }
        for i, name in enumerate(meta.schema.names)
    }
    info['serialized_size'] = meta.serialized_size
    info['shape'] = [meta.num_rows, meta.num_columns]

    file_.seek(0)
    # TODO: make this faster with n_threads > 1?
    row_group = pq.ParquetFile(file_).read_row_group(0)
    # convert to str since FileMetaData is not JSON.dumps'able (below)
    html = row_group.to_pandas()._repr_html_()  # pylint: disable=protected-access

    return html, info
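A hedged usage sketch for the helper above (the file name is hypothetical; ``json``
is assumed to be imported in the calling module, as it already is in this one):

with open("example.parquet", "rb") as f:        # file-like object opened in binary mode, per the docstring
    html, info = extract_parquet(f)
    # default=str guards fields (e.g. logical_type) that are not natively JSON-serializable
    print(json.dumps(info, indent=2, default=str))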
Example #20
def run_test(input_file: str, output_dir: str, filters: list, use_pandas: bool):
    print('Using pyarrow')
    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    pq_file = pq.ParquetFile(input_file)
    row_group_0_metadata = pq_file.metadata.row_group(0)
    print('Parquet min for column 0, row group 0: ' + str(row_group_0_metadata.column(0).statistics.min))
    print('Parquet max for column 0, row group 0: ' + str(row_group_0_metadata.column(0).statistics.max))

    if use_pandas:
        unfiltered_pandas_data = pq.read_table(source=input_file).to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of UN-filtered pandas DataFrame in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')

    with timeblock('pyarrow read and filter'):
        data = pq.read_table(source=input_file, filters=filters)
    size = sys.getsizeof(data)
    print('Size of filtered pyarrow table in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')

    if use_pandas:
        unfiltered_pandas_data = data.to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of filtered pandas DataFrame in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')
        # print(pandas_data.head(10))

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)

    with timeblock('pyarrow write_table()'):
        pq.write_table(data, output_file)

    print('Parquet metadata of output: ' + str(pq.read_metadata(output_file)))
    print('Parquet schema of output: ' + pq.read_schema(output_file).to_string())
    print('Size of output file on disk: ' + str(os.path.getsize(output_file)) + ' bytes ('
          + str(os.path.getsize(output_file) / 1000000) + ' MB)')
Example #21
def read_encrypted_parquet(path, decryption_config, kms_connection_config,
                           crypto_factory):
    file_decryption_properties = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    assert (file_decryption_properties is not None)
    meta = pq.read_metadata(path,
                            decryption_properties=file_decryption_properties)
    assert (meta.num_columns == 3)
    schema = pq.read_schema(path,
                            decryption_properties=file_decryption_properties)
    assert (len(schema.names) == 3)

    result = pq.ParquetFile(path,
                            decryption_properties=file_decryption_properties)
    return result.read(use_threads=True)
Example #22
def test_pandas_parquet_custom_metadata(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0")

    md = pq.read_metadata(filename.strpath).metadata
    assert b'pandas' in md

    js = json.loads(md[b'pandas'].decode('utf8'))
    assert js['index_columns'] == ['__index_level_0__']
Example #23
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]
Example #24
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())
Example #25
def test_pandas_parquet_custom_metadata(tmpdir):
    import pyarrow.parquet as pq

    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename.strpath, version="2.0",
                 coerce_timestamps='ms')

    md = pq.read_metadata(filename.strpath).metadata
    assert b'pandas' in md

    js = json.loads(md[b'pandas'].decode('utf8'))
    assert js['index_columns'] == ['__index_level_0__']
Example #26
    def get_schema(self, uri: str):
        fs, base_dir = FileSystem.from_uri(normalize_uri(uri))
        selector = FileSelector(base_dir, allow_not_found=True, recursive=True)

        first_parquet = None
        for finfo in fs.get_file_info(selector):
            if finfo.path.endswith(".parquet"):
                first_parquet = finfo.path
                break
        metadata_file = fs.open_input_file(first_parquet)
        metadata = pq.read_metadata(metadata_file)

        kv_metadata = metadata.metadata
        try:
            return json.loads(kv_metadata[self.SPARK_PARQUET_ROW_METADATA])
        except KeyError as exp:
            raise ValueError(
                f"Parquet dataset {uri} is not created via Spark") from exp
Example #27
def test_splitted_parquet_writer():
    if skip:
        return
    os.system("wget "
              "https://raw.githubusercontent.com/"
              "JayjeetAtGithub/zips/main/largefile.parquet")
    chunksize = 4 * 1024 * 1024  # 4MB
    writer = SplittedParquetWriter("largefile.parquet", 'mydataset', chunksize)
    writer.write()
    assert len(os.listdir('mydataset')) == 8

    original_file_rows = pq.read_table('largefile.parquet').num_rows
    splitted_files_rows = 0
    files = os.listdir('mydataset')
    for file in files:
        splitted_files_rows += pq.read_metadata(f"mydataset/{file}").num_rows

    assert splitted_files_rows == original_file_rows
Example #28
def test_pass_separate_metadata():
    import pyarrow.parquet as pq

    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())
Example #29
def test_parquet_period(tmpdir, registered_period_type):
    # Parquet support for primitive extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # Stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    # Since the type could be reconstructed, the extension type metadata is
    # absent.
    assert schema.field("ext").metadata == {}

    # When reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}
    # Get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) is period_class

    # When the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # The extension metadata is present for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1',
    }
Example #30
def merge(ctx, src_contact_tables, dest_contact_table, fofn):
    import pyarrow.parquet as pq
    from pyarrow import dataset

    if fofn:
        assert len(
            src_contact_tables
        ) == 1, "If using --fofn you can only pass a single source file"
        src_fofn = src_contact_tables[0]
        src_contact_tables = []
        errors = []

        for file_path in open(src_fofn):
            input_file = Path(file_path.strip())
            if not input_file.resolve().exists():
                errors.append(f"Input file missing: {input_file}")
            src_contact_tables.append(input_file)
        if errors:
            for e in errors:
                logger.error(e)
            raise OSError("Missing input files")

    parts = []
    for i in src_contact_tables:
        md = pq.read_metadata(i)
        if md.num_rows == 0:
            logger.warning(
                f"The following contact file has no entries, removing from merge: {i}"
            )
            continue
        parts.append(i)

    ds = dataset.dataset(parts, format="parquet")
    df = dd.read_parquet(parts,
                         engine=PQ_ENGINE,
                         version=PQ_VERSION,
                         index=False)
    df.to_parquet(dest_contact_table,
                  engine=PQ_ENGINE,
                  version=PQ_VERSION,
                  schema=ds.schema,
                  write_index=False)
Example #31
    def __init__(self,
                 root_path,
                 num_samples=None,
                 target_name='delinquency_12',
                 shuffle_files=False):
        self.parquet_files = glob.glob(os.path.join(root_path, "*.parquet"))
        if shuffle_files:
            self.parquet_files = list(np.random.permutation(
                self.parquet_files))
        self.target_name = target_name
        self.metadata = [pq.read_metadata(p) for p in self.parquet_files]
        self.cumsum_rows = np.cumsum([m.num_rows for m in self.metadata])

        self.times_through = 0
        if num_samples is not None:
            self.num_samples = min(num_samples, self.cumsum_rows[-1])
        else:
            self.num_samples = self.cumsum_rows[-1]

        self.loaded_tensors = None
Example #32
def run_id_filter_test(input_file: str, input_id_file: str):

    # converting ids to pandas will be a "zero copy conversion" as unit_id column is int64 when:
    # - ids are not nulls
    # - a single ChunkedArray
    # TODO: check if that is the case
    # https://arrow.apache.org/docs/python/pandas.html#zero-copy-series-conversions
    filter_ids = pq.read_table(source=input_id_file)
    filter_ids_as_pandas: DataFrame = filter_ids.to_pandas()
    # filter_ids_as_list = filter_ids_as_pandas['unit_id'].tolist()
    filter_ids_as_set = set(filter_ids_as_pandas['unit_id'])
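    # Hedged sketch (not in the original): the zero-copy conditions noted in the
    # TODO above can be checked directly on the loaded column.
    unit_id_column = filter_ids.column('unit_id')
    print('unit_id has nulls: ' + str(unit_id_column.null_count > 0))
    print('unit_id chunk count: ' + str(unit_id_column.num_chunks))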

    print('Parquet metadata: ' + str(pq.read_metadata(input_id_file)))
    print('Parquet schema: ' + pq.read_schema(input_id_file).to_string())
    print('Using filter ids: ' + str(filter_ids.to_pandas()))

    table = pq.read_table(source=input_file, filters=[
        # ('unit_id', 'in', filter_ids_as_list)
        ('unit_id', 'in', filter_ids_as_set)
    ])
    print(table.to_pandas())
Example #33
def test_multi_dataset_metadata(tempdir):
    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
    metapath = str(tempdir / "_metadata")

    # create a test dataset
    df = pd.DataFrame({
        'one': [1, 2, 3],
        'two': [-1, -2, -3],
        'three': [[1, 2], [2, 3], [3, 4]],
    })
    table = pa.Table.from_pandas(df)

    # write dataset twice and collect/merge metadata
    _meta = None
    for filename in filenames:
        meta = []
        pq.write_table(table, str(tempdir / filename),
                       metadata_collector=meta)
        meta[0].set_file_path(filename)
        if _meta is None:
            _meta = meta[0]
        else:
            _meta.append_row_groups(meta[0])

    # Write merged metadata-only file
    with open(metapath, "wb") as f:
        _meta.write_metadata_file(f)

    # Read back the metadata
    meta = pq.read_metadata(metapath)
    md = meta.to_dict()
    _md = _meta.to_dict()
    for key in _md:
        if key != 'serialized_size':
            assert _md[key] == md[key]
    assert _md['num_columns'] == 3
    assert _md['num_rows'] == 6
    assert _md['num_row_groups'] == 2
    assert _md['serialized_size'] == 0
    assert md['serialized_size'] > 0
Example #34
def test_read_multiple_files(tmpdir):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df)
        _write_table(table, path)

        test_data.append(table)
        paths.append(path)

    # Write a _SUCCESS.crc file
    with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f:
        f.write(b'0')

    def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
        dataset = pq.ParquetDataset(paths, **kwargs)
        return dataset.read(columns=columns, nthreads=nthreads)

    result = read_multiple_files(paths)
    expected = pa.concat_tables(test_data)

    assert result.equals(expected)

    # Read with provided metadata
    metadata = pq.read_metadata(paths[0])

    result2 = read_multiple_files(paths, metadata=metadata)
    assert result2.equals(expected)

    result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema)
    assert result3.equals(expected)

    # Read column subset
    to_read = [result[0], result[2], result[6], result[result.num_columns - 1]]

    result = pa.localfs.read_parquet(
        dirpath, columns=[c.name for c in to_read])
    expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata)
    assert result.equals(expected)

    # Read with multiple threads
    pa.localfs.read_parquet(dirpath, nthreads=2)

    # Test failure modes with non-uniform metadata
    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
    bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath

    t = pa.Table.from_pandas(bad_apple)
    _write_table(t, bad_apple_path)

    bad_meta = pq.read_metadata(bad_apple_path)

    with pytest.raises(ValueError):
        read_multiple_files(paths + [bad_apple_path])

    with pytest.raises(ValueError):
        read_multiple_files(paths, metadata=bad_meta)

    mixed_paths = [bad_apple_path, paths[0]]

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths, schema=bad_meta.schema)

    with pytest.raises(ValueError):
        read_multiple_files(mixed_paths)