Example #1
    @classmethod
    def read_metadata(cls, fs, paths, columns, filters, index):

        import pyarrow.dataset as ds
        from pyarrow.parquet import _filters_to_expression

        # dataset discovery
        if len(paths) == 1:
            # a list containing a single directory path is not supported,
            # so unwrap it and pass the bare path
            paths = paths[0]
        dataset = ds.dataset(paths,
                             partitioning="hive",
                             filesystem=fs,
                             format=cls.file_format)

        # Convert DNF-style filters to a dataset expression (if any) and
        # get all matching fragments
        if filters is not None:
            filter_expr = _filters_to_expression(filters)
        else:
            filter_expr = None

        fragments = list(dataset.get_fragments(filter=filter_expr))

        # Sort fragments by path using natural (numeric) rather than glob
        # ordering; `natural_sort_key` is assumed to be imported at module
        # level (e.g. from dask.utils)
        # TODO how does this handle different partitioned directories?
        fragments = sorted(fragments, key=lambda f: natural_sort_key(f.path))

        # TODO potential splitting / aggregating of fragments

        # Create dask meta
        schema = dataset.schema
        # TODO add support for `categories` keyword
        meta = schema.empty_table().to_pandas()

        if index:
            meta = meta.set_index(index)

        if columns is not None:
            ex = set(columns) - set(meta.columns)
            if ex:
                raise ValueError(
                    f"Requested columns {ex} not in schema {set(meta.columns)}"
                )
            meta = meta[columns]

        return fragments, meta, schema, filter_expr
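The core pattern in this example is converting DNF-style filter tuples into a `pyarrow.dataset.Expression` once, then letting `Dataset.get_fragments` do partition-aware pruning. A minimal standalone sketch of that pattern (not taken from the code above; the local hive-partitioned directory `data/` and the filter values are assumptions for illustration):

import pyarrow.dataset as ds
from pyarrow.parquet import _filters_to_expression

# DNF-style filters: a flat list of tuples is an implicit AND
filters = [("year", "=", 2020), ("value", ">", 10)]
filter_expr = _filters_to_expression(filters)  # -> ds.Expression

# Discover the (hive-partitioned) dataset and list only matching fragments
dataset = ds.dataset("data/", format="parquet", partitioning="hive")
fragments = list(dataset.get_fragments(filter=filter_expr))
print([frag.path for frag in fragments])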
Example #2
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A separate list of row groups should be passed per source; normalize
    # the input into the list-of-lists form expected for multiple sources
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs)
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(row_group_info.id)

        # Initialize the row-group selection (one entry per file)
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file; intersect any
        # user-provided selection with the filtered IDs. A list comprehension
        # is used instead of the lazy builtin `filter`, which would capture
        # `row_groups[i]` in a closure after it has been reassigned.
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                selection = set(row_groups[i])
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file]
                    if rg_id in selection
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
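The commented-out TODO block above sketches the pyarrow >= 1.0.0 row-group API. A minimal standalone version of that newer path, independent of the cudf reader (the `data/` directory and the filter are illustrative assumptions):

from collections import defaultdict

import pyarrow.dataset as ds
import pyarrow.parquet as pq

filter_expr = pq._filters_to_expression([("value", ">", 10)])
dataset = ds.dataset("data/", format="parquet", partitioning="hive")

# Map each file path to the IDs of its row groups that may match the filter
row_group_ids = defaultdict(list)
for fragment in dataset.get_fragments(filter=filter_expr):
    for piece in fragment.split_by_row_group(filter_expr):
        for rg_info in piece.row_groups:
            row_group_ids[piece.path].append(rg_info.id)

print(dict(row_group_ids))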
Example #3
def _process_dataset(
    paths,
    fs,
    filters=None,
    row_groups=None,
    categorical_partitions=True,
):
    # Returns:
    #     file_list - Expanded/filtered list of paths
    #     row_groups - Filtered list of row-group selections
    #     partition_keys - list of partition keys for each file
    #     partition_categories - Categories for each partition

    # The general purpose of this function is to (1) expand directory
    # input into a list of file paths (using the pyarrow dataset API),
    # (2) apply row-group filters, and (3) discover directory-partitioning
    # information

    # Deal with the case where the user passed in a single directory name
    file_list = paths
    if len(paths) == 1 and ioutils.is_directory(paths[0]):
        paths = ioutils.stringify_pathlike(paths[0])

    # Convert filters to ds.Expression
    if filters is not None:
        filters = pq._filters_to_expression(filters)

    # Initialize ds.FilesystemDataset
    dataset = ds.dataset(
        paths,
        filesystem=fs,
        format="parquet",
        partitioning="hive",
    )
    file_list = dataset.files
    if len(file_list) == 0:
        raise FileNotFoundError(f"{paths} could not be resolved to any files")

    # Deal with directory partitioning
    # Get all partition keys (without filters)
    partition_categories = defaultdict(list)
    file_fragment = None
    for file_fragment in dataset.get_fragments():
        keys = ds._get_partition_keys(file_fragment.partition_expression)
        if not (keys or partition_categories):
            # Bail - This is not a directory-partitioned dataset
            break
        for k, v in keys.items():
            if v not in partition_categories[k]:
                partition_categories[k].append(v)
        if not categorical_partitions:
            # Bail - We don't need to discover all categories.
            # We only need to save the partition keys from this
            # first `file_fragment`
            break

    if partition_categories and file_fragment is not None:
        # Check/correct order of `categories` using last file_frag,
        # because `_get_partition_keys` does NOT preserve the
        # partition-hierarchy order of the keys.
        cat_keys = [
            part.split("=")[0] for part in file_fragment.path.split(fs.sep)
            if "=" in part
        ]
        if set(partition_categories) == set(cat_keys):
            partition_categories = {
                k: partition_categories[k]
                for k in cat_keys if k in partition_categories
            }

    # If we do not have partitioned data and
    # are not filtering, we can return here
    if filters is None and not partition_categories:
        return file_list, row_groups, [], {}

    # Record initial row_groups input
    row_groups_map = {}
    if row_groups is not None:
        # Make sure paths and row_groups map 1:1
        # and save the initial mapping
        if len(paths) != len(file_list):
            raise ValueError(
                "Cannot specify a row_group selection for a directory path.")
        row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)}

    # Apply filters and discover partition columns
    partition_keys = []
    if partition_categories or filters is not None:
        file_list = []
        if filters is not None:
            row_groups = []
        for file_fragment in dataset.get_fragments(filter=filters):
            path = file_fragment.path

            # Extract hive-partition keys and make sure they are
            # ordered the same as in `partition_categories`
            if partition_categories:
                raw_keys = ds._get_partition_keys(
                    file_fragment.partition_expression)
                partition_keys.append([(name, raw_keys[name])
                                       for name in partition_categories.keys()
                                       ])

            # Apply row-group filtering
            selection = row_groups_map.get(path, None)
            if selection is not None or filters is not None:
                filtered_row_groups = [
                    rg_info.id
                    for rg_fragment in file_fragment.split_by_row_group(
                        filters,
                        schema=dataset.schema,
                    ) for rg_info in rg_fragment.row_groups
                ]
            file_list.append(path)
            if filters is not None:
                if selection is None:
                    row_groups.append(filtered_row_groups)
                else:
                    row_groups.append([
                        rg_id for rg_id in filtered_row_groups
                        if rg_id in selection
                    ])

    return (
        file_list,
        row_groups,
        partition_keys,
        partition_categories if categorical_partitions else {},
    )
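The partition-discovery step above hinges on `ds._get_partition_keys`, a private pyarrow.dataset helper that turns a fragment's `partition_expression` into a plain `{column: value}` mapping. A minimal standalone sketch of just that step (the `data/` directory is an assumption, and the private helper may change between pyarrow versions):

import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet", partitioning="hive")
for fragment in dataset.get_fragments():
    # e.g. {"year": 2020, "month": 1} for data/year=2020/month=1/part-0.parquet
    keys = ds._get_partition_keys(fragment.partition_expression)
    print(fragment.path, keys)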