def read_metadata(cls, fs, paths, columns, filters, index):
    import pyarrow.dataset as ds
    from pyarrow.parquet import _filters_to_expression
    from dask.utils import natural_sort_key

    # dataset discovery
    if len(paths) == 1:
        # list of 1 directory path is not supported
        paths = paths[0]
    dataset = ds.dataset(
        paths, partitioning="hive", filesystem=fs, format=cls.file_format
    )

    # Get all (filtered) fragments
    if filters is not None:
        filter = _filters_to_expression(filters)
    else:
        filter = None

    fragments = list(dataset.get_fragments(filter=filter))

    # numeric rather than glob ordering
    # TODO how does this handle different partitioned directories?
    fragments = sorted(fragments, key=lambda f: natural_sort_key(f.path))

    # TODO potential splitting / aggregating of fragments

    # Create dask meta
    schema = dataset.schema
    # TODO add support for `categories` keyword
    meta = schema.empty_table().to_pandas()
    if index:
        meta = meta.set_index(index)

    if columns is not None:
        ex = set(columns) - set(meta.columns)
        if ex:
            raise ValueError(
                f"Requested columns {ex} not in schema {set(meta.columns)}"
            )
        meta = meta[columns]

    return fragments, meta, schema, filter
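# A minimal usage sketch of the same discovery path as read_metadata above,
# assuming a hypothetical local hive-partitioned directory "data/"
# (e.g. data/year=2020/part.0.parquet) and DNF-style filters. Names and
# paths here are illustrative, not taken from the library's docs.
import pyarrow.dataset as ds
from pyarrow.parquet import _filters_to_expression
from dask.utils import natural_sort_key

filt = _filters_to_expression([("year", "=", 2020)])
dataset = ds.dataset("data/", partitioning="hive", format="parquet")

# Filtered fragments in numeric (natural) rather than glob order
fragments = sorted(
    dataset.get_fragments(filter=filt),
    key=lambda f: natural_sort_key(f.path),
)

# Empty pandas frame matching the dataset schema (the dask "meta")
meta = dataset.schema.empty_table().to_pandas()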
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Make the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(
            filepaths_or_buffers, format="parquet", partitioning="hive"
        )

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(row_group_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Keep only the row groups that pass the filters AND are in
                # the user's selection (built eagerly; a lazy builtin
                # `filter` object would capture the reassigned element).
                selection = row_groups[i]
                row_groups[i] = [
                    rg_id
                    for rg_id in filtered_rg_ids[file]
                    if rg_id in selection
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
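# Illustrative call of the read_parquet function above (file name is
# hypothetical): DNF-style filters are converted to a ds.Expression
# internally, and a flat row_groups list for a single source is
# normalized to a list of lists, here [[0, 1]].
df = read_parquet(
    "part.0.parquet",
    columns=["x", "y"],
    filters=[("x", ">", 2)],  # list of (column, op, value) tuples
    row_groups=[0, 1],        # one row-group selection per source
)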
def _process_dataset(
    paths, fs, filters=None, row_groups=None, categorical_partitions=True,
):
    # Returns:
    #     file_list - Expanded/filtered list of paths
    #     row_groups - Filtered list of row-group selections
    #     partition_keys - list of partition keys for each file
    #     partition_categories - Categories for each partition
    #
    # The general purpose of this function is to (1) expand
    # directory input into a list of paths (using the pyarrow
    # dataset API), (2) to apply row-group filters, and (3)
    # to discover directory-partitioning information

    # Deal with case that the user passed in a directory name
    file_list = paths
    if len(paths) == 1 and ioutils.is_directory(paths[0]):
        paths = ioutils.stringify_pathlike(paths[0])

    # Convert filters to ds.Expression
    if filters is not None:
        filters = pq._filters_to_expression(filters)

    # Initialize ds.FilesystemDataset
    dataset = ds.dataset(
        paths, filesystem=fs, format="parquet", partitioning="hive",
    )
    file_list = dataset.files
    if len(file_list) == 0:
        raise FileNotFoundError(f"{paths} could not be resolved to any files")

    # Deal with directory partitioning
    # Get all partition keys (without filters)
    partition_categories = defaultdict(list)
    file_fragment = None
    for file_fragment in dataset.get_fragments():
        keys = ds._get_partition_keys(file_fragment.partition_expression)
        if not (keys or partition_categories):
            # Bail - This is not a directory-partitioned dataset
            break
        for k, v in keys.items():
            if v not in partition_categories[k]:
                partition_categories[k].append(v)
        if not categorical_partitions:
            # Bail - We don't need to discover all categories.
            # We only need to save the partition keys from this
            # first `file_fragment`
            break

    if partition_categories and file_fragment is not None:
        # Check/correct order of `categories` using last file_frag,
        # because `_get_partition_keys` does NOT preserve the
        # partition-hierarchy order of the keys.
        cat_keys = [
            part.split("=")[0]
            for part in file_fragment.path.split(fs.sep)
            if "=" in part
        ]
        if set(partition_categories) == set(cat_keys):
            partition_categories = {
                k: partition_categories[k]
                for k in cat_keys
                if k in partition_categories
            }

    # If we do not have partitioned data and
    # are not filtering, we can return here
    if filters is None and not partition_categories:
        return file_list, row_groups, [], {}

    # Record initial row_groups input
    row_groups_map = {}
    if row_groups is not None:
        # Make sure paths and row_groups map 1:1
        # and save the initial mapping
        if len(paths) != len(file_list):
            raise ValueError(
                "Cannot specify a row_group selection for a directory path."
            )
        row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)}

    # Apply filters and discover partition columns
    partition_keys = []
    if partition_categories or filters is not None:
        file_list = []
        if filters is not None:
            row_groups = []
        for file_fragment in dataset.get_fragments(filter=filters):
            path = file_fragment.path

            # Extract hive-partition keys, and make sure they are
            # ordered the same as they are in `partition_categories`
            if partition_categories:
                raw_keys = ds._get_partition_keys(
                    file_fragment.partition_expression
                )
                partition_keys.append(
                    [
                        (name, raw_keys[name])
                        for name in partition_categories.keys()
                    ]
                )

            # Apply row-group filtering
            selection = row_groups_map.get(path, None)
            if selection is not None or filters is not None:
                filtered_row_groups = [
                    rg_info.id
                    for rg_fragment in file_fragment.split_by_row_group(
                        filters, schema=dataset.schema,
                    )
                    for rg_info in rg_fragment.row_groups
                ]
            file_list.append(path)
            if filters is not None:
                if selection is None:
                    row_groups.append(filtered_row_groups)
                else:
                    row_groups.append(
                        [
                            rg_id
                            for rg_id in filtered_row_groups
                            if rg_id in selection
                        ]
                    )

    return (
        file_list,
        row_groups,
        partition_keys,
        partition_categories if categorical_partitions else {},
    )
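# A small sketch of the two pyarrow primitives _process_dataset relies on:
# partition-key extraction and row-group splitting. The "data/" layout
# (data/year=2020/month=1/part.0.parquet, ...) and the expression below are
# hypothetical stand-ins for the converted DNF filters.
import pyarrow.dataset as ds

dataset = ds.dataset("data/", format="parquet", partitioning="hive")
expr = ds.field("month") == 1

for fragment in dataset.get_fragments(filter=expr):
    # Dict of partition-column name -> value, e.g. {"year": 2020, "month": 1}.
    # The dict does not preserve the directory-hierarchy order, which is why
    # _process_dataset re-orders the categories from the fragment path.
    keys = ds._get_partition_keys(fragment.partition_expression)

    # Row-group IDs in this file that survive the filter
    rg_ids = [
        rg_info.id
        for rg_fragment in fragment.split_by_row_group(expr, schema=dataset.schema)
        for rg_info in rg_fragment.row_groups
    ]
    print(fragment.path, keys, rg_ids)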