def _get_partitioned(
    df,
    root_path,
    partition_cols,
    filename=None,
    fs=None,
    preserve_index=False,
    **kwargs,
):
    fs = ioutils._ensure_filesystem(fs, root_path, **kwargs)
    fs.mkdirs(root_path, exist_ok=True)
    if not (set(df._data) - set(partition_cols)):
        raise ValueError("No data left to save outside partition columns")

    part_names, part_offsets, _, grouped_df = df.groupby(
        partition_cols
    )._grouped()
    if not preserve_index:
        grouped_df.reset_index(drop=True, inplace=True)
    grouped_df.drop(columns=partition_cols, inplace=True)
    # Copy the entire keys df in one operation rather than using iloc
    part_names = part_names.to_pandas().to_frame(index=False)

    full_paths = []
    metadata_file_paths = []
    for keys in part_names.itertuples(index=False):
        subdir = fs.sep.join(
            [f"{name}={val}" for name, val in zip(partition_cols, keys)]
        )
        prefix = fs.sep.join([root_path, subdir])
        fs.mkdirs(prefix, exist_ok=True)
        filename = filename or _generate_filename()
        full_path = fs.sep.join([prefix, filename])
        full_paths.append(full_path)
        metadata_file_paths.append(fs.sep.join([subdir, filename]))

    return full_paths, metadata_file_paths, grouped_df, part_offsets, filename
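
# A minimal sketch (not part of the module) of how the hive-style subdirectory
# names above are formed; the column names and key values here are assumptions
# for illustration only.
def _example_partition_subdir():
    partition_cols = ["year", "month"]  # hypothetical partition columns
    keys = (2020, 1)                    # hypothetical key values for one group
    # Mirrors the f-string join in _get_partitioned, using "/" as separator.
    return "/".join(f"{name}={val}" for name, val in zip(partition_cols, keys))
    # -> "year=2020/month=1"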
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a correlating stripe list. If a single stripe list
    # is provided rather than a list of lists of stripes, then extrapolate
    # that stripe list across all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe list for each source is specified, unless None
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None,
                path=source,
                **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)
        return df
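
# A minimal usage sketch (assumptions: a local file "example.orc" exists and
# cudf is importable); it is not part of the module above.
def _example_read_orc():
    import cudf

    # Read two hypothetical columns from the first stripe of a single source;
    # `stripes` takes one list of stripe indices per input source.
    return cudf.read_orc("example.orc", columns=["a", "b"], stripes=[[0]])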
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"
    if engine == "cudf":
        # Multiple sources are passed as a list. If a single source is passed,
        # wrap it in a list for unified processing downstream.
        if not is_list_like(path_or_buf):
            path_or_buf = [path_or_buf]

        filepaths_or_buffers = []
        for source in path_or_buf:
            if ioutils.is_directory(source, **kwargs):
                fs = ioutils._ensure_filesystem(
                    passed_filesystem=None, path=source
                )
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            tmp_source, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(tmp_source, list):
                filepaths_or_buffers.extend(tmp_source)
            else:
                filepaths_or_buffers.append(tmp_source)

        return cudf.DataFrame._from_data(
            *libjson.read_json(
                filepaths_or_buffers, dtype, lines, compression, byte_range
            )
        )
    else:
        warnings.warn(
            "Using CPU via Pandas to read JSON dataset, this may "
            "be GPU accelerated in the future"
        )
        if not ioutils.ensure_single_filepath_or_buffer(
            path_or_data=path_or_buf,
            **kwargs,
        ):
            raise NotImplementedError(
                "`read_json` does not yet support reading "
                "multiple files via pandas"
            )

        path_or_buf, compression = ioutils.get_filepath_or_buffer(
            path_or_data=path_or_buf,
            compression=compression,
            iotypes=(BytesIO, StringIO),
            **kwargs,
        )

        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df
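
# A minimal usage sketch (assumption: a local JSON Lines file "example.jsonl"
# exists); it is not part of the module above.
def _example_read_json():
    import cudf

    # The cudf engine requires JSON Lines input, so lines=True is passed.
    return cudf.read_json("example.jsonl", engine="cudf", lines=True)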
def write_to_dataset(
    df,
    root_path,
    filename=None,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <filename>.parquet
            ...
            group=valueN
                <filename>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    filename : string, default None
        The file name to use (within each partition directory). If None,
        a random uuid4 hex string will be used for each file name.
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = ioutils._ensure_filesystem(fs, root_path, **kwargs)
    fs.mkdirs(root_path, exist_ok=True)

    if partition_cols is not None and len(partition_cols) > 0:
        (
            full_paths,
            metadata_file_paths,
            grouped_df,
            part_offsets,
            _,
        ) = _get_partitioned(
            df,
            root_path,
            partition_cols,
            filename,
            fs,
            preserve_index,
            **kwargs,
        )

        if return_metadata:
            kwargs["metadata_file_path"] = metadata_file_paths
        metadata = to_parquet(
            grouped_df,
            full_paths,
            index=preserve_index,
            partition_offsets=part_offsets,
            **kwargs,
        )
    else:
        filename = filename or _generate_filename()
        full_path = fs.sep.join([root_path, filename])
        if return_metadata:
            kwargs["metadata_file_path"] = filename
        metadata = df.to_parquet(full_path, index=preserve_index, **kwargs)

    return metadata
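
# A minimal usage sketch (assumptions: a writable local directory
# "dataset_root" and hypothetical column names); it is not part of the module
# above.
def _example_write_to_dataset():
    import cudf

    df = cudf.DataFrame(
        {"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]}
    )
    # Produces dataset_root/year=2020/<uuid>.parquet and
    # dataset_root/year=2021/<uuid>.parquet
    write_to_dataset(df, "dataset_root", partition_cols=["year"])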
def write_to_dataset(
    df,
    root_path,
    filename=None,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <filename>.parquet
            ...
            group=valueN
                <filename>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    filename : string, default None
        The file name to use (within each partition directory). If None,
        a random uuid4 hex string will be used for each file name.
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = ioutils._ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # Loop through the partition groups
        for _, sub_df in enumerate(
            _get_partition_groups(
                df, partition_cols, preserve_index=preserve_index
            )
        ):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = fs.sep.join(
                [
                    "{colname}={value}".format(colname=name, value=val)
                    for name, val in zip(partition_cols, keys)
                ]
            )
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            filename = filename or uuid4().hex + ".parquet"
            full_path = fs.sep.join([prefix, filename])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)

            with fs.open(full_path, mode="wb") as fil:
                fil = ioutils.get_IOBase_writer(fil)
                if return_metadata:
                    metadata.append(
                        write_df.to_parquet(
                            fil,
                            index=preserve_index,
                            metadata_file_path=fs.sep.join([subdir, filename]),
                            **kwargs,
                        )
                    )
                else:
                    write_df.to_parquet(fil, index=preserve_index, **kwargs)

    else:
        filename = filename or uuid4().hex + ".parquet"
        full_path = fs.sep.join([root_path, filename])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=filename,
                    **kwargs,
                )
            )
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (
            merge_parquet_filemetadata(metadata)
            if len(metadata) > 1
            else metadata[0]
        )
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed; make the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source
            )
            source = ioutils.stringify_pathlike(source)
            source = fs.sep.join([source, "*.parquet"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(
            filepaths_or_buffers, format="parquet", partitioning="hive"
        )

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.split_by_row_group(filters):
                for rg_info in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                row_groups[i] = [
                    rg_id
                    for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
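
# A minimal usage sketch (assumptions: a local file "example.parquet" with a
# hypothetical integer column "x"); it is not part of the module above.
def _example_read_parquet():
    import cudf

    # Row-group filtering uses the same predicate tuples that PyArrow accepts:
    # only row groups whose statistics can satisfy x > 10 are read.
    return cudf.read_parquet(
        "example.parquet", columns=["x"], filters=[("x", ">", 10)]
    )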