def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs):
    """{docstring}"""

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.ListColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "list columns."
            )
        elif isinstance(col, cudf.core.column.StructColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Struct columns."
            )
        elif isinstance(col, cudf.core.column.CategoricalColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Categorical columns."
            )

    if isinstance(df.index, cudf.CategoricalIndex):
        raise NotImplementedError(
            "Writing to ORC format is not yet supported with "
            "Categorical columns."
        )

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.orc.write_orc(df, file_obj, compression, enable_statistics)
    else:
        libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics)
def to_orc(
    df,
    fname,
    compression=None,
    statistics="ROWGROUP",
    stripe_size_bytes=None,
    stripe_size_rows=None,
    row_index_stride=None,
    **kwargs,
):
    """{docstring}"""

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.StructColumn):
            warnings.warn(
                "Support for writing tables with struct columns is "
                "currently experimental."
            )
        if isinstance(col, cudf.core.column.CategoricalColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Categorical columns."
            )

    if isinstance(df.index, cudf.CategoricalIndex):
        raise NotImplementedError(
            "Writing to ORC format is not yet supported with "
            "Categorical columns."
        )

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            liborc.write_orc(
                df,
                file_obj,
                compression,
                statistics,
                stripe_size_bytes,
                stripe_size_rows,
                row_index_stride,
            )
    else:
        liborc.write_orc(
            df,
            path_or_buf,
            compression,
            statistics,
            stripe_size_bytes,
            stripe_size_rows,
            row_index_stride,
        )
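# Usage sketch (illustration only, not part of the library source): calling
# the to_orc writer above directly. The DataFrame contents, output file name,
# and stripe sizing are assumed placeholder values.
def _example_to_orc_usage():
    import cudf

    gdf = cudf.DataFrame({"id": [1, 2, 3], "val": [0.1, 0.2, 0.3]})
    to_orc(
        gdf,
        "example.orc",               # placeholder output path
        compression="snappy",        # None (uncompressed) is also accepted
        statistics="ROWGROUP",       # default shown in the signature above
        stripe_size_rows=1_000_000,  # illustrative stripe sizing
    )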
def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs):
    """{docstring}"""

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.orc.write_orc(df, file_obj, compression, enable_statistics)
    else:
        libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics)
def _write_parquet(
    df,
    paths,
    compression="snappy",
    index=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    int96_timestamps=False,
    row_group_size_bytes=None,
    row_group_size_rows=None,
    partitions_info=None,
    **kwargs,
):
    if is_list_like(paths) and len(paths) > 1:
        if partitions_info is None:
            raise ValueError("partition info is required for multiple paths")
        elif not is_list_like(partitions_info):
            raise ValueError(
                "partition info must be list-like for multiple paths"
            )
        elif not len(paths) == len(partitions_info):
            raise ValueError("partitions_info and paths must be of same size")
    if is_list_like(partitions_info) and len(partitions_info) > 1:
        if not is_list_like(paths):
            raise ValueError(
                "paths must be list-like when partitions_info provided"
            )

    paths_or_bufs = [
        ioutils.get_writer_filepath_or_buffer(path, mode="wb", **kwargs)
        for path in paths
    ]
    common_args = {
        "index": index,
        "compression": compression,
        "statistics": statistics,
        "metadata_file_path": metadata_file_path,
        "int96_timestamps": int96_timestamps,
        "row_group_size_bytes": row_group_size_bytes,
        "row_group_size_rows": row_group_size_rows,
        "partitions_info": partitions_info,
    }
    if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
        with ExitStack() as stack:
            fsspec_objs = [
                stack.enter_context(file) for file in paths_or_bufs
            ]
            file_objs = [
                ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs
            ]
            write_parquet_res = libparquet.write_parquet(
                df, filepaths_or_buffers=file_objs, **common_args
            )
    else:
        write_parquet_res = libparquet.write_parquet(
            df, filepaths_or_buffers=paths_or_bufs, **common_args
        )

    return write_parquet_res
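# Usage sketch (illustration only): how a caller might pair multiple output
# paths with partitions_info when invoking the internal _write_parquet helper
# above. The (start_row, num_rows) layout of each partitions_info entry is an
# assumption made for this example; the actual semantics are defined by
# libparquet.write_parquet.
def _example_write_parquet_partitions():
    import cudf

    gdf = cudf.DataFrame({"key": [0, 0, 1, 1], "val": [1, 2, 3, 4]})
    _write_parquet(
        gdf,
        paths=["part0.parquet", "part1.parquet"],  # placeholder paths
        partitions_info=[(0, 2), (2, 2)],  # one entry per path (assumed layout)
        index=False,
    )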
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    **kwargs,
):
    """{docstring}"""

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs
    )

    if index:
        from cudf import MultiIndex

        if not isinstance(df.index, MultiIndex):
            if df.index.name is None:
                df.index.name = ""
            if columns is not None:
                columns = columns.copy()
                columns.insert(0, df.index.name)
        df = df.reset_index()

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns"
            )

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
def write_to_dataset(
    df,
    root_path,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.

    For each combination of partition group and value, subdirectories are
    created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <uuid>.parquet
            ...
            group=valueN
                <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = _ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        # Loop through the partition groups
        for i, sub_df in enumerate(
            _get_partition_groups(
                df, partition_cols, preserve_index=preserve_index
            )
        ):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys,)
            subdir = fs.sep.join(
                [
                    "{colname}={value}".format(colname=name, value=val)
                    for name, val in zip(partition_cols, keys)
                ]
            )
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            outfile = guid() + ".parquet"
            full_path = fs.sep.join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            with fs.open(full_path, mode="wb") as fil:
                fil = ioutils.get_IOBase_writer(fil)
                if return_metadata:
                    metadata.append(
                        write_df.to_parquet(
                            fil,
                            index=preserve_index,
                            metadata_file_path=fs.sep.join([subdir, outfile]),
                            **kwargs,
                        )
                    )
                else:
                    write_df.to_parquet(fil, index=preserve_index, **kwargs)

    else:
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([root_path, outfile])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=outfile,
                    **kwargs,
                )
            )
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (
            merge_parquet_filemetadata(metadata)
            if len(metadata) > 1
            else metadata[0]
        )
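# Usage sketch (illustration only): writing a partitioned dataset with
# write_to_dataset above, producing the group=value directory layout described
# in its docstring. The root path and column names are placeholders.
def _example_write_to_dataset():
    import cudf

    gdf = cudf.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
    write_to_dataset(
        gdf,
        root_path="dataset_root",   # placeholder directory
        partition_cols=["group"],   # one subdirectory per distinct value
        preserve_index=False,
    )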
def to_parquet(
    df,
    path,
    engine="cudf",
    compression="snappy",
    index=None,
    partition_cols=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf":
        if partition_cols:
            write_to_dataset(
                df,
                root_path=path,
                partition_cols=partition_cols,
                preserve_index=index,
                **kwargs,
            )
            return

        # Ensure that no columns dtype is 'category'
        for col in df.columns:
            if df[col].dtype.name == "category":
                raise ValueError(
                    "'category' column dtypes are currently not "
                    "supported by the gpu accelerated parquet writer"
                )

        path_or_buf = ioutils.get_writer_filepath_or_buffer(
            path, mode="wb", **kwargs
        )
        if ioutils.is_fsspec_open_file(path_or_buf):
            with path_or_buf as file_obj:
                file_obj = ioutils.get_IOBase_writer(file_obj)
                write_parquet_res = libparquet.write_parquet(
                    df,
                    path=file_obj,
                    index=index,
                    compression=compression,
                    statistics=statistics,
                    metadata_file_path=metadata_file_path,
                )
        else:
            write_parquet_res = libparquet.write_parquet(
                df,
                path=path_or_buf,
                index=index,
                compression=compression,
                statistics=statistics,
                metadata_file_path=metadata_file_path,
            )

        return write_parquet_res

    else:
        # If index is None, set it to the expected default value of True
        if index is None:
            index = True
        pa_table = df.to_arrow(preserve_index=index)
        return pq.write_to_dataset(
            pa_table,
            root_path=path,
            partition_cols=partition_cols,
            *args,
            **kwargs,
        )
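# Usage sketch (illustration only): the two engine paths of to_parquet above.
# File names and column names are placeholders.
def _example_to_parquet_usage():
    import cudf

    gdf = cudf.DataFrame({"id": [1, 2, 3], "val": [0.1, 0.2, 0.3]})
    # GPU-accelerated writer (the default engine).
    to_parquet(gdf, "example.parquet", compression="snappy")
    # Non-cudf engine: the table is converted with to_arrow and handed to
    # pyarrow's write_to_dataset.
    to_parquet(gdf, "example_dataset", engine="pyarrow", partition_cols=["id"])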
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    **kwargs,
):
    """{docstring}"""

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs
    )

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns"
            )

    if sep == "-":
        # TODO: Remove this error once following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6699
        if any(
            isinstance(col, cudf.core.column.DatetimeColumn)
            for col in df._data.columns
        ):
            raise ValueError(
                "sep cannot be '-' when writing a datetime64 dtype to csv, "
                "refer to: https://github.com/rapidsai/cudf/issues/6699"
            )

    # TODO: Need to typecast categorical columns to the underlying
    # categories dtype to write the actual data to csv. Remove this
    # workaround once following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6661
    if any(
        isinstance(col, cudf.core.column.CategoricalColumn)
        for col in df._data.columns
    ) or isinstance(df.index, cudf.CategoricalIndex):
        df = df.copy(deep=False)
        for col_name, col in df._data.items():
            if isinstance(col, cudf.core.column.CategoricalColumn):
                df._data[col_name] = col.astype(col.cat().categories.dtype)
        if isinstance(df.index, cudf.CategoricalIndex):
            df.index = df.index.astype(df.index.categories.dtype)

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
                index=index,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
            index=index,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    encoding=None,
    compression=None,
    **kwargs,
):
    """{docstring}"""

    if not isinstance(sep, str):
        raise TypeError(f'"sep" must be string, not {type(sep).__name__}')
    elif len(sep) > 1:
        raise TypeError('"sep" must be a 1-character string')

    if encoding and encoding != "utf-8":
        error_msg = (
            f"Encoding {encoding} is not supported. "
            "Currently, only utf-8 encoding is supported."
        )
        raise NotImplementedError(error_msg)

    if compression:
        error_msg = "Writing compressed csv is not currently supported in cudf"
        raise NotImplementedError(error_msg)

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs
    )

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns"
            )

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.ListColumn):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "list columns."
            )
        elif isinstance(col, cudf.core.column.StructColumn):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "Struct columns."
            )

    # TODO: Need to typecast categorical columns to the underlying
    # categories dtype to write the actual data to csv. Remove this
    # workaround once following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6661
    if any(
        isinstance(col, cudf.core.column.CategoricalColumn)
        for col in df._data.columns
    ) or isinstance(df.index, cudf.CategoricalIndex):
        df = df.copy(deep=False)
        for col_name, col in df._data.items():
            if isinstance(col, cudf.core.column.CategoricalColumn):
                df._data[col_name] = col.astype(col.categories.dtype)
        if isinstance(df.index, cudf.CategoricalIndex):
            df.index = df.index.astype(df.index.categories.dtype)

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
                index=index,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
            index=index,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
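# Usage sketch (illustration only): the two return modes of the to_csv writer
# above. With path_or_buf=None the CSV content is returned as a string;
# otherwise it is written to the given path or file-like object. Names and
# values are placeholders.
def _example_to_csv_usage():
    import cudf

    gdf = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    csv_text = to_csv(gdf, path_or_buf=None, index=False)  # returns a str
    to_csv(gdf, "example.csv", sep=",", na_rep="NA", chunksize=1)
    return csv_text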