Example #1
def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs):
    """{docstring}"""

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.ListColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "list columns."
            )
        elif isinstance(col, cudf.core.column.StructColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Struct columns."
            )
        elif isinstance(col, cudf.core.column.CategoricalColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Categorical columns."
            )

    if isinstance(df.index, cudf.CategoricalIndex):
        raise NotImplementedError(
            "Writing to ORC format is not yet supported with "
            "Categorical columns."
        )

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.orc.write_orc(df, file_obj, compression, enable_statistics)
    else:
        libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics)
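A minimal usage sketch for this version of the writer, assuming the call arrives via cudf.DataFrame.to_orc, which forwards these keywords; the file name is illustrative.

import cudf

df = cudf.DataFrame({"ints": [1, 2, 3], "floats": [0.1, 0.2, 0.3]})
# List, struct and categorical columns would raise NotImplementedError
# per the checks above; plain numeric columns write straight through.
df.to_orc("example.orc", compression=None, enable_statistics=True)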
Example #2
def to_orc(
    df,
    fname,
    compression=None,
    statistics="ROWGROUP",
    stripe_size_bytes=None,
    stripe_size_rows=None,
    row_index_stride=None,
    **kwargs,
):
    """{docstring}"""

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.StructColumn):
            warnings.warn(
                "Support for writing tables with struct columns is "
                "currently experimental."
            )
        if isinstance(col, cudf.core.column.CategoricalColumn):
            raise NotImplementedError(
                "Writing to ORC format is not yet supported with "
                "Categorical columns."
            )

    if isinstance(df.index, cudf.CategoricalIndex):
        raise NotImplementedError(
            "Writing to ORC format is not yet supported with "
            "Categorical columns."
        )

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            liborc.write_orc(
                df,
                file_obj,
                compression,
                statistics,
                stripe_size_bytes,
                stripe_size_rows,
                row_index_stride,
            )
    else:
        liborc.write_orc(
            df,
            path_or_buf,
            compression,
            statistics,
            stripe_size_bytes,
            stripe_size_rows,
            row_index_stride,
        )
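A hedged sketch exercising the stripe-tuning parameters added in this signature, calling the function defined above directly; the values shown are illustrative, not recommendations.

import cudf

df = cudf.DataFrame({"id": list(range(1_000_000))})
# statistics sets the granularity of column statistics ("ROWGROUP" is the
# default above); stripe_size_rows caps the number of rows per ORC stripe.
to_orc(
    df,
    "example.orc",
    compression=None,
    statistics="ROWGROUP",
    stripe_size_rows=500_000,
)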
Example #3
File: orc.py  Project: gerashegalov/cudf
def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs):
    """{docstring}"""

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=fname, mode="wb", **kwargs
    )
    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.orc.write_orc(df, file_obj, compression, enable_statistics)
    else:
        libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics)
Example #4
def _write_parquet(
    df,
    paths,
    compression="snappy",
    index=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    int96_timestamps=False,
    row_group_size_bytes=None,
    row_group_size_rows=None,
    partitions_info=None,
    **kwargs,
):
    if is_list_like(paths) and len(paths) > 1:
        if partitions_info is None:
            raise ValueError("partition info is required for multiple paths")
        elif not is_list_like(partitions_info):
            raise ValueError(
                "partition info must be list-like for multiple paths")
        elif not len(paths) == len(partitions_info):
            raise ValueError("partitions_info and paths must be of same size")
    if is_list_like(partitions_info) and len(partitions_info) > 1:
        if not is_list_like(paths):
            raise ValueError(
                "paths must be list-like when partitions_info provided")

    paths_or_bufs = [
        ioutils.get_writer_filepath_or_buffer(path, mode="wb", **kwargs)
        for path in paths
    ]
    common_args = {
        "index": index,
        "compression": compression,
        "statistics": statistics,
        "metadata_file_path": metadata_file_path,
        "int96_timestamps": int96_timestamps,
        "row_group_size_bytes": row_group_size_bytes,
        "row_group_size_rows": row_group_size_rows,
        "partitions_info": partitions_info,
    }
    if all([ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs]):
        with ExitStack() as stack:
            fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs]
            file_objs = [
                ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs
            ]
            write_parquet_res = libparquet.write_parquet(
                df, filepaths_or_buffers=file_objs, **common_args)
    else:
        write_parquet_res = libparquet.write_parquet(
            df, filepaths_or_buffers=paths_or_bufs, **common_args)

    return write_parquet_res
Example #5
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    **kwargs,
):
    """{docstring}"""

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs)

    if index:
        from cudf import MultiIndex

        if not isinstance(df.index, MultiIndex):
            if df.index.name is None:
                df.index.name = ""
            if columns is not None:
                columns = columns.copy()
                columns.insert(0, df.index.name)
        df = df.reset_index()

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns")

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
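A short sketch of the string-return path, assuming the call arrives via cudf.DataFrame.to_csv: with path_or_buf left as None the writer fills a StringIO and returns its contents.

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})
# No path given, so the CSV text itself comes back as a Python string.
csv_text = df.to_csv(index=False)
print(csv_text)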
Example #6
def write_to_dataset(
    df,
    root_path,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <uuid>.parquet
            ...
            group=valueN
                <uuid>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = _ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:

        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        #  Loop through the partition groups
        for i, sub_df in enumerate(
                _get_partition_groups(df,
                                      partition_cols,
                                      preserve_index=preserve_index)):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple([sub_df[col].iloc[0] for col in partition_cols])
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = fs.sep.join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            outfile = guid() + ".parquet"
            full_path = fs.sep.join([prefix, outfile])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            with fs.open(full_path, mode="wb") as fil:
                fil = ioutils.get_IOBase_writer(fil)
                if return_metadata:
                    metadata.append(
                        write_df.to_parquet(
                            fil,
                            index=preserve_index,
                            metadata_file_path=fs.sep.join([subdir, outfile]),
                            **kwargs,
                        ))
                else:
                    write_df.to_parquet(fil, index=preserve_index, **kwargs)

    else:
        outfile = guid() + ".parquet"
        full_path = fs.sep.join([root_path, outfile])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=outfile,
                    **kwargs,
                ))
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (merge_parquet_filemetadata(metadata)
                if len(metadata) > 1 else metadata[0])
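A hedged sketch of the layout this produces, driven through cudf.DataFrame.to_parquet (which, per the to_parquet wrapper in the next example, routes partition_cols to write_to_dataset); the directory name is illustrative.

import cudf

df = cudf.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
# Writes dataset_dir/group=a/<uuid>.parquet and
# dataset_dir/group=b/<uuid>.parquet, dropping "group" from the file data.
df.to_parquet("dataset_dir", partition_cols=["group"])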
Example #7
def to_parquet(
    df,
    path,
    engine="cudf",
    compression="snappy",
    index=None,
    partition_cols=None,
    statistics="ROWGROUP",
    metadata_file_path=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf":
        if partition_cols:
            write_to_dataset(
                df,
                root_path=path,
                partition_cols=partition_cols,
                preserve_index=index,
                **kwargs,
            )
            return

        # Ensure that no column's dtype is 'category'
        for col in df.columns:
            if df[col].dtype.name == "category":
                raise ValueError(
                    "'category' column dtypes are currently not " +
                    "supported by the gpu accelerated parquet writer")

        path_or_buf = ioutils.get_writer_filepath_or_buffer(path,
                                                            mode="wb",
                                                            **kwargs)
        if ioutils.is_fsspec_open_file(path_or_buf):
            with path_or_buf as file_obj:
                file_obj = ioutils.get_IOBase_writer(file_obj)
                write_parquet_res = libparquet.write_parquet(
                    df,
                    path=file_obj,
                    index=index,
                    compression=compression,
                    statistics=statistics,
                    metadata_file_path=metadata_file_path,
                )
        else:
            write_parquet_res = libparquet.write_parquet(
                df,
                path=path_or_buf,
                index=index,
                compression=compression,
                statistics=statistics,
                metadata_file_path=metadata_file_path,
            )

        return write_parquet_res

    else:

        # If index is not given, set it to the expected default of True
        if index is None:
            index = True

        pa_table = df.to_arrow(preserve_index=index)
        return pq.write_to_dataset(
            pa_table,
            root_path=path,
            partition_cols=partition_cols,
            *args,
            **kwargs,
        )
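A minimal sketch of the single-file path with the default cudf engine, via cudf.DataFrame.to_parquet; the output name is illustrative.

import cudf

df = cudf.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})
# engine="cudf" selects the GPU writer above; 'category' dtypes would
# raise ValueError per the check before writing.
df.to_parquet("example.parquet", engine="cudf", compression="snappy")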
Example #8
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    **kwargs,
):
    """{docstring}"""

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs)

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns")

    if sep == "-":
        # TODO: Remove this error once following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6699
        if any(
                isinstance(col, cudf.core.column.DatetimeColumn)
                for col in df._data.columns):
            raise ValueError(
                "sep cannot be '-' when writing a datetime64 dtype to csv, "
                "refer to: https://github.com/rapidsai/cudf/issues/6699")

    # TODO: Need to typecast categorical columns to the underlying
    # categories dtype to write the actual data to csv. Remove this
    # workaround once following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6661
    if any(
            isinstance(col, cudf.core.column.CategoricalColumn)
            for col in df._data.columns) or isinstance(df.index,
                                                       cudf.CategoricalIndex):
        df = df.copy(deep=False)
        for col_name, col in df._data.items():
            if isinstance(col, cudf.core.column.CategoricalColumn):
                df._data[col_name] = col.astype(col.cat().categories.dtype)

        if isinstance(df.index, cudf.CategoricalIndex):
            df.index = df.index.astype(df.index.categories.dtype)

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
                index=index,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
            index=index,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
Example #9
def to_csv(
    df,
    path_or_buf=None,
    sep=",",
    na_rep="",
    columns=None,
    header=True,
    index=True,
    line_terminator="\n",
    chunksize=None,
    encoding=None,
    compression=None,
    **kwargs,
):
    """{docstring}"""

    if not isinstance(sep, str):
        raise TypeError(f'"sep" must be string, not {type(sep).__name__}')
    elif len(sep) > 1:
        raise TypeError('"sep" must be a 1-character string')

    if encoding and encoding != "utf-8":
        error_msg = (
            f"Encoding {encoding} is not supported. "
            + "Currently, only utf-8 encoding is supported."
        )
        raise NotImplementedError(error_msg)

    if compression:
        error_msg = "Writing compressed csv is not currently supported in cudf"
        raise NotImplementedError(error_msg)

    return_as_string = False
    if path_or_buf is None:
        path_or_buf = StringIO()
        return_as_string = True

    path_or_buf = ioutils.get_writer_filepath_or_buffer(
        path_or_data=path_or_buf, mode="w", **kwargs
    )

    if columns is not None:
        try:
            df = df[columns]
        except KeyError:
            raise NameError(
                "Dataframe doesn't have the labels provided in columns"
            )

    for col in df._data.columns:
        if isinstance(col, cudf.core.column.ListColumn):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "list columns."
            )
        elif isinstance(col, cudf.core.column.StructColumn):
            raise NotImplementedError(
                "Writing to csv format is not yet supported with "
                "Struct columns."
            )

    # TODO: Need to typecast categorical columns to the underlying
    # categories dtype to write the actual data to csv. Remove this
    # workaround once following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6661
    if any(
        isinstance(col, cudf.core.column.CategoricalColumn)
        for col in df._data.columns
    ) or isinstance(df.index, cudf.CategoricalIndex):
        df = df.copy(deep=False)
        for col_name, col in df._data.items():
            if isinstance(col, cudf.core.column.CategoricalColumn):
                df._data[col_name] = col.astype(col.categories.dtype)

        if isinstance(df.index, cudf.CategoricalIndex):
            df.index = df.index.astype(df.index.categories.dtype)

    rows_per_chunk = chunksize if chunksize else len(df)

    if ioutils.is_fsspec_open_file(path_or_buf):
        with path_or_buf as file_obj:
            file_obj = ioutils.get_IOBase_writer(file_obj)
            libcudf.csv.write_csv(
                df,
                path_or_buf=file_obj,
                sep=sep,
                na_rep=na_rep,
                header=header,
                line_terminator=line_terminator,
                rows_per_chunk=rows_per_chunk,
                index=index,
            )
    else:
        libcudf.csv.write_csv(
            df,
            path_or_buf=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            line_terminator=line_terminator,
            rows_per_chunk=rows_per_chunk,
            index=index,
        )

    if return_as_string:
        path_or_buf.seek(0)
        return path_or_buf.read()
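A brief sketch of the argument validation added in this version, calling the function defined above directly; only utf-8 encoding and uncompressed output are accepted.

import cudf

df = cudf.DataFrame({"a": [1, 2]})
# The encoding check runs before any file is opened, so nothing is written.
try:
    to_csv(df, "out.csv", encoding="latin-1")
except NotImplementedError as err:
    print(err)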