Example #1
def _get_partitioned(
    df,
    root_path,
    partition_cols,
    filename=None,
    fs=None,
    preserve_index=False,
    **kwargs,
):
    fs = ioutils._ensure_filesystem(fs, root_path, **kwargs)
    fs.mkdirs(root_path, exist_ok=True)
    if not (set(df._data) - set(partition_cols)):
        raise ValueError("No data left to save outside partition columns")

    part_names, part_offsets, _, grouped_df = df.groupby(
        partition_cols)._grouped()
    if not preserve_index:
        grouped_df.reset_index(drop=True, inplace=True)
    grouped_df.drop(columns=partition_cols, inplace=True)
    # Copy the entire keys df in one operation rather than using iloc
    part_names = part_names.to_pandas().to_frame(index=False)

    full_paths = []
    metadata_file_paths = []
    for keys in part_names.itertuples(index=False):
        subdir = fs.sep.join(
            [f"{name}={val}" for name, val in zip(partition_cols, keys)])
        prefix = fs.sep.join([root_path, subdir])
        fs.mkdirs(prefix, exist_ok=True)
        filename = filename or _generate_filename()
        full_path = fs.sep.join([prefix, filename])
        full_paths.append(full_path)
        metadata_file_paths.append(fs.sep.join([subdir, filename]))

    return full_paths, metadata_file_paths, grouped_df, part_offsets, filename
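
A minimal usage sketch of the helper above. `_get_partitioned` and `_generate_filename` are private helpers inside cudf's parquet module, so the call below is illustrative only; the DataFrame contents and the root path are invented.

import cudf

df = cudf.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]})

# Group rows by "year" and build one output path per partition value.
full_paths, meta_paths, grouped_df, offsets, fname = _get_partitioned(
    df, "dataset_root", partition_cols=["year"]
)
# full_paths would look like:
#   dataset_root/year=2020/<uuid>.parquet
#   dataset_root/year=2021/<uuid>.parquet
# part_offsets marks where each partition's rows start within grouped_df.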
Example #2
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a corresponding stripe list. If a single stripe
    # list is provided rather than a list of lists of stripes, extrapolate
    # that stripe list across all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe list is specified for each source
        if len(stripes) != len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single a "
                "single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
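
A hedged usage sketch for the reader above. The file names are hypothetical, and `stripes` is given as one stripe list per input source, matching the normalization at the top of the function.

import cudf

# GPU path: read selected columns and stripes from two ORC sources.
df = cudf.read_orc(
    ["part0.orc", "part1.orc"],      # hypothetical file names
    columns=["id", "ts"],
    stripes=[[0, 1], [0]],           # one stripe list per source
)

# CPU fallback via PyArrow (single source only).
df_cpu = cudf.read_orc("part0.orc", engine="pyarrow")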
Example #3
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"
    if engine == "cudf":
        # Multiple sources are passed as a list. If a single source is passed,
        # wrap it in a list for unified processing downstream.
        if not is_list_like(path_or_buf):
            path_or_buf = [path_or_buf]

        filepaths_or_buffers = []
        for source in path_or_buf:
            if ioutils.is_directory(source, **kwargs):
                fs = ioutils._ensure_filesystem(passed_filesystem=None,
                                                path=source)
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            tmp_source, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(tmp_source, list):
                filepaths_or_buffers.extend(tmp_source)
            else:
                filepaths_or_buffers.append(tmp_source)

        return cudf.DataFrame._from_data(*libjson.read_json(
            filepaths_or_buffers, dtype, lines, compression, byte_range))
    else:
        warnings.warn("Using CPU via Pandas to read JSON dataset, this may "
                      "be GPU accelerated in the future")

        if not ioutils.ensure_single_filepath_or_buffer(
                path_or_data=path_or_buf,
                **kwargs,
        ):
            raise NotImplementedError(
                "`read_json` does not yet support reading "
                "multiple files via pandas")

        path_or_buf, compression = ioutils.get_filepath_or_buffer(
            path_or_data=path_or_buf,
            compression=compression,
            iotypes=(BytesIO, StringIO),
            **kwargs,
        )

        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df
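
A short sketch of the two engine paths; the file names are placeholders. With engine="auto", JSON Lines input is routed to the GPU reader and everything else falls back to pandas.

import cudf

# GPU reader: requires JSON Lines (one record per line).
gdf = cudf.read_json("records.jsonl", engine="cudf", lines=True)

# CPU fallback through pandas for regular (non-lines) JSON.
gdf2 = cudf.read_json("nested.json", engine="pandas")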
Example #4
def write_to_dataset(
    df,
    root_path,
    filename=None,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <filename>.parquet
            ...
            group=valueN
                <filename>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    filename : string, default None
        The file name to use (within each partition directory). If None,
        a random uuid4 hex string will be used for each file name.
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = ioutils._ensure_filesystem(fs, root_path, **kwargs)
    fs.mkdirs(root_path, exist_ok=True)

    if partition_cols is not None and len(partition_cols) > 0:

        (
            full_paths,
            metadata_file_paths,
            grouped_df,
            part_offsets,
            _,
        ) = _get_partitioned(
            df,
            root_path,
            partition_cols,
            filename,
            fs,
            preserve_index,
            **kwargs,
        )

        if return_metadata:
            kwargs["metadata_file_path"] = metadata_file_paths
        metadata = to_parquet(
            grouped_df,
            full_paths,
            index=preserve_index,
            partition_offsets=part_offsets,
            **kwargs,
        )

    else:
        filename = filename or _generate_filename()
        full_path = fs.sep.join([root_path, filename])
        if return_metadata:
            kwargs["metadata_file_path"] = filename
        metadata = df.to_parquet(full_path, index=preserve_index, **kwargs)

    return metadata
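
A hedged usage sketch of the partitioned writer above; the column names and output directory are invented for illustration.

import cudf

df = cudf.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

# Writes out_dataset/group=a/<uuid>.parquet and out_dataset/group=b/<uuid>.parquet,
# returning footer metadata because return_metadata=True.
meta = write_to_dataset(
    df,
    root_path="out_dataset",
    partition_cols=["group"],
    return_metadata=True,
)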
Example #5
def write_to_dataset(
    df,
    root_path,
    filename=None,
    partition_cols=None,
    fs=None,
    preserve_index=False,
    return_metadata=False,
    **kwargs,
):
    """Wraps `to_parquet` to write partitioned Parquet datasets.
    For each combination of partition group and value,
    subdirectories are created as follows:

    .. code-block:: bash

        root_dir/
            group=value1
                <filename>.parquet
            ...
            group=valueN
                <filename>.parquet

    Parameters
    ----------
    df : cudf.DataFrame
    root_path : string,
        The root directory of the dataset
    filename : string, default None
        The file name to use (within each partition directory). If None,
        a random uuid4 hex string will be used for each file name.
    fs : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool, default False
        Preserve index values in each parquet file.
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    return_metadata : bool, default False
        Return parquet metadata for written data. Returned metadata will
        include the file-path metadata (relative to `root_path`).
    **kwargs : dict,
        kwargs for to_parquet function.
    """

    fs = ioutils._ensure_filesystem(fs, root_path)
    fs.mkdirs(root_path, exist_ok=True)
    metadata = []

    if partition_cols is not None and len(partition_cols) > 0:

        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError("No data left to save outside partition columns")

        #  Loop through the partition groups
        for _, sub_df in enumerate(
                _get_partition_groups(df,
                                      partition_cols,
                                      preserve_index=preserve_index)):
            if sub_df is None or len(sub_df) == 0:
                continue
            keys = tuple(sub_df[col].iloc[0] for col in partition_cols)
            subdir = fs.sep.join([
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            prefix = fs.sep.join([root_path, subdir])
            fs.mkdirs(prefix, exist_ok=True)
            filename = filename or uuid4().hex + ".parquet"
            full_path = fs.sep.join([prefix, filename])
            write_df = sub_df.copy(deep=False)
            write_df.drop(columns=partition_cols, inplace=True)
            with fs.open(full_path, mode="wb") as fil:
                fil = ioutils.get_IOBase_writer(fil)
                if return_metadata:
                    metadata.append(
                        write_df.to_parquet(
                            fil,
                            index=preserve_index,
                            metadata_file_path=fs.sep.join([subdir, filename]),
                            **kwargs,
                        ))
                else:
                    write_df.to_parquet(fil, index=preserve_index, **kwargs)

    else:
        filename = filename or uuid4().hex + ".parquet"
        full_path = fs.sep.join([root_path, filename])
        if return_metadata:
            metadata.append(
                df.to_parquet(
                    full_path,
                    index=preserve_index,
                    metadata_file_path=filename,
                    **kwargs,
                ))
        else:
            df.to_parquet(full_path, index=preserve_index, **kwargs)

    if metadata:
        return (merge_parquet_filemetadata(metadata)
                if len(metadata) > 1 else metadata[0])
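
This older variant writes each partition group itself through fs.open, so it can be pointed at an explicit fsspec filesystem. A sketch under that assumption; the DataFrame and output path are placeholders, and a remote (e.g. s3) filesystem could be passed instead of the local one.

import cudf
import fsspec

df = cudf.DataFrame({"group": ["a", "b"], "value": [1, 2]})

# Local filesystem here; any fsspec-compatible filesystem works the same way.
fs = fsspec.filesystem("file")
write_to_dataset(df, "out_dataset", partition_cols=["group"], fs=fs)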
Example #6
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Normalize the input
    # into the list of lists expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(passed_filesystem=None,
                                            path=source)
            source = ioutils.stringify_pathlike(source)
            source = fs.sep.join([source, "*.parquet"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.split_by_row_group(filters):
                for rg_info in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Evaluate eagerly; a lazy `filter` object would reference
                # row_groups[i] after it has been reassigned.
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
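
A hedged sketch of the reader above: a DNF-style `filters` list is converted to a pyarrow dataset expression and used to prune row groups before the GPU read. The path and column names are hypothetical.

import cudf

df = cudf.read_parquet(
    "dataset_root",                    # a directory is expanded to *.parquet
    columns=["id", "value"],
    filters=[("year", "=", 2021)],     # list-of-tuples (DNF) filter
)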