Exemplo n.º 1
0
def read_orc_statistics(
    filepath_or_buffer, columns=None, **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by project tooling not visible
    # here -- confirm before replacing it with literal text.
    #
    # Read the file-level and stripe-level column statistics of one ORC
    # source.
    #
    # Parameters
    #   filepath_or_buffer: a single source, forwarded to the ``ioutils``
    #       helpers together with ``**kwargs``.
    #   columns: optional container of column names; when given, only
    #       columns whose decoded name is ``in columns`` are reported.
    #
    # Returns
    #   ``(file_statistics, stripes_statistics)`` where ``file_statistics``
    #   maps column name -> parsed statistics and ``stripes_statistics`` is
    #   a list with one such mapping per stripe.  Returns ``None`` as soon
    #   as any selected column has falsy (empty/missing) parsed statistics.
    #
    # Raises
    #   NotImplementedError: the input is not a single file path or buffer.
    #   ValueError: the resolved source reports a compression.

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc_statistics` does not support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case was silently ignored.
        raise ValueError("URL content-encoding decompression is not supported")

    # Read in statistics and unpack
    (
        column_names,
        raw_file_statistics,
        raw_stripes_statistics,
    ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer)

    # Column names arrive as bytes; decode them once for use as dict keys.
    column_names = [
        column_name.decode("utf-8") for column_name in column_names
    ]

    # A single ColumnStatistics message object is handed to every parse call.
    cs = cs_pb2.ColumnStatistics()

    # File-level statistics, restricted to the requested columns.
    file_statistics = {
        column_names[i]: _parse_column_statistics(cs, raw_file_stats)
        for i, raw_file_stats in enumerate(raw_file_statistics)
        if columns is None or column_names[i] in columns
    }
    if not all(file_statistics.values()):
        # At least one selected column has no usable statistics.
        return None

    # Stripe-level statistics: one mapping per stripe, same column filter.
    stripes_statistics = []
    for raw_stripe_statistics in raw_stripes_statistics:
        stripe_statistics = {
            column_names[i]: _parse_column_statistics(cs, raw_stripe_stats)
            for i, raw_stripe_stats in enumerate(raw_stripe_statistics)
            if columns is None or column_names[i] in columns
        }
        if not all(stripe_statistics.values()):
            return None
        stripes_statistics.append(stripe_statistics)

    return file_statistics, stripes_statistics
Exemplo n.º 2
0
def read_orc_statistics(
    filepaths_or_buffers, columns=None, **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by project tooling not visible
    # here -- confirm before replacing it with literal text.
    #
    # Read file-level and stripe-level column statistics from several ORC
    # sources.
    #
    # Parameters
    #   filepaths_or_buffers: iterable of sources; each element is passed
    #       to ``ioutils.get_filepath_or_buffer`` together with ``**kwargs``.
    #   columns: optional container of column names; when given, only
    #       columns whose decoded name is ``in columns`` are reported.
    #
    # Returns
    #   ``(files_statistics, stripes_statistics)``: two flat lists of
    #   ``{column name: parsed statistics}`` mappings.  A file whose
    #   file-level statistics contain a falsy entry is skipped entirely
    #   (including its stripes); an individual stripe with a falsy entry
    #   is skipped on its own.
    #
    # Raises
    #   ValueError: a resolved source reports a compression.

    files_statistics = []
    stripes_statistics = []
    for source in filepaths_or_buffers:
        filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            # The exception used to be constructed but never raised, so the
            # unsupported case was silently ignored.
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )

        # Read in statistics and unpack
        (
            column_names,
            raw_file_statistics,
            raw_stripes_statistics,
        ) = liborc.read_raw_orc_statistics(filepath_or_buffer)

        # Column names arrive as bytes; decode them once per file.
        column_names = [
            column_name.decode("utf-8") for column_name in column_names
        ]

        # A single ColumnStatistics message object is handed to every
        # parse call for this file.
        cs = cs_pb2.ColumnStatistics()

        # File-level statistics, restricted to the requested columns.
        file_statistics = {
            column_names[i]: _parse_column_statistics(cs, raw_file_stats)
            for i, raw_file_stats in enumerate(raw_file_statistics)
            if columns is None or column_names[i] in columns
        }
        if not all(file_statistics.values()):
            # Skip this file (and therefore its stripes) altogether.
            continue
        files_statistics.append(file_statistics)

        # Stripe-level statistics, same column filter.
        for raw_stripe_statistics in raw_stripes_statistics:
            stripe_statistics = {
                column_names[i]: _parse_column_statistics(
                    cs, raw_stripe_stats
                )
                for i, raw_stripe_stats in enumerate(raw_stripe_statistics)
                if columns is None or column_names[i] in columns
            }
            if not all(stripe_statistics.values()):
                # Skip only this stripe; later stripes are still processed.
                continue
            stripes_statistics.append(stripe_statistics)

    return files_statistics, stripes_statistics
Exemplo n.º 3
0
def read_orc_statistics(
    filepath_or_buffer,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring above is a format placeholder and is kept
    # verbatim; presumably it is filled in by project tooling not visible
    # here -- confirm before replacing it with literal text.
    #
    # Read the file-level and stripe-level column statistics of one ORC
    # source.
    #
    # Parameters
    #   filepath_or_buffer: the source, forwarded to
    #       ``ioutils.get_filepath_or_buffer`` together with ``**kwargs``.
    #
    # Returns
    #   ``(file_statistics, stripes_statistics)`` where ``file_statistics``
    #   maps column name -> parsed statistics and ``stripes_statistics`` is
    #   a list with one such mapping per stripe.  Returns ``None`` when the
    #   reader yields nothing, or as soon as any column has falsy parsed
    #   statistics.
    #
    # Raises
    #   ValueError: the resolved source reports a compression.

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        # The exception used to be constructed but never raised, so the
        # unsupported case was silently ignored.
        raise ValueError("URL content-encoding decompression is not supported")

    # Read in statistics and unpack
    statistics = libcudf.orc.read_orc_statistics(filepath_or_buffer)
    if not statistics:
        return None
    # Layout: column names, file-level stats, then one entry per stripe.
    (
        raw_column_names,
        raw_file_statistics,
        *raw_stripes_statistics,
    ) = statistics

    # Decode the byte names once instead of once per column per stripe.
    column_names = [name.decode("utf-8") for name in raw_column_names]

    # A single ColumnStatistics message object is handed to every parse call.
    cs = cs_pb2.ColumnStatistics()

    # File-level statistics.
    file_statistics = {}
    for i, raw_file_stats in enumerate(raw_file_statistics):
        parsed_statistics = _parse_column_statistics(cs, raw_file_stats)
        if not parsed_statistics:
            return None
        file_statistics[column_names[i]] = parsed_statistics

    # Stripe-level statistics: one mapping per stripe.
    stripes_statistics = []
    for raw_stripe_statistics in raw_stripes_statistics:
        stripe_statistics = {}
        for i, raw_stripe_stats in enumerate(raw_stripe_statistics):
            parsed_statistics = _parse_column_statistics(cs, raw_stripe_stats)
            if not parsed_statistics:
                return None
            stripe_statistics[column_names[i]] = parsed_statistics
        stripes_statistics.append(stripe_statistics)

    return file_statistics, stripes_statistics