Exemplo n.º 1
0
def _filter_stripes(
    filters, filepath_or_buffer, stripes=None, skip_rows=None, num_rows=None
):
    """Pick the stripes whose statistics may satisfy ``filters``.

    Parameters
    ----------
    filters
        Predicate handed to ``ioutils._prepare_filters``; the prepared form
        is iterated as conjunctions of ``(column, op, value)`` triples.
    filepath_or_buffer
        One source or a list-like of sources. A single source is wrapped in
        a list before the statistics are read.
    stripes : optional
        When given, only stripe indices contained in it are considered.
    skip_rows : int, optional
        Stripes that end at or before this row offset are dropped.
    num_rows : int, optional
        Stripes that start at or after ``skip_rows + num_rows`` are dropped
        (``skip_rows`` counts as 0 here when it is ``None``).

    Returns
    -------
    list of list of int
        One list of selected stripe indices per file whose file-level
        statistics pass the filters. Files rejected at file level contribute
        no entry, so positions do not necessarily line up with the inputs.
    """
    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Prepare filters
    filters = ioutils._prepare_filters(filters)

    # Get columns relevant to filtering
    columns_in_predicate = [
        col for conjunction in filters for (col, op, val) in conjunction
    ]

    # Read and parse file-level and stripe-level statistics
    file_statistics, stripes_statistics = read_orc_statistics(
        filepath_or_buffer, columns_in_predicate
    )

    # Exclusive upper bound of the requested row window. It is computed once
    # from the caller's ``skip_rows``; the argument itself is never
    # overwritten, otherwise the bound would shrink to ``num_rows`` after the
    # first retained stripe and valid stripes would be discarded.
    if num_rows is None:
        window_end = None
    else:
        window_end = (0 if skip_rows is None else skip_rows) + num_rows

    file_stripe_map = []
    for file_stat in file_statistics:
        # Filter using file-level statistics
        if not ioutils._apply_filters(filters, file_stat):
            continue

        # Filter using stripe-level statistics
        # NOTE(review): every file walks the complete ``stripes_statistics``
        # sequence; confirm against ``read_orc_statistics`` whether it should
        # be restricted to the stripes of the current file.
        selected_stripes = []
        num_rows_scanned = 0
        for i, stripe_statistics in enumerate(stripes_statistics):
            num_rows_before_stripe = num_rows_scanned
            # The "number_of_values" entry of the first column's statistics
            # is used as the number of rows held by this stripe.
            num_rows_scanned += next(iter(stripe_statistics.values()))[
                "number_of_values"
            ]
            if stripes is not None and i not in stripes:
                continue
            # Stripe lies entirely inside the skipped prefix.
            if skip_rows is not None and num_rows_scanned <= skip_rows:
                continue
            # Stripe starts at or beyond the end of the requested window.
            if window_end is not None and num_rows_before_stripe >= window_end:
                continue
            if ioutils._apply_filters(filters, stripe_statistics):
                selected_stripes.append(i)

        file_stripe_map.append(selected_stripes)

    return file_stripe_map
Exemplo n.º 2
0
def _filter_stripes(
    filters, filepath_or_buffer, stripes=None, skip_rows=None, num_rows=None
):
    """Pick the stripes whose statistics may satisfy ``filters``.

    Parameters
    ----------
    filters
        Predicate handed to ``ioutils._prepare_filters``; the prepared form
        is iterated as conjunctions of ``(column, op, value)`` triples.
    filepath_or_buffer
        Source forwarded unchanged to ``read_orc_statistics``.
    stripes : optional
        When given, only stripe indices contained in it are considered.
    skip_rows : int, optional
        Stripes that end at or before this row offset are dropped.
    num_rows : int, optional
        Stripes that start at or after ``skip_rows + num_rows`` are dropped
        (``skip_rows`` counts as 0 here when it is ``None``).

    Returns
    -------
    list of int
        Indices of the selected stripes; an empty list when the file-level
        statistics already rule out the filters.
    """
    # Prepare filters
    filters = ioutils._prepare_filters(filters)

    # Get columns relevant to filtering
    columns_in_predicate = [
        col for conjunction in filters for (col, op, val) in conjunction
    ]

    # Read and parse file-level and stripe-level statistics
    file_statistics, stripes_statistics = read_orc_statistics(
        filepath_or_buffer, columns_in_predicate
    )

    # Filter using file-level statistics
    if not ioutils._apply_filters(filters, file_statistics):
        return []

    # Exclusive upper bound of the requested row window. It is computed once
    # from the caller's ``skip_rows``; the argument itself is never
    # overwritten, otherwise the bound would shrink to ``num_rows`` after the
    # first retained stripe and valid stripes would be discarded.
    if num_rows is None:
        window_end = None
    else:
        window_end = (0 if skip_rows is None else skip_rows) + num_rows

    # Filter using stripe-level statistics
    selected_stripes = []
    num_rows_scanned = 0
    for i, stripe_statistics in enumerate(stripes_statistics):
        num_rows_before_stripe = num_rows_scanned
        # The "number_of_values" entry of the first column's statistics is
        # used as the number of rows held by this stripe.
        num_rows_scanned += next(iter(stripe_statistics.values()))[
            "number_of_values"
        ]
        if stripes is not None and i not in stripes:
            continue
        # Stripe lies entirely inside the skipped prefix.
        if skip_rows is not None and num_rows_scanned <= skip_rows:
            continue
        # Stripe starts at or beyond the end of the requested window.
        if window_end is not None and num_rows_before_stripe >= window_end:
            continue
        if ioutils._apply_filters(filters, stripe_statistics):
            selected_stripes.append(i)

    return selected_stripes