Пример #1
0
def test_calculate_chunk_offsets():
    """Check ``_calculate_chunk_offsets`` on one- and two-chunk arrays."""
    # Each case pairs a chunk layout with the offsets expected for it.
    cases = [
        ([[1, 1, 1]], [0]),
        ([[1], [1, 1]], [0, 1]),
        ([[1, 1], [1]], [0, 2]),
    ]
    for chunks, expected in cases:
        chunked = pa.chunked_array(chunks)
        npt.assert_array_equal(_calculate_chunk_offsets(chunked),
                               np.array(expected))
Пример #2
0
def _text_cat_chunked_mixed(a: pa.ChunkedArray,
                            b: pa.Array) -> pa.ChunkedArray:
    """Apply ``_text_cat`` chunk by chunk to a chunked ``a`` and a plain ``b``.

    Every chunk of ``a`` is paired with the slice of ``b`` that begins at the
    chunk's offset (as reported by ``_calculate_chunk_offsets``) and spans the
    chunk's length. The per-chunk results are gathered into a new chunked
    array.
    """
    starts = _calculate_chunk_offsets(a)
    pieces = [
        _text_cat(part, b[start:start + len(part)])
        for part, start in zip(a.iterchunks(), starts)
    ]
    return pa.chunked_array(pieces)
Пример #3
0
def test_in_chunk_offsets(data: List[List[int]]):
    """Feeding an array's own chunk offsets back in must give a valid selection."""
    chunked = pa.chunked_array(data, type=pa.int64())
    # Simple case: the array's actual chunk offsets are used as the query.
    chunk_starts = list(_calculate_chunk_offsets(chunked))
    check_valid_in_offsets(chunked, _in_chunk_offsets(chunked, chunk_starts))
Пример #4
0
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Apply ``_text_cat`` chunk by chunk to a plain ``a`` and a chunked ``b``.

    Every chunk of ``b`` is paired with the slice of ``a`` that begins at the
    chunk's offset (as reported by ``_calculate_chunk_offsets``) and spans the
    chunk's length. The per-chunk results are gathered into a new chunked
    array.
    """
    starts = _calculate_chunk_offsets(b)
    pieces = [
        _text_cat(a[start:start + len(part)], part)
        for part, start in zip(b.iterchunks(), starts)
    ]
    return pa.chunked_array(pieces)
Пример #5
0
def apply_binary_str(
    a: Union[pa.Array, pa.ChunkedArray],
    b: Union[pa.Array, pa.ChunkedArray],
    *,
    func: Callable,
    output_dtype,
    parallel: bool = False,
) -> Union[pa.Array, pa.ChunkedArray]:
    """
    Apply an element-wise numba-jitted function on two Arrow columns.

    The supplied function must return a numpy-compatible scalar.
    Chunking of the inputs is resolved here; the per-array work (including,
    per the original contract, the handling of missing data) is delegated to
    ``_apply_binary_str_array``.

    Parameters
    ----------
    a, b
        Left and right operands. Each may be a ``pa.Array`` or a
        ``pa.ChunkedArray``; both must have the same length.
    func
        Function forwarded unchanged to ``_apply_binary_str_array``.
    output_dtype
        Forwarded unchanged to ``_apply_binary_str_array``.
    parallel
        Forwarded unchanged to ``_apply_binary_str_array``.

    Returns
    -------
    The result of ``_apply_binary_str_array`` when both inputs are plain
    arrays, otherwise a ``pa.ChunkedArray`` assembled from per-chunk results.

    Raises
    ------
    ValueError
        If the inputs differ in length, or if either operand is neither a
        ``pa.Array`` nor a ``pa.ChunkedArray``.
    """
    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")

    # Validate both operands up front so each error names the operand that is
    # actually at fault (the left one is checked first).
    if not isinstance(a, (pa.Array, pa.ChunkedArray)):
        raise ValueError(f"left operand has unsupported type {type(a)}")
    if not isinstance(b, (pa.Array, pa.ChunkedArray)):
        raise ValueError(f"right operand has unsupported type {type(b)}")

    def apply(left, right):
        # Single place where the keyword arguments are forwarded.
        return _apply_binary_str_array(
            left,
            right,
            func=func,
            output_dtype=output_dtype,
            parallel=parallel,
        )

    a_is_chunked = isinstance(a, pa.ChunkedArray)
    b_is_chunked = isinstance(b, pa.ChunkedArray)

    if not a_is_chunked and not b_is_chunked:
        return apply(a, b)

    new_chunks: List[pa.Array]
    if a_is_chunked and b_is_chunked:
        # The chunk layouts of a and b may differ. Each returned entry is
        # used as (chunk index, start within chunk, length), and the two
        # lists are walked in lockstep.
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
        new_chunks = [
            apply(
                a.chunk(a_offset[0])[a_offset[1]:a_offset[1] + a_offset[2]],
                b.chunk(b_offset[0])[b_offset[1]:b_offset[1] + b_offset[2]],
            )
            for a_offset, b_offset in zip(in_a_offsets, in_b_offsets)
        ]
    elif a_is_chunked:
        # Slice the contiguous right operand to line up with each left chunk.
        offsets = _calculate_chunk_offsets(a)
        new_chunks = [
            apply(chunk, b[offset:offset + len(chunk)])
            for chunk, offset in zip(a.iterchunks(), offsets)
        ]
    else:
        # Slice the contiguous left operand to line up with each right chunk.
        offsets = _calculate_chunk_offsets(b)
        new_chunks = [
            apply(a[offset:offset + len(chunk)], chunk)
            for chunk, offset in zip(b.iterchunks(), offsets)
        ]
    return pa.chunked_array(new_chunks)