def test_calculate_chunk_offsets():
    """Chunk offsets are the cumulative start index of each chunk."""
    cases = [
        ([[1, 1, 1]], [0]),
        ([[1], [1, 1]], [0, 1]),
        ([[1, 1], [1]], [0, 2]),
    ]
    for chunks, expected in cases:
        chunked = pa.chunked_array(chunks)
        npt.assert_array_equal(
            _calculate_chunk_offsets(chunked), np.array(expected)
        )
def _text_cat_chunked_mixed(a: pa.ChunkedArray, b: pa.Array) -> pa.ChunkedArray:
    """Concatenate a chunked string column with a contiguous one, chunk-wise.

    ``b`` is sliced along the chunk boundaries of ``a`` so each chunk of the
    result is produced by a single ``_text_cat`` call.
    """
    starts = _calculate_chunk_offsets(a)
    pieces = [
        _text_cat(chunk, b[start:start + len(chunk)])
        for chunk, start in zip(a.iterchunks(), starts)
    ]
    return pa.chunked_array(pieces)
def test_in_chunk_offsets(data: List[List[int]]):
    """The chunk boundaries themselves must always form a valid selection."""
    chunked = pa.chunked_array(data, type=pa.int64())
    boundaries = list(_calculate_chunk_offsets(chunked))
    selection = _in_chunk_offsets(chunked, boundaries)
    check_valid_in_offsets(chunked, selection)
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Concatenate a contiguous string column with a chunked one, chunk-wise.

    Mirror of ``_text_cat_chunked_mixed``: here ``a`` is sliced along the
    chunk boundaries of ``b``.
    """
    starts = _calculate_chunk_offsets(b)
    pieces = [
        _text_cat(a[start:start + len(chunk)], chunk)
        for chunk, start in zip(b.iterchunks(), starts)
    ]
    return pa.chunked_array(pieces)
def apply_binary_str(
    a: Union[pa.Array, pa.ChunkedArray],
    b: Union[pa.Array, pa.ChunkedArray],
    *,
    func: Callable,
    output_dtype,
    parallel: bool = False,
):
    """
    Apply an element-wise numba-jitted function on two Arrow columns.

    The supplied function must return a numpy-compatible scalar.
    Handling of missing data and chunking of the inputs is done automatically.

    Parameters
    ----------
    a, b
        Input columns of equal length; any combination of ``pa.Array`` and
        ``pa.ChunkedArray`` is accepted.
    func
        Numba-jitted scalar function applied element-wise.
    output_dtype
        Dtype passed through to ``_apply_binary_str_array`` for the result.
    parallel
        Whether the jitted kernel should run in parallel.

    Returns
    -------
    pa.Array or pa.ChunkedArray
        Chunked whenever either input is chunked.

    Raises
    ------
    ValueError
        If the inputs differ in length or either operand has an
        unsupported type.
    """
    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")

    if isinstance(a, pa.ChunkedArray):
        if isinstance(b, pa.ChunkedArray):
            # Re-chunk both inputs on the union of their chunk boundaries so
            # each kernel call sees two equally-sized contiguous slices.
            in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
            new_chunks: List[pa.Array] = []
            for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
                a_slice = a.chunk(a_offset[0])[a_offset[1]:a_offset[1] + a_offset[2]]
                b_slice = b.chunk(b_offset[0])[b_offset[1]:b_offset[1] + b_offset[2]]
                new_chunks.append(
                    _apply_binary_str_array(
                        a_slice,
                        b_slice,
                        func=func,
                        output_dtype=output_dtype,
                        parallel=parallel,
                    ))
            return pa.chunked_array(new_chunks)
        elif isinstance(b, pa.Array):
            # Slice the contiguous operand along the chunk boundaries of `a`.
            new_chunks = []
            offsets = _calculate_chunk_offsets(a)
            for chunk, offset in zip(a.iterchunks(), offsets):
                new_chunks.append(
                    _apply_binary_str_array(
                        chunk,
                        b[offset:offset + len(chunk)],
                        func=func,
                        output_dtype=output_dtype,
                        parallel=parallel,
                    ))
            return pa.chunked_array(new_chunks)
        else:
            # BUG FIX: `b` is the right operand; the message previously said
            # "left operand" and misattributed the error.
            raise ValueError(f"right operand has unsupported type {type(b)}")
    elif isinstance(a, pa.Array):
        if isinstance(b, pa.ChunkedArray):
            # Slice the contiguous operand along the chunk boundaries of `b`.
            new_chunks = []
            offsets = _calculate_chunk_offsets(b)
            for chunk, offset in zip(b.iterchunks(), offsets):
                new_chunks.append(
                    _apply_binary_str_array(
                        a[offset:offset + len(chunk)],
                        chunk,
                        func=func,
                        output_dtype=output_dtype,
                        parallel=parallel,
                    ))
            return pa.chunked_array(new_chunks)
        elif isinstance(b, pa.Array):
            return _apply_binary_str_array(a, b, func=func, output_dtype=output_dtype, parallel=parallel)
        else:
            # BUG FIX: as above, this branch rejects the right operand.
            raise ValueError(f"right operand has unsupported type {type(b)}")
    else:
        raise ValueError(f"left operand has unsupported type {type(a)}")