def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # The kernel operates on raw UTF-8 bytes, not Python str objects.
    needle = pat.encode()
    offsets_buffer, data_buffer = _extract_string_buffers(data)

    has_nulls = data.null_count != 0
    if has_nulls:
        valid_buffer = _buffer_to_view(data.buffers()[0])
    else:
        # The kernel still expects an array argument, so hand it an empty
        # one when there is no validity bitmap.
        valid_buffer = np.empty(0, dtype=np.uint8)

    output = _text_contains_case_sensitive_numba(
        len(data), valid_buffer, data.offset, offsets_buffer, data_buffer, needle
    )

    output_valid = None
    if has_nulls:
        # Reuse the input's validity bitmap; realign it when the array does
        # not start on a byte boundary.
        output_valid = data.buffers()[0].slice(data.offset // 8)
        shift = data.offset % 8
        if shift != 0:
            output_valid = shift_unaligned_bitmap(output_valid, shift, len(data))

    return pa.Array.from_buffers(
        pa.bool_(), len(data), [output_valid, pa.py_buffer(output)], data.null_count
    )
def _slice_handle_chunk(pa_arr, start, end, step):
    """
    Slice each string according to the (start, end, step) inputs.

    Parameters
    ----------
    pa_arr : pa.Array
        String array whose elements are sliced.
    start : int
        Start position of the slice.
    end : int or None
        End position of the slice; ``None`` means "until the end".
    step : int
        Step of the slice; must be non-zero.

    Returns
    -------
    pa.Array
        New string array holding the sliced values.

    Raises
    ------
    ValueError
        If ``step`` is zero.
    """
    # Validate before extracting any buffers so we fail fast on bad input.
    if step == 0:
        raise ValueError("step cannot be zero.")

    offsets, data = _extract_string_buffers(pa_arr)
    valid = _buffer_to_view(pa_arr.buffers()[0])

    if start >= 0 and (end is None or end >= 0) and step >= 1:
        # Fast paths for non-negative bounds and a forward step.
        if step == 1:
            res = _slice_pos_inputs_nostep(
                offsets, data, valid, pa_arr.offset, start, end
            )
        else:
            res = _slice_pos_inputs_step(
                offsets, data, valid, pa_arr.offset, start, end, step
            )
    else:
        # Negative indices or a negative step take the generic implementation.
        res = _slice_generic(offsets, data, valid, pa_arr.offset, start, end, step)

    return finalize_string_array(res, pa.string())
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # The kernels operate on raw UTF-8 bytes, not Python str objects.
    needle = pat.encode()

    # Allocate the bit-packed boolean output: one bit per row, rounded up to
    # whole bytes.
    n = len(data)
    n_bytes, remainder = divmod(n, 8)
    if remainder > 0:
        n_bytes += 1
    output = np.empty(n_bytes, dtype=np.uint8)
    if remainder > 0:
        # The trailing padding bits must not contain garbage.
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(n, offsets, data_buffer, needle, output)
    else:
        valid = _buffer_to_view(data.buffers()[0])
        _text_contains_case_sensitive_nulls(
            n, valid, data.offset, offsets, data_buffer, needle, output
        )
        # Reuse the input's validity bitmap; realign it when the array does
        # not start on a byte boundary.
        valid_buffer = data.buffers()[0].slice(data.offset // 8)
        shift = data.offset % 8
        if shift != 0:
            valid_buffer = shift_unaligned_bitmap(valid_buffer, shift, n)

    return pa.Array.from_buffers(
        pa.bool_(), n, [valid_buffer, pa.py_buffer(output)], data.null_count
    )
def _text_replace_case_sensitive(
    data: pa.Array, pat: str, repl: str, max_repl: int
) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in the Series/Index with some
    other string. For every row, only the first ``max_repl`` replacements will
    be performed. If ``max_repl = -1`` we consider that we have no limit for
    the number of replacements.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # The kernels operate on raw UTF-8 bytes, not Python str objects.
    needle = pat.encode()
    replacement = repl.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)

    has_nulls = data.null_count != 0
    if has_nulls:
        valid_buffer = _buffer_to_view(data.buffers()[0])
    else:
        # The kernels still expect an array argument, so hand them an empty
        # one when there is no validity bitmap.
        valid_buffer = np.empty(0, dtype=np.uint8)

    if len(pat) > 0:
        output_offsets, output_buffer = _text_replace_case_sensitive_numba(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            needle,
            replacement,
            max_repl,
        )
    else:
        # An empty pattern is handled by a dedicated kernel.
        output_offsets, output_buffer = _text_replace_case_sensitive_empty_pattern(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            replacement,
            max_repl,
        )

    output_valid = None
    if has_nulls:
        # Reuse the input's validity bitmap; realign it when the array does
        # not start on a byte boundary.
        output_valid = data.buffers()[0].slice(data.offset // 8)
        shift = data.offset % 8
        if shift != 0:
            output_valid = shift_unaligned_bitmap(output_valid, shift, len(data))

    buffers = [output_valid, pa.py_buffer(output_offsets), pa.py_buffer(output_buffer)]
    return pa.Array.from_buffers(pa.string(), len(data), buffers, data.null_count)