def _text_contains_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
) -> np.ndarray:
    """Check for each row whether it contains ``pat`` (case sensitive).

    Parameters
    ----------
    length : number of rows in the array chunk.
    valid_bits : Arrow validity bitmap bytes; size 0 means "no nulls".
    valid_offset : bit offset into ``valid_bits`` for the first row.
    offsets : per-row start/end offsets into ``data`` (``length + 1`` entries).
    data : UTF-8 bytes of all rows, concatenated.
    pat : pattern to search for.

    Returns
    -------
    np.ndarray
        Bit-packed boolean mask with one bit per row (LSB-first within each
        byte). Bits of null rows are undefined, matching Arrow semantics.
    """
    failure_function = compute_kmp_failure_function(pat)

    # Initialise boolean (bit-packed) output array.
    output_size = length // 8
    if length % 8 > 0:
        output_size += 1
    output = np.empty(output_size, dtype=np.uint8)
    if length % 8 > 0:
        # Zero trailing bits of the last byte so unused bits are defined.
        output[-1] = 0

    has_nulls = valid_bits.size > 0
    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits, valid_offset):
            # Null row: leave its bit untouched (undefined under the mask).
            continue

        matched_len = 0
        # An empty pattern matches every string, including the empty one.
        # Seeding with `len(pat) == 0` fixes the case of an empty-string row,
        # for which the character loop below never runs and `contains` would
        # otherwise incorrectly stay False.
        contains = len(pat) == 0
        for str_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            if matched_len == len(pat):
                contains = True
                break
            # Manually inlined utils.kmp.append_to_kmp_matching for
            # performance
            while matched_len > -1 and pat[matched_len] != data[str_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1
        if matched_len == len(pat):
            # Match ending exactly at the end of the row.
            contains = True

        # Write out the result into the bit-mask.
        byte_offset_result = row_idx // 8
        bit_offset_result = row_idx % 8
        mask_result = np.uint8(1 << bit_offset_result)
        current = output[byte_offset_result]
        if contains:
            # Read-modify-write: other rows' bits in this byte must survive.
            output[byte_offset_result] = current | mask_result
        else:
            # Explicitly clear, since the buffer starts uninitialised.
            output[byte_offset_result] = current & ~mask_result
    return output
def _text_count_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
) -> np.ndarray:
    """Count non-overlapping occurrences of ``pat`` per row (case sensitive).

    Parameters
    ----------
    length : number of rows in the array chunk.
    valid_bits : Arrow validity bitmap bytes; size 0 means "no nulls".
    valid_offset : bit offset into ``valid_bits`` for the first row.
    offsets : per-row start/end offsets into ``data`` (``length + 1`` entries).
    data : UTF-8 bytes of all rows, concatenated.
    pat : pattern to count.

    Returns
    -------
    np.ndarray
        int64 count per row; entries of null rows are undefined.
    """
    failure_function = compute_kmp_failure_function(pat)
    counts = np.empty(length, dtype=np.int64)
    has_nulls = valid_bits.size > 0

    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits, valid_offset):
            # Null row: leave the slot untouched; callers mask it out.
            continue

        counts[row_idx] = 0
        row_start = offsets[row_idx]
        row_end = offsets[row_idx + 1]

        if len(pat) == 0:
            # Mirror Python's str.count: the empty pattern occurs
            # len(s) + 1 times (before every character and at the end).
            counts[row_idx] = row_end - row_start + 1
            continue

        state = 0
        for pos in range(row_start, row_end):
            # Manually inlined utils.kmp.append_to_kmp_matching for
            # performance
            while state > -1 and pat[state] != data[pos]:
                state = failure_function[state]
            state = state + 1
            if state == len(pat):
                counts[row_idx] += 1
                # Restart from zero so overlapping matches are not counted,
                # matching the behavior of Python's builtin `count`.
                state = 0
    return counts
def _text_replace_case_sensitive_numba(
    length: int,
    valid_bits: np.ndarray,
    valid_offset: int,
    offsets: np.ndarray,
    data: np.ndarray,
    pat: bytes,
    repl: bytes,
    max_repl: int,
) -> Tuple[np.ndarray, np.ndarray]:
    """Replace up to ``max_repl`` occurrences of ``pat`` with ``repl`` per row.

    Two passes over the data: the first sizes the output buffer by counting
    capped, non-overlapping matches per row; the second writes the replaced
    strings. A negative ``max_repl`` (e.g. -1) replaces all occurrences.

    NOTE(review): this assumes ``pat`` is non-empty — ``pat[matched_len]``
    would index out of range for an empty pattern; verify that callers guard
    against ``pat == b""``.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        (int32 output offsets of size ``length + 1``, uint8 output buffer).
        Null rows get zero-length entries in the offsets.
    """
    failure_function = compute_kmp_failure_function(pat)

    # First pass: compute output buffer offsets.
    output_offsets = np.empty(length + 1, dtype=np.int32)
    cumulative_offset = 0
    has_nulls = valid_bits.size > 0
    match_len_change = len(repl) - len(pat)
    for row_idx in range(length):
        output_offsets[row_idx] = cumulative_offset
        if has_nulls and not _check_valid_row(row_idx, valid_bits, valid_offset):
            # Null row: zero-length entry.
            continue
        row_len = offsets[row_idx + 1] - offsets[row_idx]
        cumulative_offset += row_len
        matched_len = 0
        matches_done = 0
        for str_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            # Manually inlined utils.kmp.append_to_kmp_matching for
            # performance
            while matched_len > -1 and pat[matched_len] != data[str_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1
            if matched_len == len(pat):
                # `matched_len=0` ensures overlapping matches are not counted.
                matched_len = 0
                # Check the cap *before* counting so that max_repl == 0 sizes
                # the buffer for zero replacements; the former count-then-check
                # order counted every match when max_repl == 0, while the
                # second pass would replace none, desynchronising offsets and
                # buffer contents.
                if matches_done == max_repl:
                    break
                matches_done += 1
                if matches_done == max_repl:
                    break
        cumulative_offset += match_len_change * matches_done
    output_offsets[length] = cumulative_offset

    # Second pass: write the replaced strings into the sized buffer.
    output_buffer = np.empty(cumulative_offset, dtype=np.uint8)
    output_pos = 0
    for row_idx in range(length):
        if has_nulls and not _check_valid_row(row_idx, valid_bits, valid_offset):
            continue
        matched_len = 0
        matches_done = 0
        # write_idx lags read_idx so bytes are only copied once we know they
        # are not part of a replaced occurrence.
        write_idx = offsets[row_idx]
        for read_idx in range(offsets[row_idx], offsets[row_idx + 1]):
            # A modified version of utils.kmp.append_to_kmp_matching
            while matched_len > -1 and pat[matched_len] != data[read_idx]:
                matched_len = failure_function[matched_len]
            matched_len = matched_len + 1
            if read_idx - write_idx == len(pat):
                # The lagging window is full and its oldest byte cannot be
                # part of a match anymore: flush it to the output.
                output_buffer[output_pos] = data[write_idx]
                output_pos += 1
                write_idx += 1
            if matched_len == len(pat):
                matched_len = 0
                if matches_done != max_repl:
                    matches_done += 1
                    # Skip the matched bytes and emit the replacement.
                    write_idx = read_idx + 1
                    for char in repl:
                        output_buffer[output_pos] = char
                        output_pos += 1
        # Copy the tail of the row that was never part of a match.
        while write_idx < offsets[row_idx + 1]:
            output_buffer[output_pos] = data[write_idx]
            output_pos += 1
            write_idx += 1
    return output_offsets, output_buffer