Exemplo n.º 1
0
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Return a boolean array telling, per element of ``data``, if it contains ``pat``.

    The pattern is encoded to bytes and the search is delegated to
    ``_text_contains_case_sensitive_numba``, which is handed the raw
    offsets/data buffers of ``data``. Matching is a plain byte-by-byte
    comparison and does not depend on any locale or encoding.

    The result carries over the null count of ``data``; when ``data`` has
    nulls its validity bitmap is reused for the result.
    """
    # Convert to UTF-8 bytes
    needle: bytes = pat.encode()

    offsets_view, bytes_view = _extract_string_buffers(data)
    n_rows = len(data)
    has_nulls = data.null_count != 0

    # Without nulls an empty array is passed in place of the validity bitmap.
    validity_view = (
        _buffer_to_view(data.buffers()[0])
        if has_nulls
        else np.empty(0, dtype=np.uint8)
    )

    matches = _text_contains_case_sensitive_numba(
        n_rows, validity_view, data.offset, offsets_view, bytes_view, needle
    )

    result_validity = None
    if has_nulls:
        # Drop the whole bytes that precede the array's offset; a remaining
        # sub-byte offset is handled by ``shift_unaligned_bitmap``.
        skipped_bytes, bit_offset = divmod(data.offset, 8)
        result_validity = data.buffers()[0].slice(skipped_bytes)
        if bit_offset != 0:
            result_validity = shift_unaligned_bitmap(
                result_validity, bit_offset, n_rows
            )

    return pa.Array.from_buffers(
        pa.bool_(),
        n_rows,
        [result_validity, pa.py_buffer(matches)],
        data.null_count,
    )
Exemplo n.º 2
0
def _slice_handle_chunk(pa_arr, start, end, step):
    """
    Slice every string of ``pa_arr`` according to ``(start, end, step)``.

    Kernel selection:

    * ``start`` and ``end`` non-negative (``end`` may be ``None``) and
      ``step == 1``: ``_slice_pos_inputs_nostep``
    * same bounds with ``step > 1``: ``_slice_pos_inputs_step``
    * anything else: ``_slice_generic``

    The kernel result is passed through ``finalize_string_array`` with
    ``pa.string()`` and returned.

    Raises
    ------
    ValueError
        If ``step`` is zero.
    """
    offsets, data = _extract_string_buffers(pa_arr)
    valid = _buffer_to_view(pa_arr.buffers()[0])
    if step == 0:
        raise ValueError("step cannot be zero.")

    # Arguments shared by all three kernels; only the step handling differs.
    shared_args = (offsets, data, valid, pa_arr.offset, start, end)

    forward_only = start >= 0 and (end is None or end >= 0) and step >= 1
    if not forward_only:
        sliced = _slice_generic(*shared_args, step)
    elif step == 1:
        sliced = _slice_pos_inputs_nostep(*shared_args)
    else:
        sliced = _slice_pos_inputs_step(*shared_args, step)

    return finalize_string_array(sliced, pa.string())
Exemplo n.º 3
0
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Return a boolean array telling, per element of ``data``, if it contains ``pat``.

    The pattern is encoded to bytes and the search is delegated to
    ``_text_contains_case_sensitive_nonnull`` or, when ``data`` has nulls,
    to ``_text_contains_case_sensitive_nulls``. Both write into a
    preallocated bit-packed output buffer. Matching is a plain byte-by-byte
    comparison and does not depend on any locale or encoding.

    The result carries over the null count of ``data``; when ``data`` has
    nulls its validity bitmap is reused for the result.
    """
    # Convert to UTF-8 bytes
    needle: bytes = pat.encode()

    # Bit-packed boolean output: one bit per row, rounded up to whole bytes.
    n_rows = len(data)
    partial_last_byte = n_rows % 8 > 0
    result_bits = np.empty((n_rows + 7) // 8, dtype=np.uint8)
    if partial_last_byte:
        # ``np.empty`` does not initialise memory, so clear the final,
        # only partially used byte up front.
        result_bits[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count != 0:
        bitmap = data.buffers()[0]
        _text_contains_case_sensitive_nulls(
            n_rows,
            _buffer_to_view(bitmap),
            data.offset,
            offsets,
            data_buffer,
            needle,
            result_bits,
        )
        # Drop the whole bytes that precede the array's offset; a remaining
        # sub-byte offset is handled by ``shift_unaligned_bitmap``.
        skipped_bytes, bit_offset = divmod(data.offset, 8)
        result_validity = bitmap.slice(skipped_bytes)
        if bit_offset != 0:
            result_validity = shift_unaligned_bitmap(
                result_validity, bit_offset, n_rows
            )
    else:
        result_validity = None
        _text_contains_case_sensitive_nonnull(
            n_rows, offsets, data_buffer, needle, result_bits
        )

    result_buffers = [result_validity, pa.py_buffer(result_bits)]
    return pa.Array.from_buffers(
        pa.bool_(), n_rows, result_buffers, data.null_count
    )
Exemplo n.º 4
0
def _text_replace_case_sensitive(data: pa.Array, pat: str, repl: str,
                                 max_repl: int) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in every element of ``data``.

    For every row, only the first ``max_repl`` replacements are performed;
    ``max_repl = -1`` means there is no limit on the number of replacements.
    An empty ``pat`` is routed to
    ``_text_replace_case_sensitive_empty_pattern``, every other pattern to
    ``_text_replace_case_sensitive_numba``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings. The result is a string array that carries
    over the null count of ``data``; when ``data`` has nulls its validity
    bitmap is reused for the result.
    """
    # Convert to UTF-8 bytes
    needle: bytes = pat.encode()
    replacement: bytes = repl.encode()

    offsets_view, bytes_view = _extract_string_buffers(data)
    n_rows = len(data)
    has_nulls = data.null_count != 0

    # Without nulls an empty array is passed in place of the validity bitmap.
    validity_view = (
        _buffer_to_view(data.buffers()[0])
        if has_nulls
        else np.empty(0, dtype=np.uint8)
    )

    if pat:
        replaced = _text_replace_case_sensitive_numba(
            n_rows, validity_view, data.offset, offsets_view, bytes_view,
            needle, replacement, max_repl,
        )
    else:
        replaced = _text_replace_case_sensitive_empty_pattern(
            n_rows, validity_view, data.offset, offsets_view, bytes_view,
            replacement, max_repl,
        )
    new_offsets, new_bytes = replaced

    result_validity = None
    if has_nulls:
        # Drop the whole bytes that precede the array's offset; a remaining
        # sub-byte offset is handled by ``shift_unaligned_bitmap``.
        skipped_bytes, bit_offset = divmod(data.offset, 8)
        result_validity = data.buffers()[0].slice(skipped_bytes)
        if bit_offset != 0:
            result_validity = shift_unaligned_bitmap(
                result_validity, bit_offset, n_rows
            )

    return pa.Array.from_buffers(
        pa.string(),
        n_rows,
        [result_validity, pa.py_buffer(new_offsets), pa.py_buffer(new_bytes)],
        data.null_count,
    )