Exemplo n.º 1
0
def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """
    Return the values of a primitive pyarrow.Array as a NumPy array.

    The array's second buffer (index 1) is reinterpreted with the pandas
    dtype of the Arrow type and cut down to the ``offset``/``len`` window of
    ``array``. Boolean arrays are bit-packed, so their bits are first
    expanded (least-significant bit first) into one element per value.
    """
    np_dtype = array.type.to_pandas_dtype()
    first = array.offset
    stop = first + len(array)
    # presumably a NumPy view over the Arrow buffer; confirm in _buffer_to_view
    raw = _buffer_to_view(array.buffers()[1])

    if not pa.types.is_boolean(array.type):
        return raw.view(np_dtype)[first:stop]

    # One bit per value: unpack to one uint8 per bit, then window and cast.
    bits = np.unpackbits(raw.view(np.uint8), bitorder="little")
    return bits[first:stop].astype(bool)
Exemplo n.º 2
0
 def from_arrow(cls, data: pa.Array):
     """
     Build a column of this class from a pyarrow decimal array.

     The Arrow data buffer is read as a flat sequence of int64 words and
     every second word (starting with the first) is kept, i.e. one 64-bit
     word per 128-bit Arrow value. The validity buffer, when present, is
     converted with ``pa_mask_buffer_to_mask``.

     NOTE(review): ``data.offset`` is not consulted, so this presumably
     expects an unsliced array -- confirm against callers.
     """
     dtype = Decimal64Dtype.from_arrow(data.type)

     validity = data.buffers()[0]
     if validity is None:
         mask = None
     else:
         mask = pa_mask_buffer_to_mask(validity, len(data))

     # Two int64 words per Arrow value; keep only the first of each pair.
     words = cp.array(np.frombuffer(data.buffers()[1], dtype="int64"))
     kept_words = words[::2].copy()

     return cls(
         data=Buffer(kept_words.view("uint8")),
         size=len(data),
         dtype=dtype,
         mask=mask,
     )
Exemplo n.º 3
0
    def _get_data_buffer(
        self, arr: pa.Array
    ) -> Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]:
        """
        Wrap the data buffer of a column into a protocol buffer.

        Parameters
        ----------
        arr : pa.Array
            PyArrow array holding column's data.

        Returns
        -------
        tuple
            Pair of ``OmnisciProtocolBuffer`` and the protocol dtype tuple
            describing the data stored in that buffer.
        """
        kind = self.dtype[0]
        if kind == DTypeKind.CATEGORICAL:
            # Dictionary-encoded column: expose the category codes.
            arr = arr.indices

        arrow_type = self._dtype_from_pyarrow(arr.type)

        if kind == DTypeKind.STRING:
            # String buffers are not chunked (that would require rewriting
            # the offset values), so the whole buffer is handed out.
            buff_size = None
        else:
            buff_size = self._get_buffer_size(bit_width=arrow_type[1])

        # Per the Arrow memory layout the data buffer is the last entry of
        # `.buffers()`:
        # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
        data_buffer = arr.buffers()[-1]
        return OmnisciProtocolBuffer(data_buffer, buff_size), arrow_type
Exemplo n.º 4
0
    def _get_validity_buffer(
        self, arr: pa.Array
    ) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str,
                                                     str]]]:
        """
        Wrap the validity bitmap of a column into a protocol buffer.

        Parameters
        ----------
        arr : pa.Array
            PyArrow array holding column's data.

        Returns
        -------
        tuple or None
            Pair of ``OmnisciProtocolBuffer`` and the protocol dtype tuple of
            the bitmap, or None when the array carries no validity buffer.
        """
        # Per the Arrow memory layout the validity bitmap is the first entry
        # of `.buffers()`:
        # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
        bitmask = arr.buffers()[0]
        if bitmask is not None:
            # A validity buffer, when present, stores one bit per value.
            bitmask_size = self._get_buffer_size(bit_width=1)
            bitmask_dtype = (
                DTypeKind.BOOL,
                1,
                ArrowCTypes.BOOL,
                Endianness.NATIVE,
            )
            return OmnisciProtocolBuffer(bitmask, bitmask_size), bitmask_dtype
        return None
Exemplo n.º 5
0
def _text_strip(data: pa.Array, to_strip) -> pa.Array:
    """
    Remove the characters in ``to_strip`` from both ends of every element.

    An empty input is returned unchanged. Otherwise the work is delegated to
    ``_do_strip``, which writes into a ``StringArrayBuilder`` that is then
    finalized as a ``pa.string()`` array.
    """
    n_rows = len(data)
    if n_rows == 0:
        return data

    offsets, data_buffer = _extract_string_buffers(data)

    # Builder is sized by the larger of byte count and row count.
    builder = StringArrayBuilder(max(len(data_buffer), n_rows))

    _do_strip(
        data.buffers()[0],  # validity bitmap (may be None)
        data.offset,
        offsets,
        data_buffer,
        n_rows,
        to_strip,
        inout_builder=builder,
    )

    return finalize_string_array(builder, pa.string())
Exemplo n.º 6
0
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """
    Perform ``pyarrow.Array | pyarrow.Array`` on two boolean arrays.

    The result has ``len(a)`` elements. Four cases are handled:

    * neither input has nulls: plain bitwise OR of the data bitmaps;
    * only ``b`` has nulls: data is the OR of both data bitmaps, and a slot
      is valid when ``a`` is True or ``b`` is valid;
    * only ``a`` has nulls: the operands are swapped and the previous case
      is reused;
    * both have nulls: ``masked_bitmap_or_unaligned`` fills the data and
      validity bitmaps and reports the resulting null count.
    """
    length = len(a)
    # Number of bytes needed to hold ``length`` bits.
    output_length = (length + 7) // 8

    a_has_nulls = a.null_count != 0
    b_has_nulls = b.null_count != 0

    if not a_has_nulls and not b_has_nulls:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(length,
                            a.buffers()[1], a.offset,
                            b.buffers()[1], b.offset, result)
        return pa.Array.from_buffers(pa.bool_(), length,
                                     [None, pa.py_buffer(result)], 0)

    if not a_has_nulls:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(length,
                            a.buffers()[1], a.offset,
                            b.buffers()[1], b.offset, result)
        # b has nulls: every b(None) paired with a(True) is still True, so
        # valid_bits = a.data or b.valid_bits.
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(length,
                            a.buffers()[1], a.offset,
                            b.buffers()[0], b.offset, valid_bits)
        return pa.Array.from_buffers(
            pa.bool_(), length,
            [pa.py_buffer(valid_bits),
             pa.py_buffer(result)])

    if not b_has_nulls:
        # OR is symmetric: reuse the "only the second operand has nulls" path.
        return or_array_array(b, a)

    result = np.zeros(output_length, dtype=np.uint8)
    valid_bits = np.zeros(output_length, dtype=np.uint8)
    null_count = masked_bitmap_or_unaligned(
        length,
        a.buffers()[0],
        a.buffers()[1],
        a.offset,
        b.buffers()[0],
        b.buffers()[1],
        b.offset,
        result,
        valid_bits,
    )
    return pa.Array.from_buffers(
        pa.bool_(),
        length,
        [pa.py_buffer(valid_bits),
         pa.py_buffer(result)],
        null_count,
    )
Exemplo n.º 7
0
def _extract_isnull_bitmap(arr: pa.Array, offset: int, length: int):
    """
    Return a ``length``-element window of the array's first buffer.

    The window starts at ``offset`` within the view produced by
    ``_buffer_to_view``. When that view is empty (pyarrow supplied no
    bitmap), a ``uint8`` array of ``length`` elements, all set to 255, is
    returned instead so callers always get a filled bitmap.
    """
    bitmap = _buffer_to_view(arr.buffers()[0])
    if len(bitmap) == 0:
        # No bitmap stored: synthesize one with every bit set.
        return np.full(length, fill_value=255, dtype=np.uint8)
    return bitmap[offset:offset + length]
Exemplo n.º 8
0
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Test every element of ``data`` for the presence of ``pat``.

    The pattern is encoded to UTF-8 and matched as raw bytes, so the result
    does not depend on locales or encodings. The outcome is a boolean
    pyarrow array of the same length that reuses the input's null count.
    """
    n_rows = len(data)
    pat_bytes: bytes = pat.encode()

    # Bit-packed boolean output: one bit per row, rounded up to whole bytes.
    full_bytes, leftover_bits = divmod(n_rows, 8)
    output = np.empty(full_bytes + (1 if leftover_bits else 0), dtype=np.uint8)
    if leftover_bits:
        # The final byte is only partly used; clear it so padding bits are 0.
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count != 0:
        validity = data.buffers()[0]
        _text_contains_case_sensitive_nulls(
            n_rows,
            _buffer_to_view(validity),
            data.offset,
            offsets,
            data_buffer,
            pat_bytes,
            output,
        )
        # The output starts at bit 0, so re-base the validity bitmap: drop
        # whole leading bytes, then shift away any remaining bit offset.
        valid_buffer = validity.slice(data.offset // 8)
        bit_shift = data.offset % 8
        if bit_shift != 0:
            valid_buffer = shift_unaligned_bitmap(valid_buffer, bit_shift, n_rows)
    else:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            n_rows, offsets, data_buffer, pat_bytes, output
        )

    result_buffers = [valid_buffer, pa.py_buffer(output)]
    return pa.Array.from_buffers(
        pa.bool_(), n_rows, result_buffers, data.null_count
    )
Exemplo n.º 9
0
def or_na(arr: pa.Array) -> pa.Array:
    """
    Compute ``array | NA`` for a boolean pyarrow.Array.

    Without nulls in the input, the data bitmap doubles as the validity
    bitmap of the result (True stays True, False becomes null), so no new
    buffer is allocated. With nulls, ``_or_na`` fills a fresh bitmap that is
    used for both validity and data.
    """
    n_rows = len(arr)

    if arr.null_count != 0:
        # One bit per row, rounded up to whole bytes.
        output = np.zeros((n_rows + 7) // 8, dtype=np.uint8)
        null_count = _or_na(n_rows, arr.offset,
                            arr.buffers()[0],
                            arr.buffers()[1], output)
        shared = pa.py_buffer(output)
        return pa.Array.from_buffers(pa.bool_(), n_rows, [shared, shared],
                                     null_count)

    values = arr.buffers()[1]
    return pa.Array.from_buffers(
        pa.bool_(),
        n_rows,
        [values, values],
        null_count=-1,  # let pyarrow compute the count lazily
        offset=arr.offset,
    )
Exemplo n.º 10
0
 def schedule(self, array: pyarrow.Array, info: ArrayInfo):
     """
     Register the buffers and metadata of a childless Arrow array.

     A ``"__arrow_array__"`` child is created under ``info``; each Arrow
     buffer is added to it as an mmap-compressed buffer child and moved to
     ``self.directory``, followed by a JSON ``"metadata"`` child holding the
     type description and the buffer field order. Returns ``info``.

     Raises ``NotImplementedError`` for types that have children or whose
     string form is ``"directory"``.
     """
     arrow_type = array.type
     if arrow_type.num_children != 0 or str(arrow_type) == "directory":
         raise NotImplementedError()

     # Primitive layout: null bitmap plus data.
     array_meta = info.new_child("__arrow_array__")
     field_orders = self.field_order[arrow_type.num_buffers]

     # NOTE(review): the field name from `field_orders` is not used; every
     # buffer is stored under the child name "values". Confirm this is
     # intended and not meant to be the per-buffer name.
     for _name, buffer in zip(field_orders, array.buffers()):
         buffer_child = array_meta.new_child("values",
                                             backend="buffer",
                                             compression="mmap",
                                             data=buffer)
         buffer_child.move_to(self.directory)

     metadata = {
         **arrow_type_to_dict(arrow_type),  # type, args
         "field_order": field_orders,
     }
     array_meta.new_child("metadata", backend="json",
                          data=metadata).move_to(self.directory)
     return info
Exemplo n.º 11
0
def all_true_like(arr: pa.Array) -> pa.Array:
    """
    Build an all-True boolean array shaped like ``arr``.

    The result has the same length, null count and validity bitmap as the
    input. Whole leading bytes of the validity bitmap are sliced away and the
    remaining sub-byte bit offset is carried over as the result's offset.
    """
    skipped_bytes, bit_offset = divmod(arr.offset, 8)

    valid_buffer = arr.buffers()[0]
    if valid_buffer:
        valid_buffer = valid_buffer.slice(skipped_bytes)

    # The data bitmap must cover the leading offset bits plus all values.
    n_bits = len(arr) + bit_offset
    all_ones = np.full((n_bits + 7) // 8, fill_value=255, dtype=np.uint8)

    return pa.Array.from_buffers(
        pa.bool_(),
        len(arr),
        [valid_buffer, pa.py_buffer(all_ones)],
        arr.null_count,
        bit_offset,
    )
Exemplo n.º 12
0
    def _get_offsets_buffer(
        self, arr: pa.Array
    ) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str,
                                                     str]]]:
        """
        Wrap the offsets buffer of a column into a protocol buffer.

        Parameters
        ----------
        arr : pa.Array
            PyArrow array holding column's data.

        Returns
        -------
        tuple or None
            Pair of ``OmnisciProtocolBuffer`` and the protocol dtype tuple of
            the offsets, or None when the array has fewer than three buffers.
        """
        buffs = arr.buffers()
        # Per the Arrow memory layout the offsets buffer, if any, sits at
        # index 1. Only Primitive, Variable-length binary and Dict-encoded
        # layouts are supported, and of those only the variable-length one
        # has three buffers, so fewer than three means "no offsets".
        # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
        if len(buffs) >= 3:
            # Arrow's data layout stores these offsets as "int32".
            dtype = self._dtype_from_primitive_numpy(np.dtype("int32"))
            offsets_size = self._get_buffer_size(bit_width=dtype[1],
                                                 is_offset_buffer=True)
            return OmnisciProtocolBuffer(buffs[1], offsets_size), dtype
        return None
Exemplo n.º 13
0
def format_timestamp_array(arr: pa.Array) -> pa.Array:
    """Build a PyArrow utf8 array from a timestamp array.

    The output Array will have the same length as the input.

    The output Array will consume RAM using two new, contiguous buffers
    (offsets and UTF-8 data); the validity bitmap of the input is reused.

    The format will be ISO8601, as precise as needed.

    Raises ``NotImplementedError`` when the timestamp unit is not ``"ns"``.

    NOTE(review): ``arr.offset`` is not consulted here, so this presumably
    expects an unsliced array -- confirm against callers.
    """
    valid_buf, num_buf = arr.buffers()
    if arr.type.unit != "ns":
        raise NotImplementedError("TODO handle non-ns")  # pragma: no cover

    # "q" is always an 8-byte signed integer. The C ``long`` code "l" is only
    # 4 bytes on LLP64 platforms (e.g. Windows) and would misread the data.
    nums = memoryview(num_buf).cast("q")  # q = int64
    num_iter = _num_iter(valid_buf, nums)

    offset = 0
    out_offsets = array.array("I")  # uint32
    out_utf8 = io.BytesIO()

    for num in num_iter:
        # At each number, output the _start_ offset of that number
        out_offsets.append(offset)
        if num is not None:
            formatted, n = codecs.readbuffer_encode(_ns_to_iso8601(num))
            out_utf8.write(formatted)
            offset += n
        # A null writes nothing: the next start offset equals this one.

    # Arrow needs one more offset than values: the end of the last string.
    out_offsets.append(offset)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(out_offsets.tobytes()),
        data=pa.py_buffer(bytes(out_utf8.getbuffer())),
        null_bitmap=valid_buf,
        null_count=arr.null_count,
    )
Exemplo n.º 14
0
def format_date_array(arr: pa.Array, unit: DateUnit) -> pa.Array:
    """Build a PyArrow utf8 array from a date32 array.

    The output Array has the same length as the input and reuses its
    validity bitmap; two new contiguous buffers (offsets and UTF-8 data) are
    allocated.

    Formats (for date "2022-08-01", a Monday):

    * day: "2022-08-01"
    * week: "2022-08-01"
    * month: "2022-08"
    * quarter: "2022 Q3"
    * year: "2022"

    Any unit other than "year", "quarter" or "month" gets the full
    year-month-day form.
    """
    valid_buf, num_buf = arr.buffers()
    days = memoryview(num_buf).cast("i")  # i = int32
    day_iter = _num_iter(valid_buf, days)

    # date32 allows negative years; Python `datetime.date` doesn't, so the
    # calendar fields come from time.gmtime() instead of datetime.date.
    if unit == "year":

        def _render(st) -> str:
            return str(st.tm_year)

    elif unit == "quarter":

        def _render(st) -> str:
            return str(st.tm_year) + " Q" + str((st.tm_mon + 2) // 3)

    elif unit == "month":

        def _render(st) -> str:
            return str(st.tm_year) + "-" + str(st.tm_mon).zfill(2)

    else:

        def _render(st) -> str:
            return "-".join((
                str(st.tm_year),
                str(st.tm_mon).zfill(2),
                str(st.tm_mday).zfill(2),
            ))

    position = 0
    starts = array.array("I")  # uint32
    text = io.BytesIO()

    for day in day_iter:
        # Record where this value's text begins, even for nulls.
        starts.append(position)
        if day is None:
            continue
        encoded, n_bytes = codecs.readbuffer_encode(
            _render(time.gmtime(86400 * day)))
        text.write(encoded)
        position += n_bytes

    # Closing offset: end of the last value.
    starts.append(position)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(starts.tobytes()),
        data=pa.py_buffer(text.getvalue()),
        null_bitmap=valid_buf,
        null_count=arr.null_count,
    )
Exemplo n.º 15
0
def format_number_array(arr: pa.Array, fn: NumberFormatter) -> pa.Array:
    """
    Build a PyArrow utf8 array from a number array.

    The output Array will have the same length as the input. Input NULL and
    non-finite values (NaN, inf, -inf) will become NULL outputs.

    The output Array will consume RAM using three new, contiguous buffers
    (validity bitmap, offsets and UTF-8 data).

    Raises ``TypeError`` when the array type is not one of the supported
    integer or float types.

    NOTE(review): ``arr.offset`` is not consulted here, so this presumably
    expects an unsliced array -- confirm against callers.
    """
    # num_buf: byte-buffer holding numbers. num_buf[i*size:(i+1)*size] is the
    # ith value in arr.
    #
    # valid_buf: bitset of "valid" entries, 8 per byte. Bit (i % 8) of byte
    # (i // 8) is 1 when the ith entry in arr is set; it's 0 when the ith
    # entry in arr is null.
    valid_buf, num_buf = arr.buffers()
    # 64-bit types use "Q"/"q", which are always 8 bytes wide. The C ``long``
    # codes "L"/"l" are only 4 bytes on LLP64 platforms (e.g. Windows).
    for detect, struct_format in [
        (pa.types.is_uint8, "B"),
        (pa.types.is_uint16, "H"),
        (pa.types.is_uint32, "I"),
        (pa.types.is_uint64, "Q"),
        (pa.types.is_int8, "b"),
        (pa.types.is_int16, "h"),
        (pa.types.is_int32, "i"),
        (pa.types.is_int64, "q"),
        (pa.types.is_float16, "e"),
        (pa.types.is_float32, "f"),
        (pa.types.is_float64, "d"),
    ]:
        if detect(arr.type):
            break
    else:
        raise TypeError("Unknown array type %r" % arr.type)  # pragma: no cover

    if valid_buf is None:
        # HACK: give the same interface as PyArrow bitmap buffer.
        # Make validity bitmap all-ones. (This yields one spare byte when
        # len(arr) is a multiple of 8; the loop below copies it through.)
        valid_buf = b"\xff" * ((len(arr) + 8) // 8)

    nums = memoryview(num_buf).cast(struct_format)
    num_iter = iter(nums)
    offset = 0
    n_extra_nulls = 0

    out_valid8s = array.array("B")  # uint8
    out_offsets = array.array("I")  # uint32
    out_utf8 = io.BytesIO()

    # valid_buf is a bitset: 8 numbers per byte.
    # Iterate in groups of 8.
    for in_valid8 in valid_buf:
        out_valid8 = in_valid8
        try:
            for valid_i in range(8):
                valid_mask = 1 << valid_i
                is_valid = in_valid8 & valid_mask
                num = next(num_iter)
                # At each number, output the _start_ offset of that number
                out_offsets.append(offset)
                if is_valid:
                    if math.isfinite(num):
                        formatted, _ = codecs.utf_8_encode(fn(num))
                        out_utf8.write(formatted)
                        offset += len(formatted)
                    else:
                        n_extra_nulls += 1
                        # Input was NaN, inf, -inf. We don't format those: we
                        # set them to null.
                        #
                        # Flip output bit to 0
                        out_valid8 &= out_valid8 ^ valid_mask
                        # ... and offset does not change: next number will write
                        # the same offset, meaning _this_ number consumes 0
                        # bytes in out_utf8.
        except StopIteration:
            # Ran out of numbers part-way through this bitmap byte; the
            # remaining bits are padding.
            pass
        out_valid8s.append(out_valid8)

    # Arrow needs one more offset than values: the end of the last string.
    out_offsets.append(offset)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(out_offsets.tobytes()),
        data=pa.py_buffer(bytes(out_utf8.getbuffer())),
        null_bitmap=pa.py_buffer(out_valid8s.tobytes()),
        null_count=arr.null_count + n_extra_nulls,
    )
Exemplo n.º 16
0
def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """
    Return the values of a numeric pyarrow.Array as a NumPy array.

    The array's second buffer (index 1) is reinterpreted with the pandas
    dtype of the Arrow type and cut down to the ``offset``/``len`` window of
    ``array``.
    """
    np_dtype = array.type.to_pandas_dtype()
    window = slice(array.offset, array.offset + len(array))
    # presumably a NumPy view over the Arrow buffer; confirm in _buffer_to_view
    typed_values = _buffer_to_view(array.buffers()[1]).view(np_dtype)
    return typed_values[window]
Exemplo n.º 17
0
def _nix_utf8_chunk_empty_strings(chunk: pyarrow.Array) -> pyarrow.Array:
    """
    Return a pa.Array that replaces "" with null.

    Assume `chunk` is of type `utf8` or a dictionary of `utf8`.

    The offsets and data buffers of `chunk` are reused; only the validity
    bitmap is rebuilt, marking every zero-length value as null.
    """
    # pyarrow's cast() can't handle empty string, hence this rewrite.
    _, offsets_buf, data_buf = chunk.buffers()

    # The new validity bitmap is derived from offsets alone: a value is valid
    # iff its end offset is greater than its start offset. This assumes
    # already-null values occupy no data bytes, so the _original_ validity
    # buffer can be ignored. (Null values are stored as "" plus "invalid".)
    #
    # Validity-bitmap spec:
    # https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
    offsets = array.array("i")
    assert offsets.itemsize == 4
    offsets.frombytes(offsets_buf)
    if sys.byteorder != "little":
        offsets.byteswap()  # pyarrow is little-endian

    # First offset must be 0; consecutive offsets give the value lengths.
    first_offset = offsets[0]
    assert first_offset == 0

    n_values = len(offsets) - 1
    # One bit per value, always with a spare trailing byte when the count is
    # a multiple of 8 (kept from the original, which padded deliberately).
    validity = bytearray(n_values // 8 + 1)
    null_count = 0
    for index in range(n_values):
        if offsets[index + 1] > offsets[index]:
            validity[index >> 3] |= 1 << (index & 7)
        else:
            null_count += 1

    validity_buf = pyarrow.py_buffer(validity)

    # The count above covers the whole buffer; values in front of
    # `chunk.offset` are not part of the chunk, so take them back out.
    null_count -= sum(
        1 for i in range(chunk.offset) if offsets[i + 1] == offsets[i]
    )

    return pyarrow.StringArray.from_buffers(
        length=len(chunk),
        value_offsets=offsets_buf,
        data=data_buf,
        null_bitmap=validity_buf,
        null_count=null_count,
        offset=chunk.offset,
    )
Exemplo n.º 18
0
def _text_replace_case_sensitive(data: pa.Array, pat: str, repl: str,
                                 max_repl: int) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in every element of ``data``.

    At most ``max_repl`` replacements are made per row; ``max_repl = -1``
    means no limit. An empty pattern is routed to a dedicated kernel.

    Matching is done byte-by-byte on the UTF-8 encodings and is independent
    of any locales or encodings. The result is a ``pa.string()`` array with
    the same length and null count as the input.
    """
    n_rows = len(data)
    has_nulls = data.null_count != 0

    # Both kernels operate on UTF-8 bytes.
    pat_bytes: bytes = pat.encode()
    repl_bytes: bytes = repl.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)

    if has_nulls:
        valid_buffer = _buffer_to_view(data.buffers()[0])
    else:
        # The kernels get an empty array when there is no bitmap to consult.
        valid_buffer = np.empty(0, dtype=np.uint8)

    if pat:
        output_offsets, output_buffer = _text_replace_case_sensitive_numba(
            n_rows,
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            pat_bytes,
            repl_bytes,
            max_repl,
        )
    else:
        output_offsets, output_buffer = (
            _text_replace_case_sensitive_empty_pattern(
                n_rows,
                valid_buffer,
                data.offset,
                offsets_buffer,
                data_buffer,
                repl_bytes,
                max_repl,
            ))

    output_valid = None
    if has_nulls:
        # The output starts at bit 0, so re-base the validity bitmap: drop
        # whole leading bytes, then shift away any remaining bit offset.
        output_valid = data.buffers()[0].slice(data.offset // 8)
        bit_shift = data.offset % 8
        if bit_shift != 0:
            output_valid = shift_unaligned_bitmap(output_valid, bit_shift,
                                                  n_rows)

    return pa.Array.from_buffers(
        pa.string(),
        n_rows,
        [output_valid,
         pa.py_buffer(output_offsets),
         pa.py_buffer(output_buffer)],
        data.null_count,
    )