def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """
    Return the logical values of a numeric or boolean pyarrow.Array.

    The second buffer of ``array`` (the data buffer) is viewed as a NumPy
    array and trimmed to ``array.offset`` / ``len(array)``, so only the
    elements that belong to ``array`` are returned.
    """
    np_dtype = array.type.to_pandas_dtype()
    first = array.offset
    last = first + len(array)
    raw = _buffer_to_view(array.buffers()[1])

    if not pa.types.is_boolean(array.type):
        # Fixed-width values: reinterpret the bytes and slice by element.
        return raw.view(np_dtype)[first:last]

    # Booleans are stored one bit per value, least-significant bit first, so
    # the bytes have to be expanded to one entry per bit before slicing.
    bits = np.unpackbits(raw.view(np.uint8), bitorder="little")
    return bits[first:last].astype(bool)
def from_arrow(cls, data: pa.Array):
    """
    Build an instance of ``cls`` from a pyarrow decimal array.

    The dtype is derived with ``Decimal64Dtype.from_arrow``, the validity
    buffer (if any) is converted with ``pa_mask_buffer_to_mask``, and every
    other 64-bit word of the data buffer is copied into the new column.
    """
    dtype = Decimal64Dtype.from_arrow(data.type)

    validity = data.buffers()[0]
    if validity is None:
        mask = None
    else:
        mask = pa_mask_buffer_to_mask(validity, len(data))

    # Reinterpret the raw data buffer as 64-bit integers and move it to the
    # device.
    # NOTE(review): ``data.offset`` is not applied here -- confirm that
    # callers never pass a sliced array.
    host_words = np.frombuffer(data.buffers()[1]).view("int64")
    device_words = cp.array(host_words)

    # Keep words 0, 2, 4, ... -- presumably the low half of each 128-bit
    # decimal value; TODO confirm against the Arrow decimal layout.
    kept_words = device_words[::2].copy()

    return cls(
        data=Buffer(kept_words.view("uint8")),
        size=len(data),
        dtype=dtype,
        mask=mask,
    )
def _get_data_buffer(
    self, arr: pa.Array
) -> Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]:
    """
    Return the column's data buffer together with its protocol dtype.

    Parameters
    ----------
    arr : pa.Array
        PyArrow array holding column's data.

    Returns
    -------
    tuple
        ``(OmnisciProtocolBuffer, dtype)`` where ``dtype`` is the protocol
        representation of the data stored in the buffer.
    """
    if self.dtype[0] == DTypeKind.CATEGORICAL:
        # Dictionary-encoded data: the buffer must expose the category codes.
        arr = arr.indices

    arrow_type = self._dtype_from_pyarrow(arr.type)

    if self.dtype[0] != DTypeKind.STRING:
        buff_size = self._get_buffer_size(bit_width=arrow_type[1])
    else:
        # String buffers are not chunked (that would require rewriting the
        # offset values), so the whole data buffer is returned for every chunk.
        buff_size = None

    # Per Arrow's memory layout the data buffer is always the last entry of
    # `.buffers()`:
    # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
    data_buffer = OmnisciProtocolBuffer(arr.buffers()[-1], buff_size)
    return data_buffer, arrow_type
def _get_validity_buffer(
    self, arr: pa.Array
) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]]:
    """
    Return the column's validity buffer together with its protocol dtype.

    Parameters
    ----------
    arr : pa.Array
        PyArrow array holding column's data.

    Returns
    -------
    tuple or None
        ``(OmnisciProtocolBuffer, dtype)`` describing the validity bit-mask,
        or None when ``arr`` carries no validity buffer.
    """
    # Per Arrow's memory layout the validity buffer is always the first entry
    # of `.buffers()`:
    # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
    bitmap = arr.buffers()[0]
    if bitmap is not None:
        # When present, the validity buffer is a bit-mask: one bit per value.
        mask_size = self._get_buffer_size(bit_width=1)
        protocol_buffer = OmnisciProtocolBuffer(bitmap, mask_size)
        mask_dtype = (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
        return protocol_buffer, mask_dtype
    return None
def _text_strip(data: pa.Array, to_strip) -> pa.Array:
    """
    Remove the characters in ``to_strip`` from both ends of every element.

    An empty input is returned unchanged; otherwise a new string array is
    assembled through a ``StringArrayBuilder``.
    """
    n_rows = len(data)
    if n_rows == 0:
        return data

    offsets, data_buffer = _extract_string_buffers(data)

    # Initial builder capacity: the larger of the byte count and the row count.
    builder = StringArrayBuilder(max(len(data_buffer), n_rows))
    _do_strip(
        data.buffers()[0],  # validity bitmap (may be None)
        data.offset,
        offsets,
        data_buffer,
        n_rows,
        to_strip,
        inout_builder=builder,
    )
    return finalize_string_array(builder, pa.string())
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """
    Perform ``pyarrow.Array | pyarrow.Array`` on two boolean arrays.

    The result has ``len(a)`` elements. Which kernel is used depends on
    which of the inputs contain nulls.
    """
    n_rows = len(a)
    # Number of bytes needed to hold one bit per row.
    output_length = (n_rows + 7) // 8

    if a.null_count == 0:
        # Data bits are a.data | b.data whether or not b has nulls.
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(n_rows, a.buffers()[1], a.offset, b.buffers()[1],
                            b.offset, result)
        if b.null_count == 0:
            return pa.Array.from_buffers(pa.bool_(), n_rows,
                                         [None, pa.py_buffer(result)], 0)

        # b has nulls: a row is valid when a is True or b is valid, i.e.
        # valid_bits = a.data | b.valid_bits.
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(n_rows, a.buffers()[1], a.offset, b.buffers()[0],
                            b.offset, valid_bits)
        return pa.Array.from_buffers(
            pa.bool_(), n_rows,
            [pa.py_buffer(valid_bits), pa.py_buffer(result)])

    if b.null_count == 0:
        # Only a has nulls: reuse the branch above with the operands swapped.
        return or_array_array(b, a)

    # Both inputs have nulls: the masked kernel fills data and validity and
    # reports the resulting null count.
    result = np.zeros(output_length, dtype=np.uint8)
    valid_bits = np.zeros(output_length, dtype=np.uint8)
    null_count = masked_bitmap_or_unaligned(
        n_rows,
        a.buffers()[0],
        a.buffers()[1],
        a.offset,
        b.buffers()[0],
        b.buffers()[1],
        b.offset,
        result,
        valid_bits,
    )
    return pa.Array.from_buffers(
        pa.bool_(),
        n_rows,
        [pa.py_buffer(valid_bits), pa.py_buffer(result)],
        null_count,
    )
def _extract_isnull_bitmap(arr: pa.Array, offset: int, length: int):
    """
    Return ``length`` entries of the first buffer of ``arr`` from ``offset``.

    When the view of that buffer is empty, a freshly allocated uint8 array of
    ``length`` entries with every bit set (value 255) is returned instead, so
    callers always receive a populated bitmap.
    """
    bitmap = _buffer_to_view(arr.buffers()[0])
    if len(bitmap) == 0:
        return np.full(length, fill_value=255, dtype=np.uint8)
    return bitmap[offset:offset + length]
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Test, for every element of ``data``, whether it contains ``pat``.

    The comparison is a plain byte-by-byte match on the UTF-8 encoding and
    therefore independent of any locale or encoding settings. The result is
    a boolean array with the same null count as the input.
    """
    # Pattern as UTF-8 bytes.
    pat_bytes: bytes = pat.encode()

    n_rows = len(data)

    # Bit-packed boolean output: one bit per row, rounded up to whole bytes.
    n_bytes, leftover_bits = divmod(n_rows, 8)
    if leftover_bits > 0:
        n_bytes += 1
    output = np.empty(n_bytes, dtype=np.uint8)
    if leftover_bits > 0:
        # The buffer is uninitialised: clear the last byte so its unused
        # trailing bits are zero.
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            n_rows, offsets, data_buffer, pat_bytes, output
        )
    else:
        valid = _buffer_to_view(data.buffers()[0])
        _text_contains_case_sensitive_nulls(
            n_rows, valid, data.offset, offsets, data_buffer, pat_bytes, output
        )
        # The output has offset 0, so the input validity bitmap is re-based:
        # drop whole leading bytes, then shift out the remaining bits.
        bit_offset = data.offset % 8
        valid_buffer = data.buffers()[0].slice(data.offset // 8)
        if bit_offset != 0:
            valid_buffer = shift_unaligned_bitmap(
                valid_buffer, bit_offset, n_rows
            )

    return pa.Array.from_buffers(
        pa.bool_(), n_rows, [valid_buffer, pa.py_buffer(output)], data.null_count
    )
def or_na(arr: pa.Array) -> pa.Array:
    """
    Apply ``array | NA`` to a boolean pyarrow.Array.

    In the result the validity bitmap and the data bitmap are the same
    buffer.
    """
    n_rows = len(arr)

    if arr.null_count != 0:
        # One bit per row, rounded up to whole bytes.
        n_bytes = (n_rows + 7) // 8
        output = np.zeros(n_bytes, dtype=np.uint8)
        null_count = _or_na(n_rows, arr.offset, arr.buffers()[0],
                            arr.buffers()[1], output)
        packed = pa.py_buffer(output)
        return pa.Array.from_buffers(pa.bool_(), n_rows, [packed, packed],
                                     null_count)

    # No nulls in the input: the data bitmap doubles as the validity bitmap.
    # The input offset is kept, and the null count is left for pyarrow to
    # compute (-1).
    data_bits = arr.buffers()[1]
    return pa.Array.from_buffers(
        pa.bool_(),
        n_rows,
        [data_bits, data_bits],
        null_count=-1,
        offset=arr.offset,
    )
def schedule(self, array: pyarrow.Array, info: ArrayInfo):
    """
    Register the buffers and type metadata of a flat Arrow array under ``info``.

    A ``"__arrow_array__"`` child is created on ``info``; one buffer-backed
    child per Arrow buffer and one JSON ``"metadata"`` child are created
    beneath it, and each is moved to ``self.directory``. ``info`` is returned.

    Raises
    ------
    NotImplementedError
        If the array's type has children, or its string form equals
        ``"directory"``.
    """
    arrow_type = array.type

    # Only primitive layouts (null bitmap + data) are supported.
    # NOTE(review): the literal compared against is "directory"; if the intent
    # was to exclude dictionary types this comparison looks suspicious --
    # confirm.
    if arrow_type.num_children != 0 or str(arrow_type) == "directory":
        raise NotImplementedError()

    array_meta = info.new_child("__arrow_array__")
    field_orders = self.field_order[arrow_type.num_buffers]

    # NOTE(review): every buffer child is created with the literal name
    # "values"; the field name paired with it by zip() is not used -- confirm
    # this is intended.
    for _field_name, buffer in zip(field_orders, array.buffers()):
        buffer_child = array_meta.new_child(
            "values", backend="buffer", compression="mmap", data=buffer
        )
        buffer_child.move_to(self.directory)

    metadata = {
        **arrow_type_to_dict(arrow_type),  # type, args
        "field_order": field_orders,
    }
    metadata_child = array_meta.new_child(
        "metadata", backend="json", data=metadata
    )
    metadata_child.move_to(self.directory)
    return info
def all_true_like(arr: pa.Array) -> pa.Array:
    """
    Return an all-True boolean array shaped like ``arr``.

    The result has the same length, null count and validity bitmap as the
    input.
    """
    whole_bytes, bit_offset = divmod(arr.offset, 8)

    valid_buffer = arr.buffers()[0]
    if valid_buffer:
        # Drop the whole bytes before the offset; the remaining 0-7 bits are
        # carried over as the offset of the result.
        valid_buffer = valid_buffer.slice(whole_bytes)

    # The data bitmap must cover the leftover bit offset plus every row.
    n_bits = len(arr) + bit_offset
    n_bytes = (n_bits + 7) // 8
    all_ones = np.full(n_bytes, fill_value=255, dtype=np.uint8)

    return pa.Array.from_buffers(
        pa.bool_(),
        len(arr),
        [valid_buffer, pa.py_buffer(all_ones)],
        null_count=arr.null_count,
        offset=bit_offset,
    )
def _get_offsets_buffer(
    self, arr: pa.Array
) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]]:
    """
    Return the column's offsets buffer together with its protocol dtype.

    Parameters
    ----------
    arr : pa.Array
        PyArrow array holding column's data.

    Returns
    -------
    tuple or None
        ``(OmnisciProtocolBuffer, dtype)`` describing the offsets, or None
        when ``arr`` has fewer than three buffers (no offsets buffer).
    """
    buffs = arr.buffers()

    # In Arrow's memory layout the offsets buffer, when present, sits at the
    # second position of `.buffers()`. Only Primitive, Variable-length binary
    # and Dict-encoded layouts are supported, so three or more buffers means
    # an offsets buffer exists and fewer means it does not.
    # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
    if len(buffs) >= 3:
        # Per Arrow's data layout the offsets are stored as "int32".
        dtype = self._dtype_from_primitive_numpy(np.dtype("int32"))
        size = self._get_buffer_size(bit_width=dtype[1], is_offset_buffer=True)
        return OmnisciProtocolBuffer(buffs[1], size), dtype
    return None
def format_timestamp_array(arr: pa.Array) -> pa.Array:
    """Build a PyArrow utf8 array from a timestamp array.

    The output Array will have the same length as the input.

    The output Array will consume RAM using two new, contiguous buffers
    (offsets and UTF-8 data); the input validity buffer is reused as-is.

    The format will be ISO8601, as precise as needed.

    Raises NotImplementedError when the timestamp unit is not "ns".
    """
    valid_buf, num_buf = arr.buffers()
    if arr.type.unit != "ns":
        raise NotImplementedError("TODO handle non-ns")  # pragma: no cover

    # Timestamps are 64-bit integers. Use "q" (long long, 64 bits) rather than
    # "l" (C long), which is only 32 bits wide on some platforms (e.g. Windows)
    # and would misread the buffer there.
    nums = memoryview(num_buf).cast("q")  # q = int64
    num_iter = _num_iter(valid_buf, nums)

    offset = 0
    out_offsets = array.array("I")  # uint32
    out_utf8 = io.BytesIO()

    for num in num_iter:
        # At each number, output the _start_ offset of that number
        out_offsets.append(offset)
        if num is not None:
            formatted, n = codecs.readbuffer_encode(_ns_to_iso8601(num))
            out_utf8.write(formatted)
            offset += n
        # A None entry writes no bytes: the next start offset equals this
        # one, so the value is zero-length.

    # Closing offset: the end of the last value.
    out_offsets.append(offset)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(out_offsets.tobytes()),
        data=pa.py_buffer(bytes(out_utf8.getbuffer())),
        null_bitmap=valid_buf,
        null_count=arr.null_count,
    )
def format_date_array(arr: pa.Array, unit: DateUnit) -> pa.Array:
    """Render a date32 array as a utf8 array, as precise as ``unit`` needs.

    The output Array has the same length as the input and reuses its
    validity buffer; two new contiguous buffers (offsets and UTF-8 data)
    are allocated.

    Output per unit (for date "2022-08-01"):

    * "year": "2022"
    * "quarter": "2022 Q3"
    * "month": "2022-08"
    * anything else (e.g. "day", "week"): "2022-08-01"
    """
    valid_buf, num_buf = arr.buffers()
    days = memoryview(num_buf).cast("i")  # i = int32
    day_iter = _num_iter(valid_buf, days)

    # date32 allows negative years; Python `datetime.date` doesn't, so the
    # formatters go through `time.gmtime` on seconds-since-epoch instead.
    def _format_year(day: int) -> str:
        return str(time.gmtime(86400 * day).tm_year)

    def _format_quarter(day: int) -> str:
        st = time.gmtime(86400 * day)
        return f"{st.tm_year} Q{(st.tm_mon + 2) // 3}"

    def _format_month(day: int) -> str:
        st = time.gmtime(86400 * day)
        return f"{st.tm_year}-{st.tm_mon:02d}"

    def _format_day(day: int) -> str:
        st = time.gmtime(86400 * day)
        return f"{st.tm_year}-{st.tm_mon:02d}-{st.tm_mday:02d}"

    if unit == "year":
        _format = _format_year
    elif unit == "quarter":
        _format = _format_quarter
    elif unit == "month":
        _format = _format_month
    else:
        _format = _format_day

    end = 0
    out_offsets = array.array("I")  # uint32
    out_utf8 = io.BytesIO()

    for day in day_iter:
        # Record where this value starts; a None entry writes no bytes and
        # therefore ends where it starts.
        out_offsets.append(end)
        if day is None:
            continue
        encoded, n_bytes = codecs.readbuffer_encode(_format(day))
        out_utf8.write(encoded)
        end += n_bytes

    # Closing offset: the end of the last value.
    out_offsets.append(end)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(out_offsets.tobytes()),
        data=pa.py_buffer(bytes(out_utf8.getbuffer())),
        null_bitmap=valid_buf,
        null_count=arr.null_count,
    )
def format_number_array(arr: pa.Array, fn: NumberFormatter) -> pa.Array:
    """
    Build a PyArrow utf8 array from a number array.

    The output Array will have the same length as the input. Input NULL and
    non-finite values (NaN, inf, -inf) will become NULL outputs.

    The output Array will consume RAM using three new, contiguous buffers
    (validity, offsets and UTF-8 data).

    Raises TypeError when ``arr`` is not one of the supported integer or
    float types.
    """
    # num_buf: byte-buffer holding numbers. num_buf[i*size:(i+1)*size] is the
    # ith value in the buffer.
    #
    # valid_buf: bitset of "valid" values. Bit (i % 8) of byte (i // 8) is 1
    # when the ith entry is set; it's 0 when the ith entry is pa.NULL.
    valid_buf, num_buf = arr.buffers()

    # 64-bit types use "Q"/"q" (long long): "L"/"l" (C long) are only 32 bits
    # wide on some platforms (e.g. Windows) and would misread the buffer.
    for detect, struct_format in [
        (pa.types.is_uint8, "B"),
        (pa.types.is_uint16, "H"),
        (pa.types.is_uint32, "I"),
        (pa.types.is_uint64, "Q"),
        (pa.types.is_int8, "b"),
        (pa.types.is_int16, "h"),
        (pa.types.is_int32, "i"),
        (pa.types.is_int64, "q"),
        (pa.types.is_float16, "e"),
        (pa.types.is_float32, "f"),
        (pa.types.is_float64, "d"),
    ]:
        if detect(arr.type):
            break
    else:
        raise TypeError("Unknown array type %r" % arr.type)  # pragma: no cover

    if valid_buf is None:
        # HACK: give the same interface as PyArrow bitmap buffer.
        # Make validity bitmap all-ones.
        valid_buf = b"\xff" * ((len(arr) + 8) // 8)

    # NOTE(review): iteration starts at the first element of the buffer and
    # `arr.offset` is never read -- confirm callers never pass sliced arrays.
    nums = memoryview(num_buf).cast(struct_format)
    num_iter = iter(nums)
    offset = 0
    n_extra_nulls = 0

    out_valid8s = array.array("B")  # uint8
    out_offsets = array.array("I")  # uint32
    out_utf8 = io.BytesIO()

    # valid_buf is a bitset: 8 numbers per byte.
    # Iterate in groups of 8.
    for in_valid8 in valid_buf:
        out_valid8 = in_valid8
        try:
            for valid_i in range(8):
                valid_mask = 1 << valid_i
                is_valid = in_valid8 & valid_mask
                num = next(num_iter)
                # At each number, output the _start_ offset of that number
                out_offsets.append(offset)
                if is_valid:
                    if math.isfinite(num):
                        formatted, _ = codecs.utf_8_encode(fn(num))
                        out_utf8.write(formatted)
                        offset += len(formatted)
                    else:
                        n_extra_nulls += 1
                        # Input was NaN, inf, -inf. We don't format those: we
                        # set them to null.
                        #
                        # Flip output bit to 0
                        out_valid8 &= out_valid8 ^ valid_mask
                        # ... and offset does not change: next number will
                        # write the same offset, meaning _this_ number
                        # consumes 0 bytes in out_utf8.
        except StopIteration:
            # Ran out of numbers part-way through (or before) this byte.
            pass
        out_valid8s.append(out_valid8)

    # Closing offset: the end of the last value.
    out_offsets.append(offset)

    return pa.StringArray.from_buffers(
        length=len(arr),
        value_offsets=pa.py_buffer(out_offsets.tobytes()),
        data=pa.py_buffer(bytes(out_utf8.getbuffer())),
        null_bitmap=pa.py_buffer(out_valid8s.tobytes()),
        null_count=arr.null_count + n_extra_nulls,
    )
def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """
    Extract the data buffer of a numeric- or boolean-typed pyarrow.Array.

    The returned np.ndarray covers exactly the logical elements of ``array``
    (``array.offset`` and ``len(array)`` are applied).
    """
    dtype = array.type.to_pandas_dtype()
    start = array.offset
    end = array.offset + len(array)
    data_view = _buffer_to_view(array.buffers()[1])

    if pa.types.is_boolean(array.type):
        # Arrow stores booleans bit-packed (least-significant bit first), so
        # a plain dtype view would treat each byte as one value. Expand the
        # bits to one entry per value before slicing.
        bits = np.unpackbits(data_view.view(np.uint8), bitorder="little")
        return bits[start:end].astype(bool)

    return data_view.view(dtype)[start:end]
def _nix_utf8_chunk_empty_strings(chunk: pyarrow.Array) -> pyarrow.Array:
    """
    Return a pa.Array that replaces "" with null.

    Assume `chunk` is of type `utf8` (three buffers: validity, offsets, data).

    The offsets and data buffers are reused; only the validity bitmap is
    rebuilt. The returned array keeps the length and offset of `chunk`.
    """
    # pyarrow's cast() can't handle empty string. Create a new Array with
    # "" changed to null.
    _, offsets_buf, data_buf = chunk.buffers()

    # Build a new validity buffer, based on offsets. Empty string = null.
    # Assume `data` has no padding bytes in the already-null values. That way
    # we can ignore the _original_ validity buffer and assume all original
    # values are not-null. (Null values are stored as "" plus "invalid".)
    #
    # Validity-bitmap spec:
    # https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
    offsets = array.array("i")
    assert offsets.itemsize == 4
    offsets.frombytes(offsets_buf)
    if sys.byteorder != "little":
        offsets.byteswap()  # pyarrow is little-endian

    # First offset must be 0. Consecutive offsets give each value's length.
    assert offsets[0] == 0

    # One bit per value in the offsets buffer, plus room so that there is
    # always at least one (possibly partial or all-zero) trailing byte.
    n_values = len(offsets) - 1
    validity = bytearray(n_values // 8 + 1)

    # Only values inside [chunk.offset, chunk.offset + len(chunk)) belong to
    # the chunk; empty strings before or after that window must not be
    # counted as nulls of the chunk.
    window_start = chunk.offset
    window_end = window_start + len(chunk)
    null_count = 0

    # memoryview slice: pair every offset with its successor without copying.
    for i, (begin, end) in enumerate(zip(offsets, memoryview(offsets)[1:])):
        if end > begin:
            validity[i >> 3] |= 1 << (i & 7)
        elif window_start <= i < window_end:
            null_count += 1

    validity_buf = pyarrow.py_buffer(validity)

    return pyarrow.StringArray.from_buffers(
        length=len(chunk),
        value_offsets=offsets_buf,
        data=data_buf,
        null_bitmap=validity_buf,
        null_count=null_count,
        offset=chunk.offset,
    )
def _text_replace_case_sensitive(data: pa.Array, pat: str, repl: str,
                                 max_repl: int) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in every element of ``data``.

    Per row, at most ``max_repl`` replacements are performed; ``max_repl = -1``
    means no limit.

    The comparison is a plain byte-by-byte match on the UTF-8 encoding and
    therefore independent of any locale or encoding settings.
    """
    # Pattern and replacement as UTF-8 bytes.
    pat_bytes: bytes = pat.encode()
    repl_bytes: bytes = repl.encode()

    n_rows = len(data)
    has_nulls = data.null_count != 0

    offsets_buffer, data_buffer = _extract_string_buffers(data)
    if has_nulls:
        valid_buffer = _buffer_to_view(data.buffers()[0])
    else:
        # No nulls: hand the kernels an empty validity array.
        valid_buffer = np.empty(0, dtype=np.uint8)

    # An empty pattern is handled by a dedicated kernel.
    if len(pat) > 0:
        output_offsets, output_buffer = _text_replace_case_sensitive_numba(
            n_rows,
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            pat_bytes,
            repl_bytes,
            max_repl,
        )
    else:
        output_offsets, output_buffer = (
            _text_replace_case_sensitive_empty_pattern(
                n_rows,
                valid_buffer,
                data.offset,
                offsets_buffer,
                data_buffer,
                repl_bytes,
                max_repl,
            )
        )

    output_valid = None
    if has_nulls:
        # The output has offset 0, so the input validity bitmap is re-based:
        # drop whole leading bytes, then shift out the remaining bits.
        bit_offset = data.offset % 8
        output_valid = data.buffers()[0].slice(data.offset // 8)
        if bit_offset != 0:
            output_valid = shift_unaligned_bitmap(output_valid, bit_offset,
                                                  n_rows)

    buffers = [
        output_valid,
        pa.py_buffer(output_offsets),
        pa.py_buffer(output_buffer),
    ]
    return pa.Array.from_buffers(pa.string(), n_rows, buffers,
                                 data.null_count)