def sub_enumerate(ids, id_max):
    """Wrapper of :c:`sub_enumerate()` from src/ragged_array.c

    Args:
        ids (numpy.ndarray):
            A group number for each element.
        id_max (int):
            A strict upper bound for the **ids**.
    Returns:
        counts (numpy.ndarray):
            :py:`counts[x] := ids.count(x)`.
        sub_ids (numpy.ndarray):
            :py:`sub_ids[i] := ids[:i].count(ids[i])`.
    Raises:
        IndexError:
            If either :py:`(0 <= ids).all()` or :py:`(ids < id_max).all()`
            are not satisfied.

    """
    ids = np.ascontiguousarray(ids, dtype=np.intc)
    counts = np.zeros(int(id_max), np.intc)
    sub_ids = np.empty_like(ids)

    # The return value of the C function is not inspected, so the documented
    # IndexError has to be raised from Python. Checking before the call also
    # guarantees that no out-of-range id is ever handed to the C code.
    # An empty `ids` has no min()/max() and is trivially valid.
    if ids.size and (ids.min() < 0 or ids.max() >= counts.size):
        raise IndexError(
            f"All ids must satisfy 0 <= id < {counts.size} but they range "
            f"from {ids.min()} to {ids.max()}.")

    slug.dll.sub_enumerate(ptr(ids), ids.size, ptr(counts), ptr(sub_ids))
    return counts, sub_ids
def add(self, keys) -> np.ndarray:
    """Insert **keys** into the table.

    Keys which are already present in :attr:`keys` are not inserted a second
    time. The return value is the index of every key in :attr:`keys`, in the
    same form as :meth:`get` gives.

    Raises:
        ValueError:
            If the :attr:`~numpy.ndarray.dtype` of **keys** doesn't match
            the :attr:`dtype` of this table.
        exceptions.HashTableFullError:
            If there is no space to place new keys.
        exceptions.HashTableDestroyed:
            If the :meth:`destroy` method has been previously called.

    """
    self._check_destroyed()
    keys, shape = self._norm_input_keys(keys)
    indices = np.empty(shape, np.intp)

    failed = slug.dll.HT_adds(self._raw._ptr, ptr(keys), ptr(indices),
                              indices.size)
    if failed == -1:
        # Every key was placed. Scalar input gets a scalar back.
        return indices if shape else indices.item()

    # Anything other than -1 identifies the key which couldn't be placed.
    from hirola.exceptions import HashTableFullError
    source, value = self._blame_key(failed, keys, shape)
    raise HashTableFullError(
        f"Failed to add {source} = {value} to the "
        f"hash table because the table is full and {value} "
        f"isn't already in it.")
def vectorise_hash(hash, key_size, keys):
    """Run a hash() function over every key in an array of **keys**.

    Each key is taken to be **key_size** bytes long so the number of hashes
    written is the total byte size of **keys** divided by **key_size**.

    Only used for testing.

    """
    keys = np.ascontiguousarray(keys)
    total_bytes = keys.size * keys.dtype.itemsize
    hashes = np.empty(total_bytes // key_size, dtype=np.int32)
    # The hash function is passed to C as an untyped pointer.
    hash_pointer = ctypes.cast(hash, ctypes.c_void_p)
    slug.dll.vectorise_hash(hash_pointer, ptr(keys), ptr(hashes), key_size,
                            hashes.size)
    return hashes
def cumsum(x):
    """Wrapper for the ``cumsum()`` C function using numpy arrays."""
    # Coerce the input into a C contiguous array of 32-bit integers.
    x = np.asarray(x, dtype=np.int32, order="C")
    n = len(x)
    # Uninitialised output of matching length and dtype. ``order="C"`` is
    # numpy's default so it needn't be specified here.
    out = np.empty(n, dtype=np.int32)
    # Let C fill in `out`.
    slug.dll.cumsum(ptr(x), ptr(out), n)
    return out
def cumsum(x):
    """Wrapper for the ``cumsum()`` C function using ``array.array``."""
    # Only convert if `x` isn't already an int32 array.
    is_int32_array = isinstance(x, array) and x.typecode == int32_t
    if not is_int32_array:
        x = array(int32_t, x)
    n = len(x)
    # There is no cheap way to make an uninitialised array so a zero-filled
    # one of the same length as `x` is used as the output buffer.
    out = array(int32_t, [0] * n)
    # Let C fill in `out`.
    slug.dll.cumsum(ptr(x), ptr(out), n)
    return out
def add_1_(arr):
    """Wrapper for the ``add_1()`` C function accepting any array shape."""
    # Coerce into a C contiguous array of C ints.
    arr = np.asarray(arr, dtype=np.intc, order="C")
    # The output mirrors the dtype and shape of the input.
    result = np.empty_like(arr)
    # C sees only flat memory so multi-dimensional inputs need no special
    # handling - provided that the total element count ``arr.size`` is given
    # rather than ``len(arr)``.
    n_items = arr.size
    slug.dll.add_1(ptr(arr), ptr(result), n_items)
    return result
def test_walk_through():
    """Walk a 5-slot float32 table through single-key and vectorised use.

    ``data`` holds five distinct values (100, 101, 103, 104, 105) plus
    repeats, followed by a sixth distinct value (107) which cannot fit.
    """
    data = np.array([100, 101, 100, 103, 104, 105, 103, 107], dtype=np.float32)
    self = HashTable(5, dtype=data.dtype)

    # A freshly made table is empty and no hash slot has an owner.
    assert self.dtype == data.dtype
    assert np.all(self._hash_owners == -1)
    assert self.key_size == 4
    assert self.length == 0
    assert self.max == 5

    # Raw hash of the first key (the first `key_size` bytes of `data`).
    hash = slug.dll.hash(ptr(data), self.key_size)
    # Run the same checks twice: the second pass re-adds a key which is
    # already present and must leave the table unchanged.
    # NOTE(review): the loop is assumed to end after the `_get` check -
    # confirm against the original layout.
    for i in range(2):
        assert slug.dll.HT_hash_for(self._raw._ptr, ptr(data), False) \
               == hash % self.max
        assert self._add(data) == 0
        assert self.length == 1
        assert len(self) == 1
        assert np.array_equal(self.keys, [100])
        assert self._hash_owners[hash % self.max] == 0
        assert self._get(data) == 0

    # Add the remaining keys one at a time. Repeated values give back the
    # index they were first assigned.
    assert self._add(data[1]) == 1
    assert self._add(data[2]) == 0
    assert self._add(data[3]) == 2
    assert self._add(data[4]) == 3
    assert self._add(data[5]) == 4
    assert self._add(data[6]) == 2
    # The table now holds 5 keys so a sixth distinct key is rejected.
    assert self._add(data[7]) == -1

    # The vectorised API agrees with the single-key calls above.
    assert self.add(data[:7]).tolist() == [0, 1, 0, 2, 3, 4, 2]
    assert self.get(data).tolist() == [0, 1, 0, 2, 3, 4, 2, -1]
    assert self[data[:-1]].tolist() == [0, 1, 0, 2, 3, 4, 2]

    # Scalar keys give scalar results.
    assert isinstance(self.add(data[0]), int)
    assert isinstance(self.get(data[0]), int)

    # The error message names the offending key using an index which matches
    # the shape of the input: 1D, 2D and scalar respectively.
    with pytest.raises(exceptions.HashTableFullError,
                       match=r".* add keys\[7\] = 107\.0 to .* and 107\.0 is"):
        self.add(data)

    with pytest.raises(
            exceptions.HashTableFullError,
            match=r".* add keys\[1, 3\] = 107\.0 to .* and 107\.0 "):
        self.add(data.reshape((2, 4)))

    with pytest.raises(exceptions.HashTableFullError,
                       match=r".* add key = 107\.0 to .* and 107\.0 is"):
        self.add(data[7])
def test_strided():
    """Check how ``ptr()`` and ``nc_ptr()`` treat non-contiguous buffers."""
    # memoryview() is used to build a multidimensional PyBuffer. Real users
    # are more likely to reach for a multi-dimensional numpy array.
    backing = array.array("i", range(100))
    view = memoryview(backing).cast("b", (10, 10, backing.itemsize))
    address, _length = backing.buffer_info()

    # Contiguous: both functions give the address of the underlying memory.
    assert ptr(view) == address
    assert nc_ptr(view) == address

    # Strided: ptr() refuses whereas nc_ptr() still gives the address.
    with pytest.raises(ValueError):
        ptr(view[::2])
    # memoryview doesn't support anything fancier to test with.
    assert nc_ptr(view[::2]) == address
def flatten_strided_3D(arr):
    """Wrapper for ``flatten_strided_3D()``."""
    # Only the dtype is normalised. Contiguity is deliberately not enforced.
    arr = np.asarray(arr, dtype=np.double)
    assert arr.ndim == 3
    # A flat, uninitialised output with one slot per element of `arr`.
    out = np.empty(arr.size, arr.dtype)
    # C needs the shape and the strides to navigate `arr`. These are the
    # ctypes array forms, not the tuples ``arr.shape``/``arr.strides``.
    shape, strides = arr.ctypes.shape, arr.ctypes.strides
    # `arr` may not be contiguous so `nc_ptr(arr)` must be used in place of
    # `ptr(arr)`.
    slug.dll.flatten_strided_3D(nc_ptr(arr), ptr(out), ptr(shape),
                                ptr(strides))
    return out
def test_dump_load(dtype, byteorder):
    """Round trip a small ragged array through dumps() and loads()."""
    dtype = np.dtype(dtype).newbyteorder(byteorder)
    flat = np.arange(5, dtype=np.int8)
    self = RaggedArray.from_lengths(flat, [2, 3, 0])

    _byteorder = "big" if _big_endian(dtype) else "little"

    def _bin_int(x):
        # A row length as it should appear in the serialised form.
        return int.to_bytes(x, dtype.itemsize, _byteorder)

    blob = self.dumps(ldtype=dtype)
    # Each row is written as its length followed by its contents.
    expected = b"".join((
        _bin_int(2), flat[0:2].tobytes(),
        _bin_int(3), flat[2:5].tobytes(),
        _bin_int(0), b"",
    ))  # yapf: disable
    # Compare as lists purely so that pytest's traceback is easier to read.
    assert list(blob) == list(expected)

    from rockhopper._ragged_array import slug
    row_count = slug.dll.count_rows(ptr(blob), len(blob), _2_power(dtype),
                                    _big_endian(dtype), flat.itemsize)
    assert row_count == len(self)

    # One stray trailing byte must be rejected.
    with pytest.raises(ValueError):
        RaggedArray.loads(blob.tobytes() + b"\x01", dtype=self.dtype,
                          ldtype=dtype)

    parsed, consumed = RaggedArray.loads(blob, dtype=self.dtype, ldtype=dtype)
    for name in ("starts", "ends", "flat"):
        assert np.array_equal(getattr(self, name), getattr(parsed, name))
    assert consumed == len(blob)
def add_1(arr):
    """Wrapper for the ``add_1()`` C function which flattens its input."""
    # Coerce into a C contiguous array of C ints.
    arr = np.asarray(arr, dtype=np.intc, order="C")
    # Remember the shape so that it can be reapplied to the output.
    shape = arr.shape
    flat = arr.ravel()
    # The output has the same dtype and (flattened) shape as the input.
    flat_out = np.empty_like(flat)
    # Hand the two 1D arrays to C.
    slug.dll.add_1(ptr(flat), ptr(flat_out), len(flat))
    # Give the output the shape which the input originally had.
    return flat_out.reshape(shape)
def test_inc_reffing():
    """Check that the result of ``ptr()`` holds a reference to its buffer."""
    buffer = array.array("i", range(100))
    old = sys.getrefcount(buffer)

    # Taking a pointer adds exactly one reference to the buffer ...
    p = ptr(buffer)
    assert sys.getrefcount(buffer) == old + 1
    # ... which is given back when the pointer is deleted.
    del p
    assert sys.getrefcount(buffer) == old

    # A weak reference leaves the reference count alone but lets us see
    # whether the buffer is still alive after our own name for it is gone.
    buffer_ = weakref.ref(buffer)
    assert sys.getrefcount(buffer) == old

    p = ptr(buffer)
    assert sys.getrefcount(buffer) == old + 1
    # With `p` still around, deleting `buffer` must not free the array.
    del buffer
    assert buffer_() is not None
    # Only once the pointer is also gone is the array released.
    del p
    assert buffer_() is None
def flatten_3D(arr):
    """Wrapper for ``flatten_3D()``."""
    # Coerce into a C contiguous array of doubles.
    arr = np.asarray(arr, dtype=np.double, order="C")
    # Anything other than a 3D array is a mistake.
    assert arr.ndim == 3
    # A flat, uninitialised output with one slot per element of `arr`.
    out = np.empty(arr.size, arr.dtype)
    # `arr.shape` is a tuple which C can't read. `arr.ctypes.shape` is the
    # same information as a ctypes size_t array.
    shape = arr.ctypes.shape
    slug.dll.flatten_3D(ptr(arr), ptr(out), ptr(shape))
    return out
def sum_(arr):
    """Wrapper for the ``sum()`` function from ``arrays-demo.c``."""
    # Anything which isn't already an array of doubles gets converted to one.
    is_double_array = isinstance(arr, array) and arr.typecode == "d"
    if not is_double_array:
        arr = array("d", arr)
    # Hand the array and its length over to C.
    total = slug.dll.sum(ptr(arr), len(arr))
    return total
def test_hash():
    """Compare the C ``hash()`` function against the same sum done in numpy.

    The expected value is built by folding each int32 of the key into a
    running int32 using an XOR followed by a multiplication, both of which
    are allowed to wrap around.
    """
    x = np.array([123, 4234, 213], dtype=np.int32)
    out = np.int32(0)
    # Integer wrap-around is intended here. Using the `errstate` context
    # manager (instead of a seterr() save/restore pair) guarantees that
    # numpy's global error handling is put back even if the loop raises.
    with np.errstate(over="ignore"):
        for i in range(3):
            out ^= x[i] * np.int32(0x10001)
            out *= np.int32(0x0B070503)
    # 12 is the size of the key in bytes: 3 items of 4 bytes each.
    assert slug.dll.hash(ptr(x), 12) == out
def __init__(self, max: Number, dtype: dtype_types):
    """

    Args:
        max:
            An upper bound for the number of keys which can fit in this
            table. Sets the :attr:`max` attribute.
        dtype:
            The data type for the table's keys. Sets the :attr:`dtype`
            attribute.

    Raises:
        TypeError:
            If the base type of **dtype** is :class:`object`.

    The **max** parameter is silently normalised to :class:`int` and clipped
    to a minimum of 1 if it is less than 1.

    """
    self._dtype = np.dtype(dtype)
    key_size = self.dtype.itemsize
    self._base_dtype, self._dtype_shape = self.dtype.base, self.dtype.shape
    if self._base_dtype == object:
        raise TypeError("Object arrays are not permitted.")
    if self._base_dtype.kind in "SUV":
        # String-like types are checked differently.
        self._check_dtype = self._check_str_dtype

    # Zero-sized tables get in the way of modulo.
    # Negative-sized tables obviously don't make sense.
    # Clip *before* converting to int and compare against 1 rather than 0:
    # a fractional size such as 0.5 is positive yet int() truncates it to 0
    # which would otherwise slip through as a zero-sized table.
    if max < 1:
        max = 1
    max = int(max)

    self._hash_owners = np.full(max, -1, np.intp)
    self._keys = np.empty(max, dtype=self.dtype)
    # A second array over the same memory as `_keys` with writing disabled.
    self._keys_readonly = np.frombuffer(self._keys, self.dtype)
    self._keys_readonly.flags.writeable = False

    hash_function = choose_hash(key_size)
    self._destroyed = False
    self._raw = slug.dll.HashTable(max, key_size, ptr(self._hash_owners),
                                   ptr(self._keys_readonly),
                                   hash=ctypes.cast(hash_function,
                                                    ctypes.c_void_p))
def contains(self, keys) -> Union[bool, np.ndarray]:
    """Test whether a key or several keys are present in the table.

    Args:
        keys:
            Elements to check for.
    Returns:
        Either true or false for each key in **keys**.

    This function is equivalent to but faster than
    :py:`table.get(keys) != -1`. To check only one key you may also use
    :py:`key in table`.

    """
    self._check_destroyed()
    keys, shape = self._norm_input_keys(keys)
    found = np.empty(shape, bool)
    slug.dll.HT_contains(self._raw._ptr, ptr(keys), ptr(found), found.size)
    if shape:
        return found
    # A single key was given so a plain scalar is returned.
    return found.item()
def get(self, keys, default=-1) -> np.ndarray:
    """Lookup indices of **keys** in :attr:`keys`.

    Arguments:
        keys:
            Elements to search for.
        default:
            Returned in place of a missing key. May be any object.
    Returns:
        The index/indices of **keys** in this table's :attr:`keys`. If a
        key is not there, returns **default** (:py:`-1` unless overridden)
        in its place.
    Raises:
        ValueError:
            If the :attr:`~numpy.ndarray.dtype` of **keys** doesn't match
            the :attr:`dtype` of this table.
        KeyError:
            If **default** is disabled (set to the ``_NO_DEFAULT``
            sentinel) and a key is missing.
        exceptions.HashTableDestroyed:
            If the :meth:`destroy` method has been previously called.

    """
    # Refuse to touch a destroyed table, as add() and contains() do and as
    # the Raises section above promises.
    self._check_destroyed()
    keys, shape = self._norm_input_keys(keys)
    out = np.empty(shape, np.intp)

    # This function forks out to several similar C functions depending on
    # how missing keys are to be handled.
    if default is self._NO_DEFAULT:
        # Default disabled - raise a key error if anything is missing.
        index = slug.dll.HT_gets_no_default(self._raw._ptr, ptr(keys),
                                            ptr(out), out.size)
        if index != -1:
            source, value = self._blame_key(index, keys, shape)
            raise KeyError(f"{source} = {value} is not in this table.")

    elif isinstance(default, numbers.Integral):
        if default == -1:
            # The default behaviour - use -1 to indicate missing keys.
            # This is already how the underlying C functions communicate
            # missing keys so nothing special needs to be done.
            slug.dll.HT_gets(self._raw._ptr, ptr(keys), ptr(out), out.size)
        else:
            # Not the default of -1 but still an integer default which can
            # be handled faster in C.
            slug.dll.HT_gets_default(self._raw._ptr, ptr(keys), ptr(out),
                                     out.size, default)

    else:
        # The slowest case: Return some non integer user defined default.
        slug.dll.HT_gets(self._raw._ptr, ptr(keys), ptr(out), out.size)
        out = np.where(out == -1, default, out)

    return out if shape else out.item()
def range_of(values):
    """
    A user friendly wrapper around the raw C ``range_of()`` function.
    """
    # Anything which isn't already an array of doubles gets converted to one.
    is_double_array = (isinstance(values, array.ArrayType)
                       and values.typecode == "d")
    if not is_double_array:
        values = array.array("d", values)

    # Two C doubles for the C function to write its results into.
    min_ = ctypes.c_double()
    max_ = ctypes.c_double()
    # `ctypes.byref()` hands them to C as writable pointers.
    min_pointer, max_pointer = ctypes.byref(min_), ctypes.byref(max_)
    slug.dll.range_of(ptr(values), len(values), min_pointer, max_pointer)

    # Unwrap the C doubles into native Python floats.
    return min_.value, max_.value
def dumps(self, ldtype=np.intc):
    """Serialise into a :class:`memoryview`.

    Args:
        ldtype (Union[numpy.dtype, Type[numpy.generic]]):
            Integer type for the row lengths.
    Returns:
        memoryview:
            A bytes-like binary blob.
    Raises:
        OverflowError:
            If a row is too long for its length to be written as an
            **ldtype** integer.

    The binary format is an undelimited sequence of ``(len(row), row)``
    pairs. A pure Python approximation would be::

        b"".join((len(row).tobytes() + row.tobytes() for row in ragged_array))

    The integer types of the row lengths can be controlled by the **ldtype**
    parameter. To change the type or byteorder of the data itself, cast to
    that type with :meth:`astype` then call this function.

    """
    ldtype = np.dtype(ldtype)

    # --- Work out how many bytes the output will need. ---

    # Bytes taken by the row contents. This is summed per row because
    # `self.flat.size` is only a safe shortcut after `self.repacked()`.
    row_lengths = self.ends - self.starts
    data_bytes = row_lengths.sum() * self.itemsize
    # Bytes taken by the row lengths themselves.
    header_bytes = len(self) * ldtype.itemsize

    # `numpy.empty()` seems to be one of the only ways to create a lump of
    # memory in Python without wasting time initialising it.
    out = np.empty(data_bytes + header_bytes, dtype=np.byte)

    failed_row = slug.dll.dump(self._c_struct._ptr, ptr(out),
                               _2_power(ldtype), _big_endian(ldtype))
    if failed_row == -1:
        return out.data
    raise OverflowError(
        f"Row {failed_row} with length {len(self[failed_row])} "
        f"is too long to write with an {ldtype.name} integer.")
def test_character_arrays_dont_need_null_termination(binary):
    """
    Check that converting a C string to Python works without a terminating
    null character.

    The docs lean on this rather heavy assumption. Should this test fail then
    a lot of the examples there are wrong.

    """
    text = str(copyright)
    assert "\x00" not in text

    if binary:
        char = ctypes.c_char
        text = text.encode()
        assert b"\x00" not in text
        buffer = ctypes.create_string_buffer(text)
    else:
        char = ctypes.c_wchar
        buffer = ctypes.create_unicode_buffer(text)

    for length in range(100):
        # Reinterpret the first `length` characters as an array which has no
        # room for a terminator.
        unterminated = (char * length).from_address(ptr(buffer))
        value = unterminated.value
        assert len(value) == length
        assert value == text[:length]
def __init__(self, flat, starts, ends=None, dtype=None, check=True):
    """Construct a :class:`RaggedArray` explicitly from a :attr:`flat`
    contents array and either row :attr:`starts` and :attr:`ends` arrays
    or, more commonly, a single *bounds* array.

    Args:
        flat:
            The contents of the array with no structure.
        starts:
            The index of **flat** where each row starts.
            Or if **ends** is unspecified, the start of each row and the
            end of the previous row.
        ends:
            The index of **flat** where each row ends.
        dtype:
            The :class:`numpy.dtype` of the array. Usually this can be
            inferred from **flat** and is therefore not required to be set
            explicitly. To indicate that multiple scalars should be
            considered as one item, use a :class:`tuple` dtype.
        check:
            If true (default), verify that **starts** and **ends** are
            valid (via :meth:`check`). Please only disable this if you need
            to construct a ragged array by first creating an uninitialised
            array and then populating it. Invalid arrays can lead to
            seg-faults.

    .. seealso::

        Explicit construction is rarely the most convenient way to build a
        :class:`RaggedArray`.
        See :meth:`from_nested` to construct from lists of lists.
        Or :meth:`from_lengths` to construct from flat data and row lengths.
        Or :meth:`group_by` to specify the row number explicitly for each
        item.

    Examples:

        Assuming the setup code::

            import numpy as np
            from rockhopper import RaggedArray

            flat = np.arange(10)

        ::

            >>> bounds = [0, 4, 7, 10]
            >>> RaggedArray(flat, bounds)
            RaggedArray.from_nested([
                [0, 1, 2, 3],
                [4, 5, 6],
                [7, 8, 9],
            ])

        The **bounds** need not start at the beginning and end at the end.
        Note however that the leading and trailing items in **flat** are not
        represented in the repr.
        ::

            >>> bounds = [2, 4, 4, 5, 9]
            >>> RaggedArray(flat, bounds)
            RaggedArray.from_nested([
                [2, 3],
                [],
                [4],
                [5, 6, 7, 8],
            ])

        To be able to have gaps between rows or overlapping rows set both
        **starts** and **ends**.

        ::

            >>> starts = [0, 3, 1]
            >>> ends = [6, 6, 5]
            >>> RaggedArray(flat, starts, ends)
            RaggedArray.from_nested([
                [0, 1, 2, 3, 4, 5],  # flat[0:6]
                [3, 4, 5],  # flat[3:6]
                [1, 2, 3, 4],  # flat[1:5]
            ])

        This form is typically not very useful but is given more to explain
        how the :class:`RaggedArray` works internally. Copy-less slicing
        uses this form heavily.

    """
    self.flat = np.asarray(flat, dtype=dtype, order="C")

    if len(self.flat) >= (1 << 31):  # pragma: 64bit
        # Lifting this limit would mean promoting all ints in the C code to
        # int64_t. An array this big needs at least 2GB of memory so it's
        # doubtful that this would be useful - although that could be wrong.
        raise RequestMeError(
            "Flat lengths >= 2^31 are disabled at compile time to save "
            "memory at runtime.")

    if ends is not None:
        # Independent starts and ends.
        self.starts = np.asarray(starts, dtype=np.intc, order="C")
        self.ends = np.asarray(ends, dtype=np.intc, order="C")
    else:
        # A single bounds array: each row ends where the next one starts.
        bounds = np.asarray(starts, dtype=np.intc, order="C")
        self.starts, self.ends = bounds[:-1], bounds[1:]

    self._c_struct = slug.dll.RaggedArray(ptr(self.flat), self.itemsize,
                                          len(self), ptr(self.starts),
                                          ptr(self.ends))
    if check:
        self.check()
def loads(cls, bin, dtype, rows=-1,
          ldtype=np.intc) -> Tuple['RaggedArray', int]:
    """Deserialize a ragged array. This is the reciprocal of :meth:`dumps`.

    Args:
        bin (bytes):
            Raw data to unpack.
        dtype (Union[numpy.dtype, Type[numpy.generic]]):
            Data type of the row contents in **bin**.
        rows (int):
            Number of rows to parse. Defaults to :py:`-1` for unknown.
        ldtype (Union[numpy.dtype, Type[numpy.generic]]):
            Integer type of the row lengths in **bin**.
    Returns:
        RaggedArray:
            The deserialised ragged array.
        int:
            The number of bytes from **bin** consumed.
    Raises:
        ValueError:
            If **bin** ends prematurely or in the middle of a row. This is
            indicative of either data corruption or, more likely, muddling
            of dtypes.

    """
    dtype, ldtype = np.dtype(dtype), np.dtype(ldtype)

    # The number of rows must be known before the new ragged array can be
    # created and filled in.
    if rows == -1:
        # Unknown - count them.
        rows = slug.dll.count_rows(ptr(bin), len(bin), _2_power(ldtype),
                                   _big_endian(ldtype), dtype.itemsize)
        # A count of -1 is how `count_rows()` reports an error.
        if rows == -1:
            raise ValueError(
                "Raw `bin` data ended mid way through a row. Either this "
                "data is corrupt or the dtype(s) given are incorrect.")
        # Start over now that `rows` is known.
        return cls.loads(bin, dtype, rows, ldtype)

    # Whatever isn't used for row lengths is available to the flat data.
    free = len(bin) - rows * ldtype.itemsize
    item_count = free // dtype.itemsize
    if item_count < 0:
        raise ValueError(
            f"With `bin` of length {len(bin)}, {rows} rows of "
            f"{ldtype.itemsize} byte lengths leaves {free} bytes "
            f"for the flat data. Perhaps your data types are wrong?")

    # Uninitialised storage for C to write into. Validation is skipped
    # because the bounds hold no meaningful values yet.
    flat = np.empty(item_count, dtype=dtype)
    bounds = np.empty(rows + 1, np.intc)
    self = cls(flat, bounds, check=False)

    bin_consumed = ctypes.c_size_t(0)
    _rows = slug.dll.load(self._c_struct._ptr, ptr(bin), len(bin),
                          ctypes.byref(bin_consumed), rows,
                          _2_power(ldtype), _big_endian(ldtype))
    if _rows >= rows:
        return self, bin_consumed.value
    raise ValueError(
        f"Raw `bin` data ended too soon. "
        f"Only {_rows} out of the requested {rows} rows were read. "
        f"Either this data is corrupt or the dtype(s) given are "
        "incorrect.")
def sum_(arr):
    """Wrapper for the ``sum()`` C function using numpy arrays."""
    # Coerce into a C contiguous array of doubles.
    arr = np.asarray(arr, dtype=np.double, order="C")
    # Let C add it up.
    total = slug.dll.sum(ptr(arr), len(arr))
    return total
def _add(self, key):
    """Pass a single **key** directly to the C ``HT_add()`` function and
    return its result unmodified."""
    table, key_pointer = self._raw._ptr, ptr(key)
    return slug.dll.HT_add(table, key_pointer)
def _get(self, key):
    """Pass a single **key** directly to the C ``HT_get()`` function and
    return its result unmodified."""
    table, key_pointer = self._raw._ptr, ptr(key)
    return slug.dll.HT_get(table, key_pointer)
def test_repr():
    """The repr of a pointer contains, but is more than, its integer value."""
    pointer = ptr(b"")
    address_repr = repr(int(pointer))
    pointer_repr = repr(pointer)
    assert address_repr in pointer_repr
    assert address_repr != pointer_repr
def test_ptr_leaks():
    """Run the memory leak check on both pointer functions."""
    for to_pointer in (ptr, nc_ptr):
        leaks(lambda: to_pointer(bytes(MEM_BLOCK_SIZE)), MEM_LEAK_TOL)
def test_ptr(ptr):
    """Check the addresses and the type checking of a pointer function."""
    a = array.array("i", range(100))

    # The array itself and any memoryview of it share one address.
    sources = [a, memoryview(a)]
    if sys.version_info >= (3, 8):
        sources.append(memoryview(a).toreadonly())
    for source in sources:
        assert a.buffer_info() == (ptr(source), len(a))

    # bytes() and bytearray() copy their input so their addresses differ from
    # that of `a`. Taking pointers to them should at least still work.
    for copier in (bytes, bytearray):
        ptr(copier(a))

    # Anything which isn't bytes-like is refused.
    for not_a_buffer in ("not bytes-like", 12):
        with pytest.raises(TypeError):
            ptr(not_a_buffer)
from array import array
from cslug import CSlug, ptr, anchor

# Wrap the C source file ``arrays-demo.c`` and build it.
slug = CSlug(anchor("arrays-demo.c"))
slug.make()

# Raw usage: a pointer to an array of doubles and its length are passed to C.
assert slug.dll.sum(ptr(array("d", [10, 11, 12])), 3) == 33.0


def sum_(arr):
    """Wrapper for the ``sum()`` function from ``arrays-demo.c``."""
    # If not the correct type:
    if not (isinstance(arr, array) and arr.typecode == "d"):
        # Make it the correct type.
        arr = array("d", arr)
    # Run the C function.
    return slug.dll.sum(ptr(arr), len(arr))


# The wrapper accepts any iterable of numbers.
assert sum_(range(10)) == 45

from cslug.misc import array_typecode

# The ``array`` type code to use for the C type ``int32_t``.
int32_t = array_typecode("int32_t")


def cumsum(x):
    # Some form of type check.
    if not (isinstance(x, array) and x.typecode == int32_t):
        x = array(int32_t, x)