def _long_impl(val):
    """Hash an integer payload following the ``hash_long`` algorithm.

    ``val`` is assumed to come from a long int representation and to be a
    ``uint64_t``.  It is therefore split into ``_PyLong_SHIFT``-sized chunks
    held in an unsigned, hash-wide type (64 bits is the widest integer Numba
    can handle), and the chunks are folded most-significant first.

    Returns the accumulated value converted with ``_Py_hash_t``.
    """
    # Mask that keeps only the low ``_PyLong_SHIFT`` bits.
    unused_bits = 32 - _PyLong_SHIFT
    digit_mask = (~types.uint32(0x0)) >> unused_bits

    # A 64-bit value needs at most 3 x 30-bit digits, or 5 x 15-bit digits
    # on 32-bit platforms.
    n_digits = (64 // _PyLong_SHIFT) + 1

    rotate_by = _PyHASH_BITS - _PyLong_SHIFT
    acc = 0
    for pos in range(n_digits - 1, -1, -1):
        # Shift the accumulator up by one digit, wrapping the bits that fall
        # out of the hash width back into the low end.
        acc = ((acc << _PyLong_SHIFT) & _PyHASH_MODULUS) | (acc >> rotate_by)
        # Shift-and-mask extracts the `ob_digit` at position ``pos``.
        acc += types.uint32((val >> pos * _PyLong_SHIFT) & digit_mask)
        if acc >= _PyHASH_MODULUS:
            acc -= _PyHASH_MODULUS
    return _Py_hash_t(acc)
def grind_seeds(matches, results, seedoffs, seedstep) -> int:
    """Scan seed positions and count those accepted by ``calc_all_from_seed``.

    A 0x80-entry window of outputs from the linear congruential generator
    ``s -> (s * 1103515245 + 12345) & 0x7FFFFFFF`` (starting at
    ``BASE_SEED``) is maintained and handed to ``calc_all_from_seed`` for
    every position ``seedoffs, seedoffs + seedstep, ...`` below ``1 << 31``.

    Parameters
    ----------
    matches, results
        Passed through unchanged to ``calc_all_from_seed``.
    seedoffs
        First position to test; the window is advanced this many steps
        before the scan begins.
    seedstep
        Distance between tested positions.

    Returns
    -------
    int
        Number of positions for which ``calc_all_from_seed`` returned a true
        value.  The scan stops early once this reaches 100.
    """
    window = numpy.full(0x80, uint32(0))
    shuffle_buf = numpy.full(6, uint32(0))
    hits = 0
    state = BASE_SEED

    # Prime the window; both halves receive the same generator output.
    for k in range(0x40):
        window[k] = state
        window[k + 0x40] = state
        state = ((state * 1103515245) + 12345) & 0x7FFFFFFF

    # Advance the window up to the first position to be tested.
    for k in range(seedoffs):
        window[(k + 0x40) & 0x7F] = state
        window[(k + 0x80) & 0x7F] = state
        state = ((state * 1103515245) + 12345) & 0x7FFFFFFF

    for pos in range(seedoffs, 1 << 31, seedstep):
        if calc_all_from_seed(window, shuffle_buf, matches, results, hits, pos):
            hits += 1

        # Slide the window forward to the next tested position.
        for k in range(seedstep):
            window[(pos + k + 0x40) & 0x7F] = state
            window[(pos + k + 0x80) & 0x7F] = state
            state = ((state * 1103515245) + 12345) & 0x7FFFFFFF

        # Report progress (as a percentage of the 2**31 range) each time the
        # covered distance is a multiple of 0x100000.
        covered = pos - seedoffs + seedstep
        if (covered & 0xFFFFF) == 0:
            print(covered * 100.0 / (1 << 31))

        if hits >= 100:
            return hits
    return hits
def euclidean_map_kernel(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover.
    """CUDA kernel for the map step of the pairwise euclidean distance.

    Each thread of the 2-D grid handles one ``(x[i1], y[i2])`` pair and
    delegates to ``_euclidean_distance_map``.

    Parameters
    ----------
    x
        [array-like, shape: (m, n)]
    y
        [array-like, shape: (p, n)]
    out
        [array-like, shape: (m, p, 1)]
        The zeros array of shape (m, p, 1) that receives the result.

    Returns
    -------
    None.  The value computed for the pair ``(i1, i2)`` is written into
    ``out[i1][i2]`` by ``_euclidean_distance_map``.
    """
    # Every variable is explicitly cast to uint32: aggressive typecasting is
    # done to improve performance.

    # Unique 2-D index of this thread in the whole grid.
    row = types.uint32(cuda.grid(2)[types.uint32(0)])
    col = types.uint32(cuda.grid(2)[types.uint32(1)])

    n_rows = types.uint32(out.shape[types.uint32(0)])
    n_cols = types.uint32(out.shape[types.uint32(1)])

    # More threads than needed may be spun up, so only threads that land
    # inside the valid output boundary do any work.
    if row < n_rows and col < n_cols:
        _euclidean_distance_map(x[row], y[col], out[row][col])
def _euclidean_distance_map(a: ArrayLike, b: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover.
    """Device-side helper for the map step of the euclidean distance.

    Parameters
    ----------
    a
        [array-like, shape: (1, n)]
    b
        [array-like, shape: (1, n)]
    out
        [array-like, shape: (1)]
        The output array that receives the result.

    Returns
    -------
    None.  ``out[0]`` is set to the sum of squared differences of the
    corresponding elements of ``a`` and ``b``, taken only over positions
    where both elements are non-negative.
    """
    lower_bound = types.uint32(0)
    length = types.uint32(a.shape[types.uint32(0)])
    acc = types.float32(0)

    k = types.uint32(0)
    while k < length:
        # Positions where either value is negative are left out of the sum
        # (presumably negative values mark missing data -- TODO confirm).
        if a[k] >= lower_bound and b[k] >= lower_bound:
            acc += (a[k] - b[k]) ** types.uint32(2)
        k = types.uint32(k + types.uint32(1))

    out[0] = acc
def _boss_distance_dict(first, second, best_dist):
    """Sum of squared count differences over the words of ``first``.

    For every ``word`` in ``first`` the count in ``second`` (0 when the word
    is absent) is subtracted from the count in ``first`` and the square is
    accumulated.  Words that appear only in ``second`` do not contribute.

    As soon as the running total exceeds ``best_dist`` the computation is
    abandoned and ``0x7FFFFFFFFFFFFFFF`` is returned instead.
    """
    total = 0
    for word, count_first in first.items():
        diff = count_first - second.get(word, types.uint32(0))
        total += diff * diff
        # Early abandon: this candidate can no longer beat the best distance.
        if total > best_dist:
            return 0x7FFFFFFFFFFFFFFF
    return total
def test_unsigned_access(self):
    """Exercise the list operations with ``uint32`` indices and values.

    Covers append, getitem, setitem, index, delitem (both spellings) and
    pop on an ``int32`` list.
    """
    typed_list = List.empty_list(int32)

    idx0 = types.uint32(0)
    idx1 = types.uint32(1)
    idx2 = types.uint32(2)
    positions = (idx0, idx1, idx2)

    # insert
    for value in (10, 11, 12):
        typed_list.append(types.uint32(value))
    self.assertEqual(len(typed_list), 3)

    # getitem
    for pos, expected in zip(positions, (10, 11, 12)):
        self.assertEqual(typed_list[pos], expected)

    # setitem
    for pos, value in zip(positions, (123, 456, 789)):
        typed_list[pos] = value
    for pos, expected in zip(positions, (123, 456, 789)):
        self.assertEqual(typed_list[pos], expected)

    # index
    needles = (types.uint32(123), types.uint32(456), types.uint32(789))
    for expected_pos, needle in enumerate(needles):
        self.assertEqual(typed_list.index(needle), expected_pos)

    # delitem, through the dunder call and through the ``del`` statement
    typed_list.__delitem__(idx2)
    del typed_list[idx1]
    self.assertEqual(len(typed_list), 1)
    self.assertEqual(typed_list[idx0], 123)

    # pop
    for value in (2, 3, 4):
        typed_list.append(value)
    self.assertEqual(len(typed_list), 4)
    self.assertEqual(typed_list.pop(), 4)
    for pos, expected in ((idx2, 3), (idx1, 2), (idx0, 123)):
        self.assertEqual(typed_list.pop(pos), expected)
def correlation_map_kernel(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover.
    """CUDA kernel that runs ``_correlation`` on one ``(x[i1], y[i2])`` pair.

    The thread's 2-D grid position selects row ``i1`` of ``x`` and row
    ``i2`` of ``y``; the result is written into ``out[i1][i2]``.  Nothing is
    returned.
    """
    # Unique 2-D index of this thread in the whole grid, cast to uint32.
    row = types.uint32(cuda.grid(2)[types.uint32(0)])
    col = types.uint32(cuda.grid(2)[types.uint32(1)])

    n_rows = types.uint32(out.shape[types.uint32(0)])
    n_cols = types.uint32(out.shape[types.uint32(1)])

    # Threads that fall outside the valid output boundary do nothing.
    if row < n_rows and col < n_cols:
        _correlation(x[row], y[col], out[row][col])
def getitem_str_offset(typingctx, str_arr_typ, ind_t):
    """Typing/codegen pair that reads one entry of a string array's offsets.

    Returns a ``(signature, codegen)`` tuple.  The signature is
    ``uint32(string_array_type, ind_t)``; the generated code views the
    array's ``offsets`` pointer as a pointer to 32-bit integers and loads
    the element at the given index.
    """
    def codegen(context, builder, sig, args):
        arr_value, index_value = args
        payload = context.make_helper(builder, string_array_type, arr_value)
        int32_ptr_type = lir.IntType(32).as_pointer()
        offsets_ptr = builder.bitcast(payload.offsets, int32_ptr_type)
        entry_ptr = builder.gep(offsets_ptr, [index_value])
        return builder.load(entry_ptr)

    signature = types.uint32(string_array_type, ind_t)
    return signature, codegen
def _correlation(x: ArrayLike, y: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover.
    """Accumulate the sums needed for a correlation between ``x`` and ``y``.

    Only positions where both ``x[i]`` and ``y[i]`` are non-negative are
    used (presumably negative values mark missing data -- TODO confirm).
    On return ``out`` holds::

        out[0] = sum(x)      out[3] = sum(y * y)
        out[1] = sum(y)      out[4] = sum(x * y)
        out[2] = sum(x * x)  out[5] = number of positions used
    """
    # Note: accumulating in local variables and storing only the final
    # values in the array made this significantly faster.
    # Aggressively making all variables explicitly typed makes it more
    # performant by a factor of ~2-3x.
    sum_x = types.float32(0)
    sum_y = types.float32(0)
    sum_xx = types.float32(0)
    sum_yy = types.float32(0)
    sum_xy = types.float32(0)
    count = types.float32(0)

    lower_bound = types.uint32(0)
    length = types.uint32(x.shape[types.uint32(0)])

    k = types.uint32(0)
    while k < length:
        x_k = x[k]
        y_k = y[k]
        if x_k >= lower_bound and y_k >= lower_bound:
            sum_x += x_k
            sum_y += y_k
            sum_xx += x_k * x_k
            sum_yy += y_k * y_k
            sum_xy += x_k * y_k
            count += 1
        k = types.uint32(k + types.uint32(1))

    out[0] = sum_x
    out[1] = sum_y
    out[2] = sum_xx
    out[3] = sum_yy
    out[4] = sum_xy
    out[5] = count
def param_lookup(param_dict, default_val, dtype):
    """
    Build the ufunc ``lookup(channel, val)`` for a fixed parameter table.

    The returned ufunc writes, for each channel, the value stored under that
    channel in ``param_dict``; channels without an entry get ``default_val``.
    Keys are converted to ``uint32`` and values to the numba type matching
    ``dtype`` once, up front, so no casting is needed during the lookup.
    """
    out_type = from_dtype(np.dtype(dtype))

    # Convert keys and values ahead of time to avoid any casting later.
    typed_table = {
        types.uint32(key): out_type(value)
        for key, value in param_dict.items()
    }
    typed_default = out_type(default_val)

    signature = "void(uint32, {}[:])".format(out_type.name)

    @guvectorize([signature], "()->()", forceobj=True)
    def lookup(channel, val):
        """Look up a value for the provided channel from a dictionary provided at compile time"""
        val[0] = typed_table.get(channel, typed_default)

    return lookup
def next_32(self):
    """Build and return a C callback producing 32-bit outputs.

    The callback receives a pointer to ``uint64`` values and views it as a
    two-element state.  A draw from ``splitmix_next`` is split in two: its
    low 32 bits are returned immediately, while the draw itself (with its
    lowest bit set as a "pending" flag) is parked in ``state[1]``.  The next
    call sees the flag, returns the upper 32 bits of the parked value and
    clears the slot.

    The compiled callback is also stored on ``self._next_32`` so that a
    reference to it is kept.
    """
    @cfunc(types.uint32(types.CPointer(types.uint64)))
    def next_32(st):
        state = carray(st, (2, ), dtype=np.uint64)

        # A set low bit means the upper half of an earlier draw is pending.
        if state[1] & np.uint64(0x1):
            pending = state[1] >> np.uint64(32)
            state[1] = 0
            return pending

        draw = splitmix_next(state)
        state[1] = draw | np.uint64(0x1)
        return draw & 0xFFFFFFFF

    # Ensure a reference is held
    self._next_32 = next_32
    return next_32
def deref_uint16(typingctx, data, offset):
    """Return the ``(signature, codegen)`` pair for a 16-bit dereference.

    The signature is ``uint32(voidptr, intp)`` and the code generator is
    ``make_deref_codegen(16)``.
    """
    return types.uint32(types.voidptr, types.intp), make_deref_codegen(16)
def _pick_ascii(is_ascii1, is_ascii2):
    """Return ``uint32(1)`` when both flags equal 1, otherwise ``uint32(0)``."""
    both_ascii = is_ascii1 == 1 and is_ascii2 == 1
    return types.uint32(1) if both_ascii else types.uint32(0)
def node_update_count(tree, idx_node, idx_sample):
    """Increment the count of the sample's label for node ``idx_node``.

    The label is read from ``tree.samples.labels[idx_sample]`` and cast to
    ``uint32``; the matching entry of ``tree.nodes.counts`` is increased by
    one.
    """
    # TODO: Don't do it twice...
    label = uint32(tree.samples.labels[idx_sample])
    counts = tree.nodes.counts
    counts[idx_node, label] += 1
def deref_uint32(typingctx, data, offset):
    """Return the ``(signature, codegen)`` pair for a 32-bit dereference.

    The signature is ``uint32(voidptr, intp)`` and the code generator is
    ``make_deref_codegen(32)``.
    """
    return types.uint32(types.voidptr, types.intp), make_deref_codegen(32)
def float_to_unsigned(x):
    """Convert ``x`` with ``types.uint32`` and return the result."""
    as_unsigned = types.uint32(x)
    return as_unsigned
node_update_downwards, node_split, node_update_depth, node_update_weight_tree, ) from .tree import TreeClassifier from .utils import sample_discrete # TODO: an overall task is to minimize the O(#n_features) complexity: pass few # times over the features # TODO: write all the docstrings @njit(uint32(TreeClassifier.class_type.instance_type, uint32)) def tree_go_downwards(tree, idx_sample): # We update the nodes along the path which leads to the leaf containing # x_t. For each node on the path, we consider the possibility of # splitting it, following the Mondrian process definition. # Index of the root is 0 idx_current_node = 0 x_t = tree.samples.features[idx_sample] if tree.iteration == 0: # If it's the first iteration, we just put x_t in the range of root node_update_downwards(tree, idx_current_node, idx_sample, False) return idx_current_node else: while True: # If it's not the first iteration (otherwise the current node
"--arsonist", type=int, default=None, help="Force an arsonist location from 1 to 6.", ) parser.add_argument( "--case", type=lambda x: briefcase_words.index(x), default=None, help="Force a 4-letter briefcase word.", ) args = parser.parse_args() matches = numpy.full(9, uint32(0x10000)) has_constraint = False clock_str = args.clock clock = None if clock_str is not None: clock_h_str, _, clock_m_str, = clock_str.partition(":") clock_h = int(clock_h_str) clock_m = int(clock_m_str) assert 0 <= clock_h < 12 assert 0 <= clock_m < 60 clock = 60 * clock_h + clock_m matches[0] = clock has_constraint = True
new = np.ones((size, ), dtype=arr.dtype) else: new = np.zeros((size, ), dtype=arr.dtype) new[:keep] = arr[:keep] return new elif arr.ndim == 2: _, n_cols = arr.shape new = np.zeros((size, n_cols), dtype=arr.dtype) new[:keep] = arr[:keep] return new else: raise ValueError("resize_array can resize only 1D and 2D arrays") # Sadly there is no function to sample for a discrete distribution in numba @njit(uint32(float32[::1])) def sample_discrete(distribution): """Samples according to the given discrete distribution. Parameters ---------- distribution : `np.array', shape=(size,), dtype='float32' The discrete distribution we want to sample from. This must contain non-negative entries that sum to one. Returns ------- output : `uint32` Output sampled in {0, 1, 2, distribution.size} according to the given distribution
from .base_general import basis_general
from ._basis_general_core import user_core_wrap
import numpy as _np
from numba import cfunc, types, njit

# CFunc moved between numba releases; try the old location first.
try:
    from numba.ccallback import CFunc  # numba < 0.49.0
except ModuleNotFoundError:
    from numba.core.ccallback import CFunc  # numba >= 0.49.0

# Numba signatures, written as return_type(argument_types...).  Each one is
# defined twice: a *_32 variant built on uint32 and a *_64 variant built on
# uint64.
# NOTE(review): presumably these are the signatures of the compiled callbacks
# handed to user_core_wrap -- confirm against the code that uses them.
map_sig_32 = types.uint32(types.uint32, types.intc, types.CPointer(types.int8), types.CPointer(types.uint32))
map_sig_64 = types.uint64(types.uint64, types.intc, types.CPointer(types.int8), types.CPointer(types.uint64))
next_state_sig_32 = types.uint32(types.uint32, types.uint32, types.uint32, types.CPointer(types.uint32))
next_state_sig_64 = types.uint64(types.uint64, types.uint64, types.uint64, types.CPointer(types.uint64))
pre_check_state_sig_32 = types.uint32(types.uint32, types.uint32, types.CPointer(types.uint32))
pre_check_state_sig_64 = types.uint64(types.uint64, types.uint64, types.CPointer(types.uint64))

# C-struct record types pairing a complex128 `matrix_ele` field with a `state`
# field of the matching unsigned width.
op_results_32 = types.Record.make_c_struct([
    ('matrix_ele', types.complex128),
    ('state', types.uint32),
])
op_results_64 = types.Record.make_c_struct([('matrix_ele', types.complex128), ('state', types.uint64)])
def box_index(x, y, step, nb_x):
    """Return the flat k_box index for each value.

    The column index is ``(x % 360) // step`` and the row index is
    ``(y + 90) // step``; they are combined as ``column + nb_x * row`` and
    the result is cast to ``uint32``.
    """
    i_box = (x % 360) // step
    j_box = (y + 90) // step
    return numba_types.uint32(i_box + nb_x * j_box)
def deref_uint32(typingctx, data, offset):
    """Return the ``(signature, codegen)`` pair for a 32-bit dereference.

    The signature is ``uint32(data, intp)``, i.e. the first argument keeps
    whatever type was passed in as ``data``; the code generator is
    ``make_deref_codegen(32)``.
    """
    return types.uint32(data, types.intp), make_deref_codegen(32)
def box_indexes(x, y, step):
    """Return the (i_box, j_box) index pair for each value.

    ``i_box`` is ``(x % 360) // step`` and ``j_box`` is ``(y + 90) // step``,
    each cast to ``uint32``.
    """
    i_box = numba_types.uint32((x % 360) // step)
    j_box = numba_types.uint32((y + 90) // step)
    return i_box, j_box
def _histogram_intersection_dict(first, second):
    """Histogram intersection of two word-count dictionaries.

    For every word of ``first`` the smaller of its count in ``first`` and
    its count in ``second`` (0 when absent) is added to the result.
    """
    overlap = 0
    for word, count_first in first.items():
        count_second = second.get(word, types.uint32(0))
        overlap += min(count_first, count_second)
    return overlap
return ncompiler GrB_UnaryOp = OpContainer() GrB_BinaryOp = OpContainer() ################################## # Useful collections of signatures ################################## _unary_bool = [nt.boolean(nt.boolean)] _unary_int = [ nt.uint8(nt.uint8), nt.int8(nt.int8), nt.uint16(nt.uint16), nt.int16(nt.int16), nt.uint32(nt.uint32), nt.int32(nt.int32), nt.uint64(nt.uint64), nt.int64(nt.int64) ] _unary_float = [nt.float32(nt.float32), nt.float64(nt.float64)] _unary_all = _unary_bool + _unary_int + _unary_float _binary_bool = [nt.boolean(nt.boolean, nt.boolean)] _binary_int = [ nt.uint8(nt.uint8, nt.uint8), nt.int8(nt.int8, nt.int8), nt.uint16(nt.uint16, nt.uint16), nt.int16(nt.int16, nt.int16), nt.uint32(nt.uint32, nt.uint32), nt.int32(nt.int32, nt.int32),