def _randint_arg_check(low, high, endpoint, lower_bound, upper_bound):
    """
    Check that low and high are within the bounds
    for the given datatype.
    """
    if low < lower_bound:
        raise ValueError("low is out of bounds")

    # Cast high to uint64 before subtracting 1, so that it is not
    # accidentally cast to int64/int32 while checking bounds, which
    # could overflow.
    if high > 0:
        high = uint64(high)
        if not endpoint:
            high -= uint64(1)
        upper_bound = uint64(upper_bound)
        if low > 0:
            low = uint64(low)
        if high > upper_bound:
            raise ValueError("high is out of bounds")
        if low > high:  # -1 already subtracted, closed interval
            raise ValueError("low is greater than high in given interval")
    else:
        if high > upper_bound:
            raise ValueError("high is out of bounds")
        if low > high:  # -1 already subtracted, closed interval
            raise ValueError("low is greater than high in given interval")

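# Hypothetical invocation sketch for the checker above (the int32 bounds and
# the Generator-style call site are assumptions; `uint64` is taken to be
# numpy.uint64, matching the casts in the function):
def _demo_randint_arg_check():
    # half-open draw from [0, 10) against int32 limits: in bounds, no error
    _randint_arg_check(0, 10, False, -2**31, 2**31 - 1)
    try:
        # high - 1 == 2**31 exceeds the int32 maximum
        _randint_arg_check(0, 2**31 + 1, False, -2**31, 2**31 - 1)
    except ValueError as e:
        print(e)  # high is out of bounds
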
def pack_n_nb(spaced, n, chunk_bits):
    """
    Inverse of separate_n_nb: collects every n-th bit of `spaced` back
    into a contiguous chunk_bits-wide value.
    """
    a = 0
    for i in range(chunk_bits):
        bit_idx = nb.uint64(i * n)
        bit_to_set = nb.uint64(1) << bit_idx
        a |= (spaced & bit_to_set) >> (bit_idx - i)  # move bit i*n down to bit i
    return a

def bounded_lemire_uint64(bitgen, rng):
    """
    Generates a random unsigned 64 bit integer bounded
    within a given interval using Lemire's rejection.
    """
    rng_excl = uint64(rng) + uint64(1)  # would wrap to 0 if rng were UINT64_MAX

    assert (rng != 0xFFFFFFFFFFFFFFFF)

    x = next_uint64(bitgen)

    # Low 64 bits of the 128-bit product x * rng_excl (uint64 wraps).
    leftover = uint64(x) * uint64(rng_excl)

    if (leftover < rng_excl):
        # `rng_excl` is a simple upper bound for `threshold`.
        threshold = (UINT64_MAX - rng) % rng_excl

        while (leftover < threshold):
            x = next_uint64(bitgen)
            leftover = uint64(x) * uint64(rng_excl)

    # High 64 bits of the 128-bit product, computed with 32-bit limbs
    # since there is no native 128-bit integer type.
    x0 = x & uint64(0xFFFFFFFF)
    x1 = x >> 32
    rng_excl0 = rng_excl & uint64(0xFFFFFFFF)
    rng_excl1 = rng_excl >> 32
    w0 = x0 * rng_excl0
    t = x1 * rng_excl0 + (w0 >> 32)
    w1 = t & uint64(0xFFFFFFFF)
    w2 = t >> 32
    w1 += x0 * rng_excl1
    m1 = x1 * rng_excl1 + w2 + (w1 >> 32)

    return m1

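# Pure-Python (arbitrary-precision) check of the 32-bit-limb arithmetic
# above: it reproduces the high 64 bits of the 128-bit product,
# (x * rng_excl) >> 64, for any pair of 64-bit operands.
def _mulhi64_ref(x, y):
    x0, x1 = x & 0xFFFFFFFF, x >> 32
    y0, y1 = y & 0xFFFFFFFF, y >> 32
    w0 = x0 * y0
    t = x1 * y0 + (w0 >> 32)
    w1 = (t & 0xFFFFFFFF) + x0 * y1
    w2 = t >> 32
    return x1 * y1 + w2 + (w1 >> 32)

assert _mulhi64_ref(0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF) == \
    (0xDEADBEEFCAFEBABE * 0x0123456789ABCDEF) >> 64
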
def check_primes_cuda(p):
    # Returns p if p is prime, 0 otherwise.
    if p < 10:
        if p in [4, 6, 8, 9]:
            return numba.uint64(0)
    else:
        # Trial division up to and including int(sqrt(p)); the +1 is needed
        # so perfect squares (e.g. 25) are caught, and range requires an int.
        for i in range(2, int(p**0.5) + 1):
            if (p % i) == 0:
                return numba.uint64(0)
    return p

def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
    n = states.shape[0]
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    if n >= 1:
        init_xoroshiro128p_state(states, 0, seed)

        # advance to starting subsequence number
        for _ in range(subsequence_start):
            xoroshiro128p_jump(states, 0)

        # populate the rest of the array
        for i in range(1, n):
            states[i] = states[i - 1]  # take state of previous generator
            xoroshiro128p_jump(states, i)  # and jump forward 2**64 steps

def encode_single_coord(coord, chunk_bits):
    """
    Encodes a coordinate in ℝⁿ into ℝ¹ using Morton ordering, assuming that
    the size of each dimension is 0..2^{chunk_bits}

    >>> morton_offsets = set()
    >>> for i in range(16):
    ...     for j in range(16):
    ...         morton_offsets.add(encode_single_coord(
    ...             np.array([i, j], dtype=np.uint8),
    ...             4))
    >>> morton_offsets == {i for i in range(256)}
    True

    Here we demonstrate that there is a mapping from coordinates in a
    16x16 square to the numbers 0..255

    :param coord: coordinate to encode, numba array of type uint8, size <= 8
    :param chunk_bits: number of bits per coordinate dimension
    :return: Morton-coded offset of type uint64
    """
    assert coord.shape[0] <= 8
    x = nb.uint64(0)
    for i in range(coord.shape[0]):
        # The spaced-out coordinates never share set bits, so += interleaves
        # them exactly like |= would.
        x += separate_n_nb(coord[i], coord.shape[0], chunk_bits) << i
    return x

def atomic_sub_double_3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float64)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.sub(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]

def H(psi_1, psi_2, H_ising, pg, J, h):
    for i in prange(psi_1.size):
        b = uint64(1)  # flipping this bit gives the column index
        ME = (1 + pg + J*H_ising[i])*psi_1[i]
        for j in range(N):
            ME += -h*psi_1[i ^ b]  # x-field action
            b <<= 1  # move the flipping bit to the next spin
        psi_2[i] = ME

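# Standalone sketch (plain numpy, a 2-spin toy) of why the i ^ b indexing in
# the Hamiltonian loops in this file implements the transverse field:
# sigma^x on spin j couples basis state i with basis state i ^ (1 << j),
# and b tracks 1 << j through the inner loop.
import numpy as np

N_demo = 2
Ns_demo = 1 << N_demo
H_x = np.zeros((Ns_demo, Ns_demo))
for i in range(Ns_demo):
    b = 1
    for j in range(N_demo):
        H_x[i, i ^ b] = 1.0  # off-diagonal element of sum_j sigma^x_j
        b <<= 1
# H_x now equals the matrix of sigma^x_0 + sigma^x_1 in the computational basis.
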
def atomic_add3(ary):
    tx = roc.get_local_id(0)
    ty = roc.get_local_id(1)
    sm = roc.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    roc.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]

def atomic_add_float_3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]

def atomic_add3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), uint32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]

def atomic_add3(ary):
    tx = hsa.get_local_id(0)
    ty = hsa.get_local_id(1)
    sm = hsa.shared.array((4, 8), numba.uint32)
    sm[tx, ty] = ary[tx, ty]
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    hsa.atomic.add(sm, (tx, numba.uint64(ty)), 1)
    hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
    ary[tx, ty] = sm[tx, ty]

def xoroshiro128p_next(states, index):
    '''Return the next random uint64 and advance the RNG in states[index].

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    '''
    index = int64(index)
    s0 = states[index]['s0']
    s1 = states[index]['s1']

    result = s0 + s1

    s1 ^= s0
    states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
    states[index]['s1'] = uint64(rotl(s1, uint32(36)))

    return result

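# Usage sketch of numba's public CUDA RNG API (numba.cuda.random), which
# ships xoroshiro128p helpers like the ones in this file; the grid and array
# sizes are arbitrary and the launch lines assume a CUDA-capable device.
from numba import cuda
from numba.cuda.random import (create_xoroshiro128p_states,
                               xoroshiro128p_uniform_float32)


@cuda.jit
def fill_uniform(rng_states, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = xoroshiro128p_uniform_float32(rng_states, i)

# rng_states = create_xoroshiro128p_states(256, seed=1234)
# out = cuda.device_array(256, dtype=np.float32)  # np = numpy, imported elsewhere
# fill_uniform[1, 256](rng_states, out)
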
def H_cuda(yin, yout, H_ising, pg, J, h):
    s = cuda.grid(1)
    if s < yin.size:
        b = uint64(1)  # flipping this bit gives the column index
        ME = (1 + pg + J * H_ising[s])*yin[s]
        for j in range(N):
            ME += -h*yin[s ^ b]  # x-field action
            b <<= 1  # move the flipping bit to the next spin
        yout[s] = ME

def init_xoroshiro128p_states_kernel(states, seed, subsequence_start):
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    # Only run this with a single thread and block
    n = states.shape[0]
    if n < 1:
        return  # assuming at least 1 state going forward

    init_xoroshiro128p_state(states, 0, seed)

    # advance to starting subsequence number
    for _ in range(subsequence_start):
        xoroshiro128p_jump(states, 0)

    # populate the rest of the array
    for i in range(1, n):
        states[i] = states[i - 1]  # take state of previous generator
        xoroshiro128p_jump(states, i)  # and jump forward 2**64 steps

def _2d_x_field(yin, yout, h):
    # adds to yout.
    N = len(h)
    Ns = (1 << N)
    for i in range(Ns):
        b = uint64(1)  # flipping this bit gives the column index
        ME = 0
        for j in range(N):
            ME += h[j] * yin[i ^ b]  # x-field action
            b <<= 1  # move the flipping bit to the next spin
        yout[i] += ME

def block_analysis_jit(fd, xp, yp, vmin: np.float64, vmax: np.float64,
                       vdims: np.ndarray, bdims: np.ndarray,
                       bcount: np.ndarray, blocks: np.ndarray):
    diff = vmax - vmin
    num_vox = np.prod(vdims)
    print(num_vox)
    for i in numba.prange(num_vox):
        # Block coordinates of voxel i in the flattened volume.
        bI = numba.uint64((i % vdims[0]) / bdims[0])
        bJ = numba.uint64(((i / vdims[0]) % vdims[1]) / bdims[1])
        bK = numba.uint64(((i / vdims[0]) / vdims[1]) / bdims[2])
        if bI < bcount[0] and bJ < bcount[1] and bK < bcount[2]:
            x = numba.float64((fd[i] - vmin) / diff)
            #if x <= xp[0]:
            #    return yp[0]
            max_idx = len(xp) - 1
            #if x >= xp[-1]:
            #    return yp[-1]
            idx = int((x * max_idx) + 0.5)
            if idx > max_idx:
                k0 = int(max_idx - 1)
                k1 = int(max_idx)
            elif idx == 0:
                k0 = int(0)
                k1 = int(1)
            else:
                k0 = int(idx - 1)
                k1 = int(idx)
            # Linear interpolation between the bracketing knots.
            d = (x - xp[k0]) / (xp[k1] - xp[k0])
            rel = numba.float64(yp[k0] * (1.0 - d) + yp[k1] * d)
            bIdx = bI + bcount[0] * (bJ + bK * bcount[1])
            # NOTE: different prange iterations can hit the same bIdx, so
            # this += is a potential race under parallel execution.
            blocks[bIdx] += rel

def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    s0 = uint64(0)
    s1 = uint64(0)
    for i in range(2):
        for b in range(64):
            if XOROSHIRO128P_JUMP[i] & (uint64(1) << uint32(b)):
                s0 ^= states[index]['s0']
                s1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = s0
    states[index]['s1'] = s1

def init_xoroshiro128p_state(states, index, seed):
    '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.

    This ensures that manually set small seeds don't result in a predictable
    initial sequence from the random number generator.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: uint64
    :param index: offset in states to update
    :type seed: int64
    :param seed: seed value to use when initializing state
    '''
    index = int64(index)
    seed = uint64(seed)

    z = seed + uint64(0x9E3779B97F4A7C15)
    z = (z ^ (z >> uint32(30))) * uint64(0xBF58476D1CE4E5B9)
    z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
    z = z ^ (z >> uint32(31))

    states[index]['s0'] = z
    states[index]['s1'] = z

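# Pure-Python mirror of the SplitMix64 scramble above, masked to 64 bits so
# it runs without numba; handy for checking the state produced for a seed.
def _splitmix64_ref(seed):
    mask = (1 << 64) - 1
    z = (seed + 0x9E3779B97F4A7C15) & mask
    z = ((z ^ (z >> 30)) * 0xBF58476D1CE4E5B9) & mask
    z = ((z ^ (z >> 27)) * 0x94D049BB133111EB) & mask
    return z ^ (z >> 31)
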
def separate_n_nb(packed, n, chunk_bits):
    """
    A relatively inefficient generalization of the "separate bits" step of
    Morton encoding. Assuming that each of the `n` coordinates has
    `chunk_bits` bits, we can "space out" each bit of each coordinate `n`
    spaces at a time.

    >>> for i in range(8):
    ...     print(i,
    ...           format(separate_n_nb(i, 3, 3), '#012b'),
    ...           format(separate_n_nb(i, 3, 3) << 1, '#012b'),
    ...           format(separate_n_nb(i, 3, 3) << 2, '#012b'))
    0 0b0000000000 0b0000000000 0b0000000000
    1 0b0000000001 0b0000000010 0b0000000100
    2 0b0000001000 0b0000010000 0b0000100000
    3 0b0000001001 0b0000010010 0b0000100100
    4 0b0001000000 0b0010000000 0b0100000000
    5 0b0001000001 0b0010000010 0b0100000100
    6 0b0001001000 0b0010010000 0b0100100000
    7 0b0001001001 0b0010010010 0b0100100100

    :param packed: packed tensor
    :param n: number of components that we will eventually want to Morton code
    :param chunk_bits: the number of bits that represent each coordinate
    :return: spaced-out bit representation, ready to be interleaved
    """
    a = nb.uint64(packed)
    a = a & nb.uint64(0x00000000000000FF)
    x = 0
    for i in range(chunk_bits):
        bit_to_set = nb.uint64(1) << nb.uint64(i * n)
        x |= (a << nb.uint64((n - 1) * i)) & bit_to_set
    return x

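# Pure-Python round-trip sketch of separate_n_nb/pack_n_nb, re-implemented
# without the nb.uint64 casts so it runs uncompiled: spacing bits n apart
# and packing them back is lossless for any chunk_bits-wide value.
def _separate_ref(v, n, chunk_bits):
    x = 0
    for i in range(chunk_bits):
        x |= ((v >> i) & 1) << (i * n)
    return x


def _pack_ref(spaced, n, chunk_bits):
    a = 0
    for i in range(chunk_bits):
        a |= ((spaced >> (i * n)) & 1) << i
    return a


for v in range(1 << 4):
    assert _pack_ref(_separate_ref(v, 3, 4), 3, 4) == v
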
def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))

    s0 = uint64(0)
    s1 = uint64(0)
    for i in range(2):
        for b in range(64):
            if jump[i] & (uint64(1) << uint32(b)):
                s0 ^= states[index]['s0']
                s1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = s0
    states[index]['s1'] = s1

def H(t, psi_in, psi_out, H_ising, T, pg):
    # print np.linalg.norm(psi_in)
    J = (t / T)**2
    h = (1 - t / T)**2
    r = (1 - t / T)**2
    for s in prange(psi_in.size):
        b = uint64(1)  # flipping this bit gives the column index
        ME = (-1j * pg * r + J * H_ising[s]) * psi_in[s]
        for j in range(N):
            ME += -h * psi_in[s ^ b]  # x-field action
            b <<= 1  # move the flipping bit to the next spin
        psi_out[s] = -1j * ME
    return psi_out

def buffered_bounded_lemire_uint32(bitgen, rng):
    """
    Generates a random unsigned 32 bit integer bounded
    within a given interval using Lemire's rejection.
    """
    rng_excl = uint32(rng) + uint32(1)

    assert (rng != 0xFFFFFFFF)

    # Generate a scaled random number.
    m = uint64(next_uint32(bitgen)) * uint64(rng_excl)

    # Rejection sampling to remove any bias
    leftover = m & 0xFFFFFFFF

    if (leftover < rng_excl):
        # `rng_excl` is a simple upper bound for `threshold`.
        threshold = (UINT32_MAX - rng) % rng_excl

        while (leftover < threshold):
            m = uint64(next_uint32(bitgen)) * uint64(rng_excl)
            leftover = m & 0xFFFFFFFF

    return (m >> 32)

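# Pure-Python sketch of the same 32-bit Lemire loop; random.getrandbits
# stands in for the `bitgen` word source (an assumption for the demo).
import random

def _bounded_lemire_uint32_ref(rng):
    """Uniform draw from the closed interval [0, rng], with rng < 2**32 - 1."""
    rng_excl = rng + 1
    m = random.getrandbits(32) * rng_excl
    leftover = m & 0xFFFFFFFF
    if leftover < rng_excl:
        threshold = ((1 << 32) - rng_excl) % rng_excl  # == (UINT32_MAX - rng) % rng_excl
        while leftover < threshold:
            m = random.getrandbits(32) * rng_excl
            leftover = m & 0xFFFFFFFF
    return m >> 32
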
def xoroshiro128p_jump(states, index):
    """Advance the RNG in ``states[index]`` by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    """
    index = int64(index)

    jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922))

    s0 = uint64(0)
    s1 = uint64(0)
    for i in range(2):
        for b in range(64):
            if jump[i] & (uint64(1) << uint32(b)):
                s0 ^= states[index]["s0"]
                s1 ^= states[index]["s1"]
            xoroshiro128p_next(states, index)

    states[index]["s0"] = s0
    states[index]["s1"] = s1

def _2d_H_op(yin, yout, diag_signs, J, h):
    N = h.shape[0]
    Nd = J.shape[0]
    Ns = diag_signs.shape[0]
    for i in prange(Ns):
        diag = 0
        for j in range(Nd):
            diag += J[j] * diag_signs[i, j]
        b = uint64(1)  # flipping this bit gives the column index
        ME = 0
        for j in range(N):
            ME += h[j] * yin[i ^ b]  # x-field action
            b <<= 1  # move the flipping bit to the next spin
        yout[i] = diag * yin[i] + ME

def numba_decompress_blocks(input, block_size, last_block_size, block_ends, output):
    num_blocks = len(block_ends)
    for p in numba.prange(num_blocks):
        if p == 0:
            i = numba.uint64(0)
        else:
            i = numba.uint64(block_ends[p - numba.uint64(1)])
        block_end = numba.uint64(block_ends[p])
        j = numba.uint64(block_size * p)
        if (p == (num_blocks - numba.uint8(1))):
            end = j + numba.uint64(last_block_size)
        else:
            end = j + numba.uint64(block_size)
        while ((j < end) and (i < block_end)):
            # Token byte (LZ4 block format): high nibble is the literal
            # length, low nibble is the match length minus 4.
            t1 = numba.uint16((input[i] & 0xF0) >> 4)
            t2 = numba.uint16((input[i] & 0x0F) + 4)
            i += numba.uint8(1)
            if (t1 == 15):
                # Literal length of 15 means it continues in extra bytes.
                while input[i] == 255:
                    t1 += numba.uint8(input[i])
                    i += numba.uint8(1)
                t1 += numba.uint8(input[i])
                i += numba.uint8(1)
            # Copy the literal run.
            for n in range(t1):
                output[j] = input[i]
                i += numba.uint8(1)
                j += numba.uint8(1)
            if (j >= end):
                break
            # Two-byte little-endian match offset.
            off = numba.uint16(input[i]) + (numba.uint16(input[i + 1]) << 8)
            i += numba.uint8(2)
            if (t2 == 19):
                # Match-length nibble was 15 (+4): extended with extra bytes.
                while input[i] == 255:
                    t2 += numba.uint8(input[i])
                    i += numba.uint8(1)
                t2 += numba.uint8(input[i])
                i += numba.uint8(1)
            # Copy the match from already-decompressed output.
            for n in range(t2):
                output[j] = output[j - off]
                j += numba.uint8(1)

def atomic_cast_to_uint64(num):
    return uint64(num)

REVERSED_EP_LOOKUP_ARRAY = SQUARES_180.copy()
REVERSED_EP_LOOKUP_ARRAY[0] = NO_EP_SQUARE


def power_set(iterable):
    s = list(iterable)
    return itertools.chain.from_iterable(
        itertools.combinations(s, r) for r in range(len(s) + 1))


flip_vert_const_1 = np.uint64(0x00FF00FF00FF00FF)
flip_vert_const_2 = np.uint64(0x0000FFFF0000FFFF)


@nb.vectorize([nb.uint64(nb.uint64)])
def vectorized_flip_vertically(bb):
    # Swap adjacent ranks, then rank pairs, then the two board halves.
    bb = ((bb >> 8) & flip_vert_const_1) | ((bb & flip_vert_const_1) << 8)
    bb = ((bb >> 16) & flip_vert_const_2) | ((bb & flip_vert_const_2) << 16)
    bb = (bb >> 32) | (bb << 32)
    return bb


def get_castling_lookup_tables():
    possible_castling_rights = np.zeros(2**4, dtype=np.uint64)
    for j, subset in enumerate(power_set([BB_A1, BB_H1, BB_A8, BB_H8])):
        possible_castling_rights[j] = np.uint64(
            functools.reduce(lambda x, y: x | y, subset, np.uint64(0)))

    white_turn_castling_tables = create_index_table(possible_castling_rights)
    black_turn_castling_tables = create_index_table(

    # Let's find out the correct dtype depending on the max_value
    if max_value <= _UINT8_MAX:
        X = np.empty((n_samples, n_features), dtype=np.uint8, order="F")
    elif _UINT8_MAX < max_value <= _UINT16_MAX:
        X = np.empty((n_samples, n_features), dtype=np.uint16, order="F")
    elif _UINT16_MAX < max_value <= _UINT32_MAX:
        X = np.empty((n_samples, n_features), dtype=np.uint32, order="F")
    elif _UINT32_MAX < max_value <= _UINT64_MAX:
        X = np.empty((n_samples, n_features), dtype=np.uint64, order="F")
    else:
        raise ValueError("X cannot be created")
    return X


@jit(
    uint64(uint64, uint64[::1], uint64, uint64, uint64),
    nopython=NOPYTHON,
    nogil=NOGIL,
    boundscheck=BOUNDSCHECK,
    fastmath=FASTMATH,
    inline=INLINE,
)
def get_value_from_column(i, bitarray, bitmask, n_values_in_word, n_bits):
    """Get the bin value of a column based on the bitarray

    Parameters
    ----------
    i : uint64
        Sample index

    bitarray :

        self.board_w = np.random.randint(2**64, size=(n, n), dtype=np.uint64)
        self.D = {}

    def __getitem__(self, state):
        _hash = get_hash(state, self.board_b, self.board_w)
        if _hash in self.D:
            return self.D[_hash]
        else:
            return False

    def __setitem__(self, state, value):
        _hash = get_hash(state, self.board_b, self.board_w)
        self.D[_hash] = value


@nb.njit(nb.uint64(nb.int8[:, :], nb.uint64[:, :], nb.uint64[:, :]))
def get_hash(state, board_b, board_w):
    _hash = 0
    n = state.shape[0]
    for y in range(n):
        for x in range(n):
            if state[y, x] == 1:
                _hash ^= board_b[y, x]
            elif state[y, x] == -1:
                _hash ^= board_w[y, x]
    return _hash


if __name__ == '__main__':
    N, M = 3, 3
    Zob = Zobrist(N)

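# Usage sketch continuing the __main__ block above (a hypothetical 3x3
# position; per get_hash, 1 marks a black stone and -1 a white stone):
#
#     state = np.zeros((N, N), dtype=np.int8)
#     state[0, 0] = 1
#     Zob[state] = 0.5                             # memoized via Zobrist hash
#     print(Zob[state])                            # 0.5
#     print(Zob[np.zeros((N, N), dtype=np.int8)])  # False: position not seen
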
    elif move.to_square == C1 and not board_state.rooks & BB_C1:
        return create_move(E1, A1)
    elif move.from_square == E8 and board_state.kings & BB_E8:
        if move.to_square == G8 and not board_state.rooks & BB_G8:
            return create_move(E8, H8)
        elif move.to_square == C8 and not board_state.rooks & BB_C8:
            return create_move(E8, A8)
    return move


@njit(uint64(BoardState.class_type.instance_type, Move.class_type.instance_type))
def push_with_hash_update(board_state, move):
    move = _to_chess960(board_state, move)

    # Reset ep square.
    ep_square = board_state.ep_square
    board_state.ep_square = None

    # reset the ep square in the hash
    if ep_square is not None:
        # THIS IS ONLY A TEMPORARY WORKAROUND
        temp_ep_square = np.uint8(ep_square)
        if board_state.turn:
            ep_mask = shift_down(BB_SQUARES[temp_ep_square])

    :param index: offset in states to update
    :rtype: uint64
    '''
    index = int64(index)
    s0 = states[index]['s0']
    s1 = states[index]['s1']

    result = s0 + s1

    s1 ^= s0
    states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
    states[index]['s1'] = uint64(rotl(s1, uint32(36)))

    return result


XOROSHIRO128P_JUMP = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))


@jit
def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    s0 = uint64(0)
    s1 = uint64(0)

def uint64_to_unit_float64(x):
    '''Convert uint64 to float64 value in the range [0.0, 1.0)'''
    x = uint64(x)
    return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53)))

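# Why the result lands in [0.0, 1.0): x >> 11 keeps the top 53 bits (the
# width of a float64 mantissa), so every output is k * 2**-53 with
# 0 <= k < 2**53, each exactly representable. Plain-arithmetic check:
assert ((2**64 - 1) >> 11) / 2.0**53 < 1.0
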
def atomic_max_double_normalizedindex(res, ary):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    cuda.atomic.max(res, 0, ary[tx, uint64(bx)])

def atomic_add_double_global_3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    cuda.atomic.add(ary, (tx, uint64(ty)), 1)

def uint64_to_unit_float32(x):
    '''Convert uint64 to float32 value in the range [0.0, 1.0)'''
    x = uint64(x)
    return float32(uint64_to_unit_float64(x))

def rotl(x, k):
    '''Left rotate x by k bits.'''
    x = uint64(x)
    k = uint32(k)
    return (x << k) | (x >> uint32(64 - k))

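# Worked example for rotl: with x = 0x8000000000000001 and k = 1 the top bit
# wraps around to bit 0, giving 0x0000000000000003. Plain-Python check of
# the same expression, masked to 64 bits:
_x, _k = 0x8000000000000001, 1
assert (((_x << _k) & 0xFFFFFFFFFFFFFFFF) | (_x >> (64 - _k))) == 0x3
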