def _gen_func(self):
    """Build the CUDA kernels used to construct the cell list.

    The kernels are specialised on ``self.system.n_dim`` and on a numba float
    type chosen from ``self.system.dtype`` (``float32`` when the system dtype
    is ``np.float32``, ``float64`` otherwise).

    Returns:
        tuple: ``(cu_cell_map, cu_cell_list)``. The device function
        ``cu_cell_index`` is used internally and is not returned.
    """
    n_dim = self.system.n_dim
    nb_float = float64
    if self.system.dtype == np.dtype(np.float32):
        nb_float = float32

    @cuda.jit(int32(nb_float[:], nb_float[:], int32[:]), device=True)
    def cu_cell_index(x, box, ibox):
        # Flat cell index of position ``x``; returns -1 as soon as one scaled
        # coordinate x[i] / box[i] falls outside [-0.5, 0.5].
        ret = 0
        n_cell = 1
        for i in range(0, x.shape[0]):
            tmp = x[i] / box[i]
            if tmp < -0.5 or tmp > 0.5:
                return -1
            # NOTE(review): tmp == 0.5 passes the check above and yields
            # ibox[i] for this axis, i.e. one past the last cell -- confirm
            # whether the upper bound should be exclusive.
            ret = ret + floor((tmp + 0.5) * ibox[i]) * n_cell
            n_cell = n_cell * ibox[i]
        return ret

    @cuda.jit(void(int32[:], int32[:], int32[:, :]))
    def cu_cell_map(ibox, dim, ret):
        # One thread per row of ``ret``: fills ret[cell_i, j] for every j.
        cell_i = cuda.grid(1)
        if cell_i >= ret.shape[0]:
            return
        cell_vec_i = cuda.local.array(n_dim, int32)
        cell_vec_j = cuda.local.array(n_dim, int32)
        # cu_unravel_index_f / cu_ravel_index_f_pbc are defined elsewhere;
        # presumably flat <-> per-axis index conversion, the latter with
        # periodic wrapping -- TODO confirm.
        cu_unravel_index_f(cell_i, ibox, cell_vec_i)
        for j in range(ret.shape[1]):
            cu_unravel_index_f(j, dim, cell_vec_j)
            for k in range(n_dim):
                # Shift the unravelled offset by -1 on every axis and add it
                # to this cell's per-axis index.
                cell_vec_j[k] = cell_vec_i[k] + cell_vec_j[k] - 1
            cell_j = cu_ravel_index_f_pbc(cell_vec_j, ibox)
            ret[cell_i, j] = cell_j

    @cuda.jit(
        void(nb_float[:, :], nb_float[:], int32[:], int32[:, :], int32[:],
             int32[:], int32[:], nb_float[:]))
    def cu_cell_list(x, box, ibox, cell_list, cell_counts, cells, cell_max,
                     out_of_box):
        # One thread per particle (row of ``x``).
        pi = cuda.grid(1)
        if pi >= x.shape[0]:
            return
        # xi = cuda.local.array(ndim, dtype=float64)
        # for k in range(ndim):
        # xi[k] = x[pi, k]
        xi = x[pi]
        ic = cu_cell_index(xi, box, ibox)
        if ic == -1:
            # Report the offending particle: its index in slot 0, then its
            # coordinates in slots 1..n_dim.
            out_of_box[0] = nb_float(pi)
            for d in range(n_dim):
                out_of_box[d + 1] = x[pi, d]
        # NOTE(review): there is no return after the out-of-box report, so
        # ic == -1 is still stored and used as an index below -- confirm this
        # is intended.
        cells[pi] = ic
        # atomic.add returns the previous count, used as this particle's slot.
        index = cuda.atomic.add(cell_counts, ic, 1)
        # NOTE(review): the slot is compared against cell_list.shape[0] while
        # it is used as the second index of cell_list -- verify the bound.
        if index < cell_list.shape[0]:
            cell_list[ic, index] = pi
        else:
            # Slot overflow: record the required capacity in cell_max[0].
            cuda.atomic.max(cell_max, 0, index + 1)

    return cu_cell_map, cu_cell_list
def compile(cls):
    """Compile this class's numba entry points once and return them.

    If ``cls.is_compiled()`` is true the cached ``cls._compiled`` is returned
    unchanged. Otherwise ``cls.state_ntype`` is set from ``cls.state_dtype``
    (``numba.void`` when the dtype is ``None``), the allocator is generated
    and ``schedule_work``, ``shuffle``, ``init`` and ``final_result`` are
    compiled through ``Compiler.jit``.

    Returns:
        SimpleNamespace: attributes ``schedule_work``, ``shuffle``,
        ``allocator``, ``init`` and ``final_result``.
    """
    if cls.is_compiled():
        return cls._compiled

    cls.state_ntype = (numba.void if cls.state_dtype is None
                       else numba.typeof(cls.state_dtype).dtype)

    states_t = numba.types.Array(cls.Optimizer.Problem.state_ntype, 2, 'C')
    losses_t = numba.types.Array(Compiler.loss_ntype, 2, 'C')
    query_t = numba.types.Array(numba.int32, 1, 'C')
    state_t = cls.state_ntype
    i32 = numba.int32

    # Each signature should match the corresponding method of ``cls``.
    signatures = {
        'schedule_work': numba.types.Tuple((i32, i32))(
            query_t, state_t, states_t, losses_t, i32),
        'shuffle': numba.void(state_t, states_t, losses_t),
        'init': numba.void(state_t, query_t),
        'final_result': i32(state_t, states_t, losses_t, i32),
    }

    allocator = Compiler.generate_allocator(cls.__name__, cls.state_dtype)
    compiled = {
        name: Compiler.jit(cls.__name__, name, sig, getattr(cls, name))
        for name, sig in signatures.items()
    }

    cls._compiled = SimpleNamespace(
        schedule_work=compiled['schedule_work'],
        shuffle=compiled['shuffle'],
        allocator=allocator,
        init=compiled['init'],
        final_result=compiled['final_result'])
    return cls._compiled
def _gen_func(dtype, n_dim):
    """Build the CUDA kernels used to construct the cell list.

    Args:
        dtype: numpy dtype of the coordinates; ``np.float32`` selects the
            numba ``float32`` type, anything else ``float64``.
        n_dim: number of spatial dimensions; used as the size of the local
            index arrays and as the column that stores the particle index.

    Returns:
        tuple: ``(cu_cell_index, cu_cell_map, cu_cell_list)``.
    """
    # NOTE: ``float`` deliberately shadows the builtin inside this function;
    # below it is the numba scalar type and ``float(pi)`` is a numba cast.
    float = float64
    if dtype == np.dtype(np.float32):
        float = float32

    @cuda.jit(int32(float[:], float[:], int32[:]), device=True)
    def cu_cell_index(x, box, ibox):
        # Flat cell index of position ``x``. There is no range check here:
        # coordinates are scaled by the box, shifted by 0.5 and floored.
        ret = floor((x[0] / box[0] + 0.5) * ibox[0])
        n_cell = ibox[0]
        for i in range(1, x.shape[0]):
            ret = ret + floor((x[i] / box[i] + 0.5) * ibox[i]) * n_cell
            n_cell = n_cell * ibox[i]
        return ret

    @cuda.jit(void(int32[:], int32[:], int32[:, :]))
    def cu_cell_map(ibox, dim, ret):
        # One thread per row of ``ret``: fills ret[cell_i, j] for every j.
        cell_i = cuda.grid(1)
        if cell_i >= ret.shape[0]:
            return
        cell_vec_i = cuda.local.array(n_dim, int32)
        cell_vec_j = cuda.local.array(n_dim, int32)
        # cu_unravel_index_f / cu_ravel_index_f_pbc are defined elsewhere;
        # presumably flat <-> per-axis index conversion, the latter with
        # periodic wrapping -- TODO confirm.
        cu_unravel_index_f(cell_i, ibox, cell_vec_i)
        for j in range(ret.shape[1]):
            cu_unravel_index_f(j, dim, cell_vec_j)
            for k in range(n_dim):
                # Shift the unravelled offset by -1 on every axis and add it
                # to this cell's per-axis index.
                cell_vec_j[k] = cell_vec_i[k] + cell_vec_j[k] - 1
            cell_j = cu_ravel_index_f_pbc(cell_vec_j, ibox)
            ret[cell_i, j] = cell_j

    @cuda.jit(
        void(float[:, :], float[:], int32[:], float[:, :, :], int32[:],
             int32[:], int32[:]))
    def cu_cell_list(x, box, ibox, cell_list, cell_counts, cells, cell_max):
        # One thread per particle (row of ``x``).
        pi = cuda.grid(1)
        if pi >= x.shape[0]:
            return
        # xi = cuda.local.array(ndim, dtype=float64)
        # for k in range(ndim):
        # xi[k] = x[pi, k]
        xi = x[pi]
        ic = cu_cell_index(xi, box, ibox)
        cells[pi] = ic
        # atomic.add returns the previous count, used as this particle's slot.
        index = cuda.atomic.add(cell_counts, ic, 1)
        # NOTE(review): the slot is compared against cell_list.shape[0] while
        # it is used as the second index of cell_list -- verify the bound.
        if index < cell_list.shape[0]:
            # Store the coordinates followed by the particle index (as a
            # float) in the extra column n_dim.
            for k in range(n_dim):
                cell_list[ic, index, k] = xi[k]
            cell_list[ic, index, n_dim] = float(pi)
            #cell_list_index[ic, index] = pi
        else:
            # Slot overflow: record the required capacity in cell_max[0].
            cuda.atomic.max(cell_max, 0, index + 1)

    return cu_cell_index, cu_cell_map, cu_cell_list
def test_nopython_flag(self):
    """CUDA guvectorize accepts ``nopython=True`` and rejects ``False``."""
    def foo(A, B):
        pass

    def build(nopython):
        decorate = guvectorize([void(float32[:], float32[:])], '(x)->(x)',
                               target='cuda', nopython=nopython)
        return decorate(foo)

    # nopython = True is fine
    build(True)

    # nopython = False is bad
    with self.assertRaises(TypeError) as raises:
        build(False)
    self.assertEqual("nopython flag must be True", str(raises.exception))
def test_get_regs_per_thread_unspecialized(self):
    """Register usage can be queried per signature and for all signatures."""
    # A kernel whose register usage per thread is likely to differ between
    # specializations.
    @cuda.jit
    def pi_sin_array(x, n):
        idx = cuda.grid(1)
        if idx < n:
            x[idx] = 3.14 * math.sin(x[idx])

    # Launch once per dtype so the Dispatcher holds two definitions.
    N = 10
    for dtype in (np.float32, np.float64):
        pi_sin_array[1, N](np.zeros(N, dtype=dtype), N)

    sig_f32 = void(float32[::1], int64)
    sig_f64 = void(float64[::1], int64)
    regs_per_thread_f32 = pi_sin_array.get_regs_per_thread(sig_f32)
    regs_per_thread_f64 = pi_sin_array.get_regs_per_thread(sig_f64)

    # Both variants must report a positive integer.
    for regs in (regs_per_thread_f32, regs_per_thread_f64):
        self.assertIsInstance(regs, int)
    for regs in (regs_per_thread_f32, regs_per_thread_f64):
        self.assertGreater(regs, 0)

    # Asking for all signatures at once must agree with the individual
    # queries. The returned dict is keyed by (cc, argtypes) pairs, in keeping
    # with definitions, ptx, LLVM IR, etc.
    regs_per_thread_all = pi_sin_array.get_regs_per_thread()
    cc = cuda.current_context().device.compute_capability
    self.assertEqual(regs_per_thread_all[cc, sig_f32.args],
                     regs_per_thread_f32)
    self.assertEqual(regs_per_thread_all[cc, sig_f64.args],
                     regs_per_thread_f64)

    if regs_per_thread_f32 == regs_per_thread_f64:
        # Equal usage for both variants may indicate a bug, but can also be
        # an artifact of the compiler / driver / device combination, so only
        # an informational message is produced.
        print('f32 and f64 variant thread usages are equal.')
        print('This may warrant some investigation. Devices:')
        cuda.detect()
class MathsMethods:
    """Element-wise maths helpers that write their result into ``output``.

    Every method except ``urand`` is a numba-compiled static method; the
    in-place ``add``, ``multiply`` and ``subtract`` are compiled with
    ``parallel`` forced to ``False``.
    """

    @staticmethod
    @numba.njit([void(float64[:], float64[:]),
                 void(float64[:, :], float64[:, :]),
                 void(int64[:, :], int64[:, :]),
                 void(int64[:], int64[:]),
                 void(float64[:, :], int64[:, :]),
                 void(float64[:], float64)],
                **{**conf.JIT_FLAGS, 'parallel': False})
    def add(output, addend):
        """Add ``addend`` to ``output`` in place."""
        output += addend

    @staticmethod
    @numba.njit(void(int64[:, :], int64[:]), **conf.JIT_FLAGS)
    def row_modulo(output, divisor):
        """Reduce each row ``d`` of ``output`` modulo ``divisor[d]``."""
        for row in range(len(divisor)):
            for col in prange(output.shape[1]):
                output[row, col] %= divisor[row]

    @staticmethod
    @numba.njit(void(float64[:]), **conf.JIT_FLAGS)
    def floor(output):
        """Replace every element of ``output`` by its floor."""
        output[:] = np.floor(output)

    @staticmethod
    @numba.njit(void(int64[:, :], float64[:, :]), **conf.JIT_FLAGS)
    def floor_out_of_place(output, input_data):
        """Store the floor of ``input_data`` in ``output``."""
        output[:] = np.floor(input_data)

    @staticmethod
    @numba.njit(**{**conf.JIT_FLAGS, 'parallel': False})
    def multiply(output, multiplier):
        """Multiply ``output`` by ``multiplier`` in place."""
        output *= multiplier

    @staticmethod
    @numba.njit(**conf.JIT_FLAGS)
    def multiply_out_of_place(output, multiplicand, multiplier):
        """Store ``multiplicand * multiplier`` in ``output``."""
        output[:] = multiplicand * multiplier

    @staticmethod
    @numba.njit(**conf.JIT_FLAGS)
    def power(output, exponent):
        """Raise ``output`` to ``exponent`` in place."""
        output[:] = np.power(output, exponent)

    @staticmethod
    @numba.njit(**{**conf.JIT_FLAGS, 'parallel': False})
    def subtract(output, subtrahend):
        """Subtract ``subtrahend`` from ``output`` in place."""
        output[:] -= subtrahend[:]

    @staticmethod
    # @numba.njit(void(float64[:]), **conf.JIT_FLAGS)
    def urand(output, seed=None):
        """Fill ``output.data`` with uniform samples from [0, 1).

        Reseeds numpy's global random generator with ``seed`` first.
        """
        np.random.seed(seed)
        output.data[:] = np.random.uniform(0, 1, output.shape)
class Arm(ComponentBase):
    """Component whose ``propagate`` device function leaves neutrons untouched."""

    def __init__(self, name, **kwargs):
        """
        Initialize this Arm component.

        Parameters:
        name (str): the name of this component
        """
        super().__init__(__class__, **kwargs)
        self.name = name
        self.propagate_params = ()

        # Send one neutron through the component so that JIT compilation
        # happens during construction.
        import mcni
        warmup = mcni.neutron_buffer(1)
        warmup[0] = mcni.neutron(r=(0, 0, -1), v=(0, 0, 1), prob=1, time=0)
        self.process(warmup)

    @cuda.jit(void(NB_FLOAT[:]), device=True)
    def propagate(in_neutron):
        # Intentionally a no-op.
        pass
def default_signature(self, method, ext_type):
    """Return the default signature for ``method``.

    ``__init__`` gets ``void(object_, ...)`` with one ``object_`` per
    argument other than the first; every other method is delegated to the
    parent implementation.
    """
    if method.name != "__init__":
        return super(JitMethodMaker, self).default_signature(method, ext_type)

    n_explicit_args = method.py_func.__code__.co_argcount - 1
    return numba.void(*([numba.object_] * n_explicit_args))
def guvectorize_compute(target: str, *, cache: bool = True):
    """Return the ``nb.guvectorize`` decorator for the compute kernel.

    Args:
        target: numba target name passed through to ``nb.guvectorize``.
        cache: forwarded as the ``cache`` option.
    """
    signatures = [
        nb.void(_nb_float[:, :], _nb_float[:], _nb_float, _nb_float[:]),
    ]
    layout = '(m, p),(p),()->(m)'
    return nb.guvectorize(signatures, layout,
                          nopython=True, target=target, cache=cache)
def makeWorker():
    """Build a nopython-compiled worker that runs without holding the GIL.

    The worker fills ``R[i, j] = 1 / (1 + |p[i] - q[:, j]|)`` for the rows
    ``job[0] <= i < job[1]``. It saves the Python thread state through
    ``PyEval_SaveThread`` on entry and restores it before returning.

    Returns:
        The compiled function with signature
        ``void(double[:, :], double[:, :], double[:, :], int64[:])``.
    """
    savethread = pythonapi.PyEval_SaveThread
    savethread.argtypes = []
    savethread.restype = c_void_p

    restorethread = pythonapi.PyEval_RestoreThread
    restorethread.argtypes = [c_void_p]
    restorethread.restype = None

    def worker(p, q, R, job):
        threadstate = savethread()
        nQ = q.shape[1]
        # ``range`` instead of ``xrange``: ``xrange`` does not exist on
        # Python 3, and numba compiles ``range`` to a native loop.
        for i in range(job[0], job[1]):
            for j in range(nQ):
                rx = p[i, 0] - q[0, j]
                ry = p[i, 1] - q[1, j]
                rz = p[i, 2] - q[2, j]
                R[i, j] = 1 / (1 + sqrt(rx * rx + ry * ry + rz * rz))
        restorethread(threadstate)

    signature = void(double[:, :], double[:, :], double[:, :], int64[:])
    worker_ext = jit(signature, nopython=True)(worker)
    return worker_ext
def test_vector():
    """Smoke-test ``standard_function`` on a two-component gufunc."""
    from numba import guvectorize, float64, void

    def fun(s, x, p, out):
        out[0] = s[0] + x[0]
        out[1] = s[1] + x[1]

    shape = (2, )
    s = numpy.random.random(shape)
    x = numpy.random.random(shape)
    p = numpy.random.random(shape)
    out = numpy.zeros(shape)
    out1 = numpy.zeros(shape)

    vec = float64[:]
    gfun = guvectorize(
        ftylist=[void(vec, vec, vec, vec)],
        signature='(n),(n),(n)->(n)')(fun)
    sfun = standard_function(gfun, 2)

    # Exercise the plain function and every calling convention of the
    # wrapper: explicit output, allocated output and diff=True.
    fun(s, x, p, out)
    sfun(s, x, p, out=out1)
    out2 = sfun(s, x, p)
    out = sfun(s, x, p, diff=True)

    print("OUT")
    print(out)
def force_functions(self, funcs):
    """Wrap each pair-force device function in a CUDA kernel.

    Args:
        funcs: iterable of device functions called as
            ``f(dij2, box, param, forces)``.

    Returns:
        list: one compiled kernel per entry of ``funcs``, in the same order.
        Each kernel takes ``(x, box, nl, nc, params, typeid, n_types,
        forces)``.
    """
    # general pair cases
    # @cuda.jit("void(float64[:], float64[:], float64[:], float64[:], float64[:,:])", device=True)
    # def func(a, b, param, forces):
    #     pass
    nb_float = float64
    if self.system.dtype == np.dtype(np.float32):
        nb_float = float32
    kernels = []
    cu_pbc_dist2 = self.nlist.dist_funcs['cu_pbc_dist2']
    for f in funcs:
        # ``nl`` is indexed as nl[i, k] below, so it must be declared as a
        # 2-D array; a 1-D declaration cannot be compiled.
        # The explicit signature compiles the kernel right here, so ``f`` is
        # the function of the current iteration.
        @cuda.jit(
            void(nb_float[:, :], nb_float[:], int32[:, :], int32[:],
                 nb_float[:], int32[:], int32, nb_float[:]))
        def _f(x, box, nl, nc, params, typeid, n_types, forces):
            # One thread per particle i; loops over its nc[i] neighbours.
            i = cuda.grid(1)
            if i >= x.shape[0]:
                return
            xi = x[i]
            ti = typeid[i]
            for k in range(nc[i]):
                j = nl[i, k]
                tj = typeid[j]
                dij2 = cu_pbc_dist2(xi, x[j], box)
                # The parameter for the (ti, tj) pair sits at the flat index
                # ti * n_types + tj.
                f(dij2, box, params[ti * n_types + tj], forces)

        kernels.append(_f)
    return kernels
def test_gufunc(self):
    """Batched matrix multiply through a CUDA gufunc matches the CPU result."""
    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    # An odd batch size exercises the thread/block division in CUDA.
    matrix_ct = 1001
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    C = gufunc(A, B)
    Gold = ut.matrix_multiply(A, B)
    self.assertTrue(np.allclose(C, Gold))
def test_invalid_array_type(self):
    """Unsupported shared-array dtypes raise ``TypingError`` on compilation."""
    def unsupported_type():
        arr = cuda.shared.array(10, dtype=np.dtype('O'))  # noqa: F841

    def invalid_string_type():
        arr = cuda.shared.array(10, dtype='int33')  # noqa: F841

    cases = (
        (unsupported_type, ".*Cannot infer the type of variable 'arr'.*"),
        (invalid_string_type, ".*Invalid NumPy dtype specified: 'int33'.*"),
    )
    for kernel, rgx in cases:
        with self.assertRaisesRegex(TypingError, rgx):
            cuda.jit(void())(kernel)
def test_for_pre(self):
    """Test issue with loop not running due to bad sign-extension at the
    for loop precondition.
    """
    @cuda.jit(void(float32[:, :], float32[:, :], float32[:]))
    def diagproduct(c, a, b):
        startX, startY = cuda.grid(2)
        strideX = cuda.gridDim.x * cuda.blockDim.x
        strideY = cuda.gridDim.y * cuda.blockDim.y
        height = c.shape[0]
        width = c.shape[1]

        for col in range(startX, width, (strideX)):
            for row in range(startY, height, (strideY)):
                c[row, col] = a[row, col] * b[col]

    N = 8
    A, B = generate_input(N)
    F = np.empty(A.shape, dtype=A.dtype)

    dA = cuda.to_device(A)
    dB = cuda.to_device(B)
    dF = cuda.to_device(F, copy=False)

    griddim = (1, 1)
    blockdim = (32, 8)
    diagproduct[griddim, blockdim](dF, dA, dB)

    expected = np.dot(A, np.diag(B))
    np.testing.assert_array_almost_equal(dF.copy_to_host(), expected)
class Wavelength_monitor(base):
    """Monitor that histograms neutrons by wavelength into ``nchan`` bins.

    ``self.out`` has three rows of ``nchan`` bins which ``propagate`` fills
    with the neutron count, the summed ``p`` and the summed ``p*p``.
    """

    def __init__(
            self, name,
            xmin=0., xmax=0., ymin=0., ymax=0.,
            xwidth=0., yheight=0.,
            Lmin=0., Lmax=10., nchan=200,
            filename = "IL.h5",
            **kwargs
    ):
        """Set up the monitor window, the wavelength axis and the bins.

        Parameters:
        name: name of this component
        xmin, xmax, ymin, ymax: window limits; must satisfy min < max
        xwidth, yheight: when positive, override the limits with a window
            of that size centred on zero
        Lmin, Lmax: wavelength range covered by the bins
        nchan: number of wavelength bins
        filename: stored on the instance; not used elsewhere in this class
        """
        # NOTE(review): **kwargs is accepted but unused, and the base class
        # __init__ is not called here -- confirm this is intended.
        self.name = name
        self.filename = filename
        if xwidth > 0:
            xmax = xwidth/2; xmin = -xmax
        if yheight > 0:
            ymax = yheight/2; ymin = -ymax
        assert xmin < xmax
        assert ymin < ymax
        dL = (Lmax-Lmin)/nchan
        # Bin centres, starting half a bin above Lmin.
        self.L_centers = np.arange(Lmin+dL/2, Lmax, dL)
        self.out = np.zeros((3,nchan))
        # Row views into self.out.
        self.out_N = self.out[0]
        self.out_p = self.out[1]
        self.out_p2 = self.out[2]
        # Extra arguments handed to propagate() after the neutron.
        self.propagate_params = (
            np.array([xmin, xmax, ymin, ymax, Lmin, Lmax]),
            nchan, self.out
        )

    def getHistogram(self, scale_factor=1.):
        """Return the 'I(wavelength)' histogram built from the bins.

        Data is ``out_p * scale_factor`` and errors are
        ``out_p2 * scale_factor**2``.
        """
        import histogram as H
        axes = [('wavelength', self.L_centers, 'angstrom')]
        return H.histogram(
            'I(wavelength)', axes,
            data=self.out_p*scale_factor,
            errors=self.out_p2*scale_factor*scale_factor)

    @cuda.jit(
        void(NB_FLOAT[:], NB_FLOAT[:], int64, NB_FLOAT[:, :]), device=True)
    def propagate(neutron, limits, nchan, out):
        # Device function: bins one neutron by wavelength into ``out``.
        xmin, xmax, ymin, ymax, Lmin, Lmax = limits
        t0 = neutron[-2]
        # prop_z0 is defined elsewhere; presumably it moves the neutron to
        # z = 0 and returns its position and time there -- TODO confirm.
        x,y,z, t = prop_z0(neutron)
        # Ignore the neutron when the returned time is earlier than t0.
        if t0>t:
            return
        p = neutron[-1]
        vx,vy,vz = neutron[3:6]
        #
        if x<=xmin or x>=xmax or y<=ymin or y>=ymax: return
        v = math.sqrt(vx*vx+vy*vy+vz*vz)
        # V2K is defined elsewhere; presumably converts speed to wavevector.
        L = 2*math.pi/(v*V2K)
        if L<=Lmin or L>=Lmax: return
        iL = int(math.floor( (L-Lmin)/(Lmax-Lmin)*nchan ))
        # Atomic adds, so concurrent threads can update the same bin.
        cuda.atomic.add(out, ( 0, iL ), 1)
        cuda.atomic.add(out, ( 1, iL ), p)
        cuda.atomic.add(out, ( 2, iL ), p*p)
        return
def test_guvectorize_decor(self):
    """``guvectorize`` used as a decorator factory builds a working gufunc."""
    matrix = float32[:, :]
    decorate = guvectorize([void(matrix, matrix, matrix)],
                           '(m,n),(n,p)->(m,p)',
                           target=self.target)
    self.check_matmul_gufunc(decorate(matmulcore))
def apply_factory(fit_intercept, val):
    """Build a compiled function that overwrites ``state.weights`` with ``val``.

    With ``fit_intercept`` true every weight is set; otherwise the weight at
    index 0 is left untouched and only indices 1 and above are set.
    """
    # First index to overwrite: 0 when fitting the intercept, else 1.
    first = 0 if fit_intercept else 1

    @jit(
        void(get_type(State)),
        nopython=NOPYTHON,
        nogil=NOGIL,
        boundscheck=BOUNDSCHECK,
        fastmath=FASTMATH,
    )
    def apply(state):
        weights = state.weights
        for idx in range(first, weights.shape[0]):
            weights[idx] = val

    return apply
def default_signature(self, method, ext_type):
    """Return the default signature for ``method``.

    ``__init__`` gets ``void(*self.argtypes)``; every other method is
    delegated to the parent implementation.
    """
    if method.name != '__init__':
        return super(AutojitMethodMaker, self).default_signature(
            method, ext_type)
    return numba.void(*self.argtypes)
def find_active_loop(self):
    """
    Return the numba compiled loop that finds active nodes, building it on
    first use and caching it on the instance afterwards
    """
    if self.__p_find_active_loop is not None:
        return self.__p_find_active_loop

    def __find_active_loop(active, active_nod, nod):
        """
        Compiled loop to find active nodes
        """
        for item in range(active.shape[0]):
            item_nod = nod[item]
            item_active = active[item]
            item_active_nod = active_nod[item]
            previous_count = 0

            # Repeat until the number of selected nodes stops changing
            while True:
                selected = item_nod[item_active_nod].reshape(-1)
                count = selected.shape[0]
                if count == previous_count:
                    break
                item_active[selected] = True
                previous_count = count

    # Compile
    signatures = [nb.void(nb.b1[:, :], nb.b1[:, :], nb.i8[:, :, :])]
    self.__p_find_active_loop = nb.njit(
        signatures, nogil=True)(__find_active_loop)
    return self.__p_find_active_loop
def test_gufunc_small(self):
    """A two-matrix batch multiplies correctly through the CUDA gufunc."""
    @guvectorize(
        [void(float32[:, :], float32[:, :], float32[:, :])],
        "(m,n),(n,p)->(m,p)",
        target="cuda",
    )
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    matrix_ct = 2
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    C = gufunc(A, B)
    Gold = ut.matrix_multiply(A, B)
    self.assertTrue(np.allclose(C, Gold))
def test_vector():
    """Smoke-test ``standard_function`` on a two-component gufunc."""
    from numba import guvectorize, float64, void

    def fun(s, x, p, out):
        out[0] = s[0] + x[0]
        out[1] = s[1] + x[1]

    shape = (2,)
    s = numpy.random.random(shape)
    x = numpy.random.random(shape)
    p = numpy.random.random(shape)
    out = numpy.zeros(shape)
    out1 = numpy.zeros(shape)

    vec = float64[:]
    gfun = guvectorize(ftylist=[void(vec, vec, vec, vec)],
                       signature='(n),(n),(n)->(n)')(fun)
    sfun = standard_function(gfun, 2)

    # Exercise the plain function and every calling convention of the
    # wrapper: explicit output, allocated output and diff=True.
    fun(s, x, p, out)
    sfun(s, x, p, out=out1)
    out2 = sfun(s, x, p)
    out = sfun(s, x, p, diff=True)

    print("OUT")
    print(out)
def make_traverse_graph_via_bfs(callback, result_nb_type):
    """
    Build a compiled breadth-first traversal specialised on ``callback``.

    callback(vertex, graph.edges, incident_edge_index, result) is invoked
    once for every vertex at the moment it is first discovered, with the
    vertex it was reached from and the index of the connecting edge.
    """
    @jit(void(int32, planar_graph_nb_type, boolean[:], result_nb_type),
         nopython=True)
    def traverse_graph_via_bfs(start_vertex, graph, used_vertex_flags, result):
        pending = Queue()
        pending.append(start_vertex)
        used_vertex_flags[start_vertex] = True

        while not pending.is_empty():
            current = pending.popleft()

            for edge_index in graph.get_incident_edge_indices(current):
                neighbour = graph.edges.get_opposite_vertex(edge_index,
                                                            current)
                if used_vertex_flags[neighbour]:
                    continue

                callback(current, graph.edges, edge_index, result)
                used_vertex_flags[neighbour] = True
                pending.append(neighbour)

    return traverse_graph_via_bfs
def _process_method_signatures(class_dict, ext_type):
    """
    Process all method signatures:

        * Verify signatures
        * Populate ext_type with method signatures (ExtMethodType)

    Every entry of ``class_dict`` is passed through ``_process_signature``;
    entries for which it returns a method are registered on ``ext_type`` and
    written back into ``class_dict``.
    """
    # NOTE(review): ``iteritems`` and ``inspect.getargspec`` are Python 2 era
    # APIs -- confirm the interpreter this module targets.
    for method_name, method in class_dict.iteritems():
        default_signature = None
        # A plain-function __init__ gets a default signature of
        # void(object_, ...), one object_ per argument other than the first.
        if (method_name == '__init__' and
                isinstance(method, types.FunctionType)):
            if inspect.getargspec(method).args:
                warnings.warn(
                    "Constructor for class '%s' has no signature, "
                    "assuming arguments have type 'object'" %
                    ext_type.py_class.__name__)
            argtypes = [numba.object_] * (method.__code__.co_argcount - 1)
            default_signature = numba.void(*argtypes)

        method, restype, argtypes = _process_signature(ext_type, method,
                                                       default_signature)
        # _process_signature returns None as the method for entries that are
        # skipped here.
        if method is None:
            continue

        signature = typesystem.ExtMethodType(
            return_type=restype, args=argtypes, name=method.name,
            is_class=method.is_class, is_static=method.is_static)
        ext_type.add_method(method_name, signature)
        # Replaces the value of an existing key only, so the dict does not
        # change size while being iterated.
        class_dict[method_name] = method
def test_gufunc_auto_transfer(self):
    """Mixing a host array and a device array as gufunc inputs works."""
    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    matrix_ct = 2
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    # Only B is transferred explicitly; A is passed as a host array.
    dB = cuda.to_device(B)
    device_result = gufunc(A, dB)
    C = device_result.copy_to_host()

    Gold = ut.matrix_multiply(A, B)
    self.assertTrue(np.allclose(C, Gold))
def test_struct_model_type_static(self):
    """Struct-typed static shared arrays support creation, set and get."""
    nthreads = 64

    @cuda.jit(void(int32[::1], int32[::1]))
    def write_then_reverse_read_static(outx, outy):
        # Test creation
        arr = cuda.shared.array(nthreads, dtype=test_struct_model_type)

        tid = cuda.grid(1)
        mirrored = nthreads - tid - 1
        if tid < len(outx) and tid < len(outy):
            # Test set to arr
            arr[tid] = TestStruct(int32(tid), int32(tid * 2))

            cuda.syncthreads()
            # Test get from arr
            outx[tid] = arr[mirrored].x
            outy[tid] = arr[mirrored].y

    arrx = np.zeros((nthreads, ), dtype="int32")
    arry = np.zeros((nthreads, ), dtype="int32")

    write_then_reverse_read_static[1, nthreads](arrx, arry)

    # Every thread reads the entry written by its mirror thread.
    for pos, value in enumerate(arrx):
        self.assertEqual(value, nthreads - pos - 1)
    for pos, value in enumerate(arry):
        self.assertEqual(value, (nthreads - pos - 1) * 2)
def inner(func):
    """Wrap ``func`` as a ``BinaryOp`` backed by a compiled C callback.

    The callback receives pointers ``(z, x, y)`` and stores
    ``func(x[0], y[0])`` in ``z[0]``.
    """
    func_name = func.__name__

    arg_ptr = numba.types.CPointer(arg_type.numba_t)
    if result_type is types.BOOL:
        result_ptr = numba.types.CPointer(numba.boolean)
    else:
        result_ptr = arg_ptr
    sig = numba.void(result_ptr, arg_ptr, arg_ptr)

    jitfunc = numba.jit(func, nopython=True)

    @numba.cfunc(sig, nopython=True)
    def wrapper(z, x, y):  # pragma: no cover
        z[0] = jitfunc(x[0], y[0])

    out = ffi.new("GrB_BinaryOp*")
    callback = ffi.cast("GxB_binary_function", wrapper.address)
    lib.GrB_BinaryOp_new(
        out,
        callback,
        result_type.gb_type,
        arg_type.gb_type,
        arg_type.gb_type,
    )

    return BinaryOp(func_name, arg_type.__name__, out[0])
def test_gufunc_new_axis(self):
    """Broadcasting a single matrix equals passing it tiled explicitly."""
    @guvectorize([void(float64[:, :], float64[:, :], float64[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore

    X = np.random.randn(10, 3, 3)
    Y = np.random.randn(3, 3)
    gold = ut.matrix_multiply(X, Y)

    # Y broadcast over the leading axis of X.
    res1 = gufunc(X, Y)
    np.testing.assert_allclose(gold, res1)

    # Y repeated explicitly along a new leading axis.
    tiled_Y = np.tile(Y, (10, 1, 1))
    res2 = gufunc(X, tiled_Y)
    np.testing.assert_allclose(gold, res2)
def test_gufunc_small(self):
    """Two-matrix batch through the CUDA gufunc; also records the speedup."""
    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    matrix_ct = 2
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    # Time the CUDA gufunc.
    started = time()
    C = gufunc(A, B)
    tcuda = time() - started

    # Time the CPU reference.
    started = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - started

    non_stream_speedups.append(tcpu / tcuda)

    print(C, Gold)

    self.assertTrue(np.allclose(C, Gold))
def _numba_lombscargle_signature(ty):
    """Return ``void`` over five contiguous 1-D arrays of element type ``ty``."""
    contiguous = ty[::1]
    return void(
        contiguous,  # x
        contiguous,  # y
        contiguous,  # freqs
        contiguous,  # pgram
        contiguous,  # y_dot
    )
def test_type_with_struct_data_model(self):
    """Local arrays accept an element type backed by a struct data model."""
    @cuda.jit(void(test_struct_model_type[::1]))
    def f(x):
        scratch = cuda.local.array(10, dtype=test_struct_model_type)
        # Round-trip one element through the local array.
        scratch[0] = x[0]
        x[0] = scratch[0]

    self.check_dtype(f, test_struct_model_type)
def default_signature(self, method, ext_type):
    """Return the default signature for ``method``.

    ``__init__`` gets ``void(object_, ...)`` with one ``object_`` per
    argument other than the first; every other method is delegated to the
    parent implementation.
    """
    if method.name != '__init__':
        return super(JitMethodMaker, self).default_signature(
            method, ext_type)

    n_explicit_args = method.py_func.__code__.co_argcount - 1
    return numba.void(*([numba.object_] * n_explicit_args))
def test_cuda_kernel(self):
    """Two dependent kernel nodes run in order: (1 + 2) * 3 == 9."""
    arr = cuda.to_device(np.array([1.]))

    @cuda.jit(void(f8[:]))
    def k1(a):
        a[0] += 2

    @cuda.jit(void(f8[:]))
    def k2(a):
        a[0] *= 3

    add_node = KernelNode(k1, [arr], [])
    # The multiply node lists the add node as its dependency.
    mul_node = KernelNode(k2, [arr], [add_node])

    mul_node.build().launch()
    cuda.synchronize()

    result = arr.copy_to_host()
    self.assertTrue(np.all(result == [9]))
def make_inner_func( fn, *args ):
    """Compile a loop applying ``fn`` element-wise: ``result[i] = fn(x[i])``.

    ``args`` are the argument types of the compiled ``void`` function. The
    loop runs between ``savethread()`` and ``restorethread()``.
    """
    def inner_func( result, x ):
        threadstate = savethread()
        for idx in range(len(result)):
            result[idx] = fn( x[idx] )
        restorethread(threadstate)

    return jit(void( *args ))(inner_func)
def test_duplicated_output(self):
    """Passing the output both positionally and as ``out=`` is rejected."""
    @guvectorize([void(float32[:], float32[:])], '(x)->(x)',
                 target='cuda')
    def foo(inp, out):
        pass  # intentionally empty; never executed

    out = np.zeros(10, dtype=np.float32)
    inp = out

    with self.assertRaises(ValueError) as raises:
        foo(inp, out, out=out)

    expected = "cannot specify 'out' as both a positional and keyword argument"
    self.assertEqual(str(raises.exception), expected)
def cf_from_source(source, func_globals):
    """Render the SSA graph given python source code.

    Runs the "cf" pipeline on the first statement of ``source`` and returns
    the resulting ``(symtab, flow)`` pair.
    """
    from numba import pipeline
    from numba import environment

    func_ast = ast.parse(source).body[0]
    env = environment.NumbaEnvironment.get_environment()

    func_env, _ = pipeline.run_pipeline2(
        env, None, func_ast, void(),
        pipeline_name="cf",
        function_globals=dict(func_globals),
    )
    return func_env.symtab, func_env.flow  # func_env.cfg
def test_copy_odd(self):
    """An 11-element copy through the CUDA gufunc reproduces its input."""
    @guvectorize([void(float32[:], float32[:])],
                 '(x)->(x)',
                 target='cuda')
    def copy(A, B):
        for idx in range(B.size):
            B[idx] = A[idx]

    source = np.arange(11, dtype=np.float32) + 1
    target = np.zeros_like(source)
    copy(source, out=target)
    self.assertTrue(np.allclose(source, target))
def test_copy2d(self):
    """A 5x6 copy through the CUDA gufunc reproduces its input."""
    @guvectorize([void(float32[:, :], float32[:, :])],
                 '(x, y)->(x, y)',
                 target='cuda')
    def copy2d(A, B):
        n_rows = B.shape[0]
        n_cols = B.shape[1]
        for row in range(n_rows):
            for col in range(n_cols):
                B[row, col] = A[row, col]

    source = np.arange(30, dtype=np.float32).reshape(5, 6) + 1
    target = np.zeros_like(source)
    copy2d(source, out=target)
    self.assertTrue(np.allclose(source, target))
def test_invalid_flags(self):
    """Unknown target options raise ``TypeError`` and are all named in it."""
    def foo(A, B):
        pass

    with self.assertRaises(TypeError) as raises:
        guvectorize([void(float32[:], float32[:])], '(x)->(x)',
                    target='cuda', what1=True, ever2=False)(foo)

    head = "The following target options are not supported:"
    msg = str(raises.exception)
    self.assertEqual(msg[:len(head)], head)

    # The rest of the message is a comma-separated list of quoted names.
    tail = msg[len(head):].strip()
    items = [part.strip("'\" ") for part in tail.split(',')]
    self.assertEqual({'what1', 'ever2'}, set(items))
def test_uint_int_div_ary():
    """The compiled ``uint_int_div_ary`` matches the interpreted one."""
    NPATHS = 10
    normdist = np.empty(1000)  #np.random.normal(0., 1., 1000)
    seed = np.arange(0x80000000, 0x80000000 + NPATHS, dtype=np.uint32)

    # Reference result from the pure Python function.
    gold = np.empty(NPATHS, dtype=np.int32)
    got = gold.copy()
    uint_int_div_ary(gold, normdist, seed)
    print('expect %s' % gold)

    # Result from the compiled function.
    compiled = jit(void(uint32[:], double[:], uint32[:]))(uint_int_div_ary)
    compiled(got, normdist, seed)
    print('got %s' % got)

    assert (gold == got).all()
def test_cpu_guvectorize(self):
    """Batched matrix multiply through a CPU gufunc matches the reference."""
    target = 'cpu'
    matrix = float32[:, :]
    gufunc = guvectorize([void(matrix, matrix, matrix)],
                         '(m,n),(n,p)->(m,p)',
                         target=target)(matmulcore)

    # An odd batch size, as in the CUDA variants of this test.
    matrix_ct = 1001
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    C = gufunc(A, B)
    Gold = ut.matrix_multiply(A, B)
    self.assertTrue(np.allclose(C, Gold))
def test_issue_734(self):
    """A for loop in each branch of an ``if`` matches the interpreter."""
    from numba import jit, void, int32, double

    @jit(void(int32, double[:]), forceobj=True)
    def forloop_with_if(u, a):
        if u == 0:
            for idx in range(a.shape[0]):
                a[idx] = a[idx] * 2.0
        else:
            for idx in range(a.shape[0]):
                a[idx] = a[idx] + 1.0

    # Run both branches through the compiled and the pure Python function.
    for u in (0, 1):
        nb_a = np.arange(10, dtype='int32')
        np_a = np.arange(10, dtype='int32')
        forloop_with_if(u, nb_a)
        forloop_with_if.py_func(u, np_a)
        self.assertTrue(np.all(nb_a == np_a))
def cls_wrapper(cls):
    """Compile an aggregate class and attach the callback addresses to it.

    Compiles the ``step`` and ``finalize`` jit methods of ``cls`` against
    the enclosing ``signature``, generates the two wrapper functions with
    ``gen_step`` / ``gen_finalize``, executes them and stores their
    ``address`` on ``cls.step`` and ``cls.finalize``.

    Returns:
        ``cls`` itself.
    """
    class_type = cls.class_type
    instance_type = class_type.instance_type
    jitmethods = class_type.jitmethods

    # don't make decisions about what to do with NULL values for users
    step_signature = void(instance_type, *signature.args)
    jitmethods['step'].compile(step_signature)

    # aggregates can always return a NULL value
    finalize_signature = signature.return_type(instance_type)
    jitmethods['finalize'].compile(finalize_signature)

    func_name = camel_to_snake(cls.__name__)
    step_name = '{}_step'.format(func_name)
    finalize_name = '{}_finalize'.format(func_name)

    step_mod = gen_step(cls, step_name)
    finalize_mod = gen_finalize(cls, finalize_name)

    # ``type_ignores`` is a required field of ast.Module since Python 3.8;
    # compile() rejects a Module built without it. Older interpreters accept
    # the extra keyword.
    genmod = ast.Module(body=step_mod.body + finalize_mod.body,
                        type_ignores=[])

    mod = ast.fix_missing_locations(genmod)

    code = compile(mod, __file__, 'exec')
    # Names the generated wrappers may reference when executed.
    scope = {
        cls.__name__: cls,
        'sqlite3_aggregate_context': sqlite3_aggregate_context,
        'sqlite3_result_null': sqlite3_result_null,
        'unsafe_cast': unsafe_cast,
        'sizeof': sizeof,
        'not_null': not_null,
        'SQLITE_NULL': SQLITE_NULL,
    }
    scope.update(CONVERTERS)
    scope.update((func.__name__, func) for func in RESULT_SETTERS.values())
    # The executed source is generated by this module, not taken from
    # external input.
    exec(code, scope)

    step = scope[step_name]
    finalize = scope[finalize_name]

    cls.step.address = step.address
    cls.finalize.address = finalize.address
    return cls
def test_gufunc_stream(self):
    """The CUDA gufunc works on a stream with a preallocated output."""
    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for row in range(m):
            for col in range(p):
                C[row, col] = 0
                for inner in range(n):
                    C[row, col] += A[row, inner] * B[inner, col]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    #cuda.driver.flush_pending_free()
    # An odd batch size exercises the thread/block division in CUDA.
    matrix_ct = 1001
    a_shape = (matrix_ct, 2, 4)
    b_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(*a_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(*b_shape)

    # Time transfers, kernel and copy-back, all queued on one stream.
    started = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - started

    # Time the CPU reference.
    started = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - started

    stream_speedups.append(tcpu / tcuda)

    self.assertTrue(np.allclose(C, Gold))
def test_columns():
    """Smoke-test ``standard_function`` with a batched first argument."""
    from numba import guvectorize, float64, void

    def fun(s, x, out):
        out[0] = s[0] + x[0]
        out[1] = s[1] + x[1]

    vec = float64[:]
    gfun = guvectorize(ftylist=[void(vec, vec, vec)],
                       signature='(n),(n)->(n)')(fun)

    N = 5
    s = numpy.random.random((N, 2,))
    x = numpy.random.random((2,))
    out = numpy.zeros((N, 2,))
    out1 = numpy.zeros((N, 2,))

    sfun = standard_function(gfun, 2)

    # Row-by-row reference result.
    for row in range(N):
        fun(s[row, :], x, out[row, :])

    sfun(s, x, out=out1)
    out2 = sfun(s, x)

    # print(out2)
    # print(s+x)
    print(s + x)
    # assert( (abs(out2-s-x).max())<1e-8 )
    for result in (out, out1, out2):
        print(result)

    out, out_s, = sfun(s, x, diff=True)
'''
Only added decorators to the linregr_python.py implementation.
'''

import numbapro
from numba import autojit, jit, f8, int32, void


@jit(void(f8[:], f8[:], f8[:], f8, int32))
def gradient_descent(X, Y, theta, alpha, num_iters):
    # Fit Y ~ intercept + slope * X by gradient descent with step ``alpha``
    # for ``num_iters`` iterations; the result is written to theta[0] and
    # theta[1].
    m = Y.shape[0]

    intercept = 0.0
    slope = 0.0

    for _ in range(num_iters):
        residual = (intercept + slope * X) - Y
        weighted_residual = residual * X

        # Both sums come from the pre-update parameters.
        intercept = intercept - alpha * (1.0 / m) * residual.sum()
        slope = slope - alpha * (1.0 / m) * weighted_residual.sum()

    theta[0] = intercept
    theta[1] = slope
Where `a` is a scalar `x` and `y` are vectors Prefix 'S' indicates single-precision float32 operations """ from __future__ import print_function import sys import numpy from numba import cuda, vectorize, float32, void # GPU code # --------- @cuda.jit(void(float32, float32[:], float32[:], float32[:])) def saxpy(a, x, y, out): # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x i = cuda.grid(1) # Map i to array elements if i >= out.size: # Out of range? return # Do actual work out[i] = a * x[i] + y[i] """ Vectorize turns a scalar function into a elementwise operation over the input arrays. """
import logging; logging.getLogger().setLevel(0) import numba def init(): print("init") def deinit(): print("deinit") def do_something(): raise Exception('Life is pain') def handle(exn): print("handled %r" % exn) @numba.jit(numba.void()) def testfn0(): init() deinit() ''' This generates the following: define void @__numba_specialized_0___main___2E_testfn0() { entry: %tuple_result = tail call { i32, i32* }* (i32, ...)* @PyTuple_Pack(i32 0) %0 = icmp eq { i32, i32* }* %tuple_result, null br i1 %0, label %cleanup_label, label %"no_error_13:0" cleanup_label: ; preds = %"no_error_13:05", %entry, %"no_error_13:0", %"no_error_13:01" %1 = phi { i32, i32* }* [ %tuple_result4, %"no_error_13:05" ], [ null, %entry ], [ null, %"no_error_13:0" ], [ null, %"no_error_13:01" ] %2 = phi { i32, i32* }* [ %7, %"no_error_13:05" ], [ null, %entry ], [ null, %"no_error_13:0" ], [ null, %"no_error_13:01" ] %3 = phi { i32, i32* }* [ %4, %"no_error_13:05" ], [ null, %entry ], [ null, %"no_error_13:0" ], [ %4, %"no_error_13:01" ] tail call void @Py_XDECREF({ i32, i32* }* %tuple_result) tail call void @Py_XDECREF({ i32, i32* }* %3)
def test_guvectorize_decor(self):
    """Build a gufunc from ``matmulcore`` via the ``guvectorize`` decorator.

    ``guvectorize`` is first called with the type signatures, the layout
    string and ``self.target`` to obtain a decorator, which is then applied
    to ``matmulcore``.  The resulting gufunc is handed to
    ``check_matmul_gufunc`` for validation.
    """
    signatures = [void(float32[:,:], float32[:,:], float32[:,:])]
    layout = '(m,n),(n,p)->(m,p)'
    decorator = guvectorize(signatures, layout, target=self.target)
    self.check_matmul_gufunc(decorator(matmulcore))
pmt_gain = c.get('pmt_reference_gain', 2e6) if pmt_gain == 0: return 0 return adc_to_e / pmt_gain def get_detector_by_channel(config): """Return a channel -> detector lookup dictionary from a configuration""" detector_by_channel = {} for name, chs in config['channels_in_detector'].items(): for ch in chs: detector_by_channel[ch] = name return detector_by_channel @numba.jit(numba.void(numba.float64[:], numba.int64[:, :], numba.int64, numba.int64), nopython=True) def extend_intervals(w, intervals, left_extension, right_extension): """Extends intervals on w by left_extension to left and right_extension to right, never exceeding w's bounds :param w: Waveform intervals live on. Only used for edges (kind of pointless to pass...) :param intervals: numpy N*2 array of ints of interval bounds :param left_extension: Extend intervals left by this number of samples, or as far as possible until the end of another interval / the end of w. :param right_extension: Same, extend to right. :return: None, modifes intervals in place When two intervals' extension claims compete, right extension has priority. Boundary indices are inclusive, i.e. without any extension settings, the right boundary is the last index which was still above low_threshold """ n_intervals = len(intervals) last_index_in_w = len(w) - 1
def precompile(ex, signature=(), context=None):
    """Compile the expression to a callable backed by a numba-jitted kernel.

    The expression is turned into a Python function writing its result
    elementwise into a ``__result__`` array, which is jitted in nopython mode
    with a ``void(result[:], arg0[:], ...)`` signature.  A Python wrapper
    around that kernel is returned.

    :param ex: expression string, or an already-built expression object
        (anything exposing ``value``, ``children``, ``astKind`` and
        ``toPython()``).  Strings are converted with ``stringToExpression``.
    :param signature: sequence of ``(name, type)`` pairs describing the
        arguments.  When empty, argument names are taken from the expression
        and every argument is typed as ``double``.
    :param context: mapping forwarded to ``stringToExpression``; ``None``
        (the default) means a fresh empty dict.
    :return: function ``func(*args, out=None)``.  ``func.run`` forwards to
        the same function.  If the top-level node of ``ex`` is ``sum`` or
        ``prod`` the wrapper evaluates the inner expression into a temporary
        and reduces it with ``np.sum`` / ``np.prod`` (``out`` is not
        supported there and raises ``NotImplementedError``); otherwise the
        wrapper evaluates elementwise, splitting the work over
        ``utils.num_threads`` chunks.
    """
    if context is None:
        # A fresh dict per call: a shared ``{}`` default would carry state
        # from one call to the next if it were ever mutated downstream.
        context = {}

    try:
        string_types = (str, unicode)
    except NameError:
        # ``unicode`` is not a builtin on Python 3; ``str`` covers all text.
        string_types = (str,)

    # Work on a list so the signature can be concatenated with the result
    # entry below whatever sequence type (tuple, list, ...) was passed in.
    signature = list(signature)

    if isinstance(ex, string_types):
        #XXX: we might want to work directly with (python's) AST
        # and do numexpr transformations directly at that level instead of going
        # str -> Expression -> ast -> ...
        types = dict(signature)
        ex = stringToExpression(ex, types, context)

    if signature:
        argnames = [name for (name, type_) in signature]
    else:
        # this can only occur when NumExpr() is called directly without
        # signature, and in that case we have no other choice than use
        # the default type for all arguments (double)
        argnames = get_argnames(ex)
        signature = [(name, double) for name in argnames]

    if ex.value in ('sum', 'prod'):
        reduction_func = getattr(np, ex.value)
        args = ex.children
        # axis is second arg
        assert len(args) == 2
        ex, axis = args
        axis = axis.value
    else:
        reduction_func = None
        axis = None

    ast_expr = ex.toPython()
    ast_func = ast_expr_to_ast_func(ast_expr, argnames)
    inner_func = ast_func_to_func(ast_func)
    res_type = kind_to_type[ex.astKind]
    # The kernel takes the output array first, then the inputs, all 1-D.
    full_sig = [('__result__', res_type)] + signature
    arg_types = [type_to_numba[type_] for name, type_ in full_sig]
    jit_signature = void(*[t[:] for t in arg_types])
    inner_func_nb = jit(jit_signature, nopython=True)(inner_func)

    if reduction_func is not None:
        # this is a hack. To do it (more) correctly (with multithreading),
        # I would need to use a different template_func:
        # for i in range(len(__result__)):
        #     __result__[0] += __expr_placeholder__
        def func(*args, **kwargs):
            # Keyword arguments other than ``out`` (e.g. order, casting,
            # ex_uses_vml) are accepted and ignored.
            out = kwargs.pop('out', None)
            if out is not None:
                raise NotImplementedError()
            shape = args[0].shape
            flat_args = [a.ravel() for a in args]
            tmp_out = np.empty(shape, dtype=res_type)
            inner_func_nb(tmp_out.ravel(), *flat_args)
            return reduction_func(tmp_out, axis=axis)
    else:
        def func(*args, **kwargs):
            numthreads = utils.num_threads
            # at least one array argument: evaluate elementwise
            if any(isinstance(arg, np.ndarray) for arg in args):
                # Scalars mixed in with arrays have no ``.shape``; wrap them
                # as 0-d arrays so they take part in broadcasting below.
                # Real arrays are passed through untouched.
                args = [arg if isinstance(arg, np.ndarray)
                        else np.asarray(arg) for arg in args]
                shape = args[0].shape
                if any(arg.shape != shape for arg in args[1:]):
                    args = np.broadcast_arrays(*args)
                    shape = args[0].shape

                out = kwargs.pop('out', None)
                if out is None:
                    out = np.empty(shape, dtype=res_type)

                # "flatten" arguments
                # we cannot use order="K" which is most efficient, in case arguments
                # have not the same in-memory layout, because we need the same
                # target memory layout for all arguments.
                #XXX: can't we test for that and convert only if really necessary?
                flat_out = out.ravel()
                flat_args = [flat_out] + [a.ravel() for a in args]
                length = len(flat_out)

                # TODO: it might be better to make sure the starting bounds
                # are aligned to X bytes
                # TODO: it might be better to not multithread at all if
                # length < THRESHOLD
                chunklen = (length + numthreads - 1) // numthreads
                bounds = [(i * chunklen, min((i + 1) * chunklen, length))
                          for i in range(numthreads)]
                assert bounds[-1][1] == length

                chunks = [[arg[start:stop] for arg in flat_args]
                          for start, stop in bounds]

                threads = [threading.Thread(target=inner_func_nb, args=chunk)
                           for chunk in chunks[:-1]]
                for thread in threads:
                    thread.start()

                try:
                    # the main thread handles the last chunk
                    inner_func_nb(*chunks[-1])
                finally:
                    # Always wait for the workers, even if the main-thread
                    # chunk raised, so none is left writing into ``out``.
                    for thread in threads:
                        thread.join()

                # ``ravel`` returns a copy for a non-contiguous ``out``; in
                # that case the kernel filled the copy, so write it back.
                if not np.may_share_memory(flat_out, out):
                    out[...] = flat_out.reshape(out.shape)
                return out
            else:
                # all arguments are scalar
                out = np.empty(1, dtype=res_type)
                scalar_args = [out] + [np.array([a]) for a in args]
                inner_func_nb(*scalar_args)
                return out[0]

    def run(*args, **kwargs):
        return func(*args, **kwargs)
    func.run = run
    return func
if np.sqrt((x[i]-x0)**2+(y[i]-y0)**2) < r: exposure_indices = np.vstack((exposure_indices,np.array([x[i],y[i]],dtype=np.int32))) return exposure_indices outfilename = 'test.txt' @jit(float32(float32,float32,float32,float32),nopython=True) def dist(x0,y0,x,y): return math.sqrt( (x0-x)*(x0-x)+(y0-y)*(y0-y) ) @jit(void(float32[:,:],int32[:,:],float32[:]),nopython=True,parallel= True) def set_doses_field(field, exposure_indices, doses): for i in prange(doses.shape[0]): field[exposure_indices[i,0],exposure_indices[i,1]] = doses[i] @jit(void(float32[:,:],int32[:,:],float32),nopython=True) def set_target(target, exposure_indices, dose): for i in range(exposure_indices.shape[0]): target[exposure_indices[i,0],exposure_indices[i,1]] = dose @njit(void(float32[:,:],float32[:,:],float32[:],float32[:]),parallel=True) def convolve_with_vector(field,exposure,v,h): buf = np.zeros(field.shape,dtype=np.float32) for j in prange(field.shape[1]): for i in range(field.shape[0]):
savethread = pythonapi.PyEval_SaveThread savethread.argtypes = [] savethread.restype = c_void_p restorethread = pythonapi.PyEval_RestoreThread restorethread.argtypes = [c_void_p] restorethread.restype = None def inner_func(result, a, b): threadstate = savethread() for i in range(len(result)): result[i] = exp(2.1 * a[i] + 3.2 * b[i]) restorethread(threadstate) signature = void(double[:], double[:], double[:]) inner_func_nb = jit(signature, nopython=True)(inner_func) func_nb = make_singlethread(inner_func_nb) func_nb_mt = make_multithread(inner_func_nb, nthreads) def func_np(a, b): return np.exp(2.1 * a + 3.2 * b) a = np.random.rand(size) b = np.random.rand(size) c = np.random.rand(size) correct = timefunc(None, "numpy (1 thread)", func_np, a, b) timefunc(correct, "numba (1 thread)", func_nb, a, b) timefunc(correct, "numba (%d threads)" % nthreads, func_nb_mt, a, b)
import numpy as np
import unittest
from numba import void, int32, uint32, jit, int64


@jit(void(uint32[:], uint32, uint32))
def prng(X, A, C):
    """Advance every element of ``X`` in place by 100 generator steps.

    Each step replaces ``X[i]`` with ``(A * X[i] + C) & 0xffffffff``, i.e.
    the low 32 bits of the linear update.

    :param X: 1-D uint32 array of states, modified in place.
    :param A: uint32 multiplier.
    :param C: uint32 increment.
    :return: None.
    """
    for i in range(X.shape[0]):
        for j in range(100):
            v = (A * X[i] + C)
            # Keep only the low 32 bits of the intermediate value.
            X[i] = v & 0xffffffff


@jit(uint32())
def unsigned_literal():
    """Return ``abs(0xFFFFFFFF)`` through a uint32-typed return value."""
    return abs(0xFFFFFFFF)


@jit(int64())
def unsigned_literal_64():
    """Return the literal ``0x100000000`` (2**32) as an int64."""
    return 0x100000000


@jit(int64(int32))
def constant_int_add(a):
    """Return ``0xffffffff + a`` for an int32 ``a``, typed as int64."""
    return 0xffffffff + a


class Test(unittest.TestCase):
    def test_prng(self):
        N = 100
        A = 1664525
        C = 1013904223
        X0 = np.arange(N, dtype=np.uint32)
        X1 = X0.copy()