def __init__(self, context, builder):
    """
    Note: Maybe called multiple times when lowering a function

    Caches the lowering context/builder and pre-builds the LLVM types
    used when calling into the CPython C API.
    """
    # Imported for its side effects at import time; not referenced below.
    # NOTE(review): presumably a deferred import to avoid a circular
    # dependency at module load — confirm against the package layout.
    from numba.targets import boxing
    self.context = context
    self.builder = builder

    # The LLVM module owning the basic block currently being built.
    self.module = builder.basic_block.function.module
    # A unique mapping of serialized objects in this module.
    # EAFP: create the cache dict only on first use.  Because this code is
    # inside a class body, ``__serialized`` is name-mangled to
    # ``_<ClassName>__serialized`` on the module object (both references
    # here mangle identically, so the idiom is consistent).
    try:
        self.module.__serialized
    except AttributeError:
        self.module.__serialized = {}

    # Initialize types: LLVM equivalents of C / CPython types.
    self.pyobj = self.context.get_argument_type(types.pyobject)
    self.voidptr = Type.pointer(Type.int(8))
    # Integer widths follow the host platform's C types via ctypes.sizeof.
    self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
    self.ulonglong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
    # Same bit width as ulonglong; signedness is a property of the
    # operations, not the LLVM integer type.
    self.longlong = self.ulonglong
    self.double = Type.double()
    self.py_ssize_t = self.context.get_value_type(types.intp)
    self.cstring = Type.pointer(Type.int(8))
    # Opaque integer sized from the helper library's reported size of the
    # interpreter's GIL-state value.
    self.gil_state = Type.int(_helperlib.py_gil_state_size * 8)
    # Raw byte array large enough to hold a Py_buffer struct.
    self.py_buffer_t = ir.ArrayType(ir.IntType(8), _helperlib.py_buffer_size)
def __init__(self, context, builder):
    """
    Note: Maybe called multiple times when lowering a function

    Caches the lowering context/builder and pre-builds the LLVM types
    used when calling into the CPython C API.
    """
    # Imported for its side effects at import time; not referenced below.
    # NOTE(review): presumably a deferred import to avoid a circular
    # dependency at module load — confirm against the package layout.
    from numba.targets import boxing
    self.context = context
    self.builder = builder

    # The LLVM module owning the basic block currently being built.
    self.module = builder.basic_block.function.module
    # A unique mapping of serialized objects in this module.
    # EAFP: create the cache dict only on first use.  Because this code is
    # inside a class body, ``__serialized`` is name-mangled to
    # ``_<ClassName>__serialized`` on the module object (both references
    # here mangle identically, so the idiom is consistent).
    try:
        self.module.__serialized
    except AttributeError:
        self.module.__serialized = {}

    # Initialize types: LLVM equivalents of C / CPython types.
    self.pyobj = self.context.get_argument_type(types.pyobject)
    self.voidptr = Type.pointer(Type.int(8))
    # Integer widths follow the host platform's C types via ctypes.sizeof.
    # Signed/unsigned pairs share one LLVM type: signedness belongs to the
    # operations, not the integer type.
    self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
    self.ulong = self.long
    self.longlong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
    self.ulonglong = self.longlong
    self.double = Type.double()
    self.py_ssize_t = self.context.get_value_type(types.intp)
    self.cstring = Type.pointer(Type.int(8))
    # Opaque integer sized from the helper library's reported size of the
    # interpreter's GIL-state value.
    self.gil_state = Type.int(_helperlib.py_gil_state_size * 8)
    # Raw byte array large enough to hold a Py_buffer struct.
    self.py_buffer_t = ir.ArrayType(ir.IntType(8), _helperlib.py_buffer_size)
def __call__(self, *args):
    """Launch this HSA kernel with *args*.

    Each argument is flattened to ctypes scalars, packed into the
    kernarg buffer with alignment padding, then dispatched on the
    context's default queue.  The kernarg allocation is freed after
    the dispatch returns.
    """
    ctx, symbol, kernargs, kernarg_region = self.bind()

    # Unpack pyobject values into ctypes scalar values
    expanded_values = []
    for ty, val in zip(self.argument_types, args):
        _unpack_argument(ty, val, expanded_values)

    # Insert kernel arguments into the kernarg buffer
    base = 0
    for av in expanded_values:
        # Adjust for alignment (each scalar is aligned to its own size)
        align = ctypes.sizeof(av)
        pad = _calc_padding_for_alignment(align, base)
        base += pad
        # Move to offset
        offseted = ctypes.addressof(kernargs) + base
        asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
        # Assign value
        asptr[0] = av
        # Increment offset
        base += align

    # bind() returns kernargs=None when the kernel declares no kernarg
    # segment (kernarg_segment_size == 0).  In that case there is nothing
    # to bounds-check or free — and ctypes.sizeof(None) would raise
    # TypeError, crashing zero-argument kernel launches.
    if kernargs is not None:
        assert base <= ctypes.sizeof(kernargs), \
            "Kernel argument size is invalid"

    # Actual kernel launch
    qq = ctx.default_queue

    # Dispatch
    qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                grid_size=self.global_size)

    # Free kernel region
    if kernargs is not None:
        kernarg_region.free(kernargs)
def __call__(self, *args):
    """Launch this HSA kernel with *args*, optionally on ``self.stream``.

    Arguments are flattened to ctypes scalars, packed into the kernarg
    buffer with alignment padding, dispatched, then writeback callbacks
    copy auto-converted arrays back to the host.
    """
    # Enforce per-launch resource limits before doing any work.
    self._sentry_resource_limit()

    ctx, symbol, kernargs, kernarg_region = self.bind()

    # Unpack pyobject values into ctypes scalar values
    expanded_values = []
    # contains lambdas to execute on return (host writebacks)
    retr = []
    for ty, val in zip(self.argument_types, args):
        _unpack_argument(ty, val, expanded_values, retr)

    # Insert kernel arguments into the kernarg buffer.
    # Note: bind() may return kernargs=None when the kernel takes no
    # arguments; expanded_values is then empty so this loop never
    # dereferences it.
    base = 0
    for av in expanded_values:
        # Adjust for alignment (each scalar is aligned to its own size)
        align = ctypes.sizeof(av)
        pad = _calc_padding_for_alignment(align, base)
        base += pad
        # Move to offset
        offseted = kernargs.value + base
        asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
        # Assign value
        asptr[0] = av
        # Increment offset
        base += align

    # Actual kernel launch
    qq = ctx.default_queue

    # Without an explicit stream, synchronize implicitly before launch.
    if self.stream is None:
        hsa.implicit_sync()

    # Dispatch.  With a stream, a completion signal (initial value 1) is
    # created and a barrier orders this launch after the stream's last
    # recorded signal.
    signal = None
    if self.stream is not None:
        signal = hsa.create_signal(1)
        qq.insert_barrier(self.stream._get_last_signal())
    qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                grid_size=self.global_size, signal=signal)

    # Record the completion signal so later stream work can order on it.
    if self.stream is not None:
        self.stream._add_signal(signal)

    # retrieve auto converted arrays
    for wb in retr:
        wb()

    # Free kernel region — immediately when synchronous, or deferred to a
    # stream callback so the buffer outlives the asynchronous dispatch.
    if kernargs is not None:
        if self.stream is None:
            kernarg_region.free(kernargs)
        else:
            self.stream._add_callback(
                lambda: kernarg_region.free(kernargs))
def __init__(self, context, builder):
    """
    Note: Maybe called multiple times when lowering a function

    Caches the lowering context/builder and pre-builds the LLVM types
    used when calling into the CPython C API.
    """
    # Apply runtime patches to the Python C-API bindings before use.
    # NOTE(review): exact patching behavior is defined elsewhere — confirm.
    fix_python_api()
    self.context = context
    self.builder = builder
    # The LLVM module owning the basic block currently being built.
    self.module = builder.basic_block.function.module
    # Initialize types: LLVM equivalents of C / CPython types.
    self.pyobj = self.context.get_argument_type(types.pyobject)
    # Integer widths follow the host platform's C types via ctypes.sizeof.
    self.long = Type.int(ctypes.sizeof(ctypes.c_long) * 8)
    self.ulonglong = Type.int(ctypes.sizeof(ctypes.c_ulonglong) * 8)
    # Same bit width as ulonglong; signedness is a property of the
    # operations, not the LLVM integer type.
    self.longlong = self.ulonglong
    self.double = Type.double()
    self.py_ssize_t = self.context.get_value_type(types.intp)
    self.cstring = Type.pointer(Type.int(8))
def __call__(self, *args):
    """Launch this HSA kernel with *args*, optionally on ``self.stream``.

    Arguments are flattened to ctypes scalars, packed into the kernarg
    buffer with alignment padding, dispatched, then writeback callbacks
    copy auto-converted arrays back to the host.
    """
    # Enforce per-launch resource limits before doing any work.
    self._sentry_resource_limit()

    ctx, symbol, kernargs, kernarg_region = self.bind()

    # Unpack pyobject values into ctypes scalar values
    expanded_values = []
    # contains lambdas to execute on return (host writebacks)
    retr = []
    for ty, val in zip(self.argument_types, args):
        _unpack_argument(ty, val, expanded_values, retr)

    # Insert kernel arguments into the kernarg buffer.
    # Note: bind() may return kernargs=None when the kernel takes no
    # arguments; expanded_values is then empty so this loop never
    # dereferences it.
    base = 0
    for av in expanded_values:
        # Adjust for alignment (each scalar is aligned to its own size)
        align = ctypes.sizeof(av)
        pad = _calc_padding_for_alignment(align, base)
        base += pad
        # Move to offset
        offseted = kernargs.value + base
        asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
        # Assign value
        asptr[0] = av
        # Increment offset
        base += align

    # Actual kernel launch
    qq = ctx.default_queue

    # Without an explicit stream, synchronize implicitly before launch.
    if self.stream is None:
        hsa.implicit_sync()

    # Dispatch.  With a stream, a completion signal (initial value 1) is
    # created and a barrier orders this launch after the stream's last
    # recorded signal.
    signal = None
    if self.stream is not None:
        signal = hsa.create_signal(1)
        qq.insert_barrier(self.stream._get_last_signal())
    qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                grid_size=self.global_size, signal=signal)

    # Record the completion signal so later stream work can order on it.
    if self.stream is not None:
        self.stream._add_signal(signal)

    # retrieve auto converted arrays
    for wb in retr:
        wb()

    # Free kernel region — immediately when synchronous, or deferred to a
    # stream callback so the buffer outlives the asynchronous dispatch.
    if kernargs is not None:
        if self.stream is None:
            kernarg_region.free(kernargs)
        else:
            self.stream._add_callback(lambda: kernarg_region.free(kernargs))
def allocate(self, nd):
    """Return a device buffer big enough for an *nd*-dimensional header.

    Reuses a block from the free queue when one fits; otherwise falls
    back to a fresh, untracked allocation via ``_allocate_head``.
    """
    required = ctypes.sizeof(make_array_ctype(nd))
    # Pool is usable only when the block size fits and the queue is
    # non-empty (De Morgan of the original oversized/empty test).
    pool_usable = required <= self.elemsize and bool(self.queue)
    if not pool_usable:
        return _allocate_head(nd)
    block = self.queue.popleft()
    self.allocated.add(block)
    return block
def _kernel_call(self, args, griddim, blockdim, stream=0, sharedmem=0):
    """Configure and launch the compiled CUDA kernel.

    Parameters
    ----------
    args : sequence
        Python-level arguments, paired positionally with
        ``self.argument_types``.
    griddim, blockdim :
        Grid/block dimensions forwarded to ``cufunc.configure``.
    stream : int, optional
        CUDA stream handle (0 = default stream).
    sharedmem : int, optional
        Dynamic shared-memory size in bytes.

    Raises the exception recorded by the kernel when ``self.debug`` is
    enabled and the device-side error code is nonzero.
    """
    # Prepare kernel
    cufunc = self._func.get()

    if self.debug:
        # Debug builds expose a per-kernel int error-code global; clear it
        # before launch so a nonzero value afterwards means a failure.
        excname = cufunc.name + "__errcode__"
        excmem, excsz = cufunc.module.get_global_symbol(excname)
        assert excsz == ctypes.sizeof(ctypes.c_int)
        excval = ctypes.c_int()
        excmem.memset(0, stream=stream)

    # Prepare arguments
    retr = []  # hold functors for writeback
    kernelargs = []
    for t, v in zip(self.argument_types, args):
        self._prepare_args(t, v, stream, retr, kernelargs)

    # Configure kernel
    cu_func = cufunc.configure(griddim, blockdim,
                               stream=stream,
                               sharedmem=sharedmem)
    # Invoke kernel
    cu_func(*kernelargs)

    if self.debug:
        # Copy the error code back to the host and inspect it.
        driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
        if excval.value != 0:
            # An error occurred
            def load_symbol(name):
                # Read the int global "<kernel>__<name>__" from the module.
                mem, sz = cufunc.module.get_global_symbol(
                    "%s__%s__" % (cufunc.name, name))
                val = ctypes.c_int()
                driver.device_to_host(ctypes.addressof(val), mem, sz)
                return val.value

            # Thread/block indices recorded device-side, in z, y, x order.
            tid = [load_symbol("tid" + i) for i in 'zyx']
            ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
            code = excval.value
            # Map the numeric code back to the exception class and args.
            exccls, exc_args = self.call_helper.get_exception(code)
            # Prefix the exception message with the thread position
            prefix = "tid=%s ctaid=%s" % (tid, ctaid)
            if exc_args:
                exc_args = ("%s: %s" % (prefix, exc_args[0]), ) + \
                    exc_args[1:]
            else:
                exc_args = prefix,
            raise exccls(*exc_args)

    # retrieve auto converted arrays
    for wb in retr:
        wb()
def _kernel_call(self, args, griddim, blockdim, stream=0, sharedmem=0):
    """Configure and launch the compiled CUDA kernel.

    Parameters
    ----------
    args : sequence
        Python-level arguments, paired positionally with
        ``self.argument_types``.
    griddim, blockdim :
        Grid/block dimensions forwarded to ``cufunc.configure``.
    stream : int, optional
        CUDA stream handle (0 = default stream).
    sharedmem : int, optional
        Dynamic shared-memory size in bytes.

    Raises the exception recorded by the kernel when ``self.debug`` is
    enabled and the device-side error code is nonzero.
    """
    # Prepare kernel
    cufunc = self._func.get()

    if self.debug:
        # Debug builds expose a per-kernel int error-code global; clear it
        # before launch so a nonzero value afterwards means a failure.
        excname = cufunc.name + "__errcode__"
        excmem, excsz = cufunc.module.get_global_symbol(excname)
        assert excsz == ctypes.sizeof(ctypes.c_int)
        excval = ctypes.c_int()
        excmem.memset(0, stream=stream)

    # Prepare arguments
    retr = []  # hold functors for writeback
    kernelargs = []
    for t, v in zip(self.argument_types, args):
        self._prepare_args(t, v, stream, retr, kernelargs)

    # Configure kernel
    cu_func = cufunc.configure(griddim, blockdim,
                               stream=stream,
                               sharedmem=sharedmem)
    # Invoke kernel
    cu_func(*kernelargs)

    if self.debug:
        # Copy the error code back to the host and inspect it.
        driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
        if excval.value != 0:
            # An error occurred
            def load_symbol(name):
                # Read the int global "<kernel>__<name>__" from the module.
                mem, sz = cufunc.module.get_global_symbol("%s__%s__" %
                                                          (cufunc.name,
                                                           name))
                val = ctypes.c_int()
                driver.device_to_host(ctypes.addressof(val), mem, sz)
                return val.value

            # Thread/block indices recorded device-side, in z, y, x order.
            tid = [load_symbol("tid" + i) for i in 'zyx']
            ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
            code = excval.value
            # Map the numeric code back to the exception class and args.
            exccls, exc_args = self.call_helper.get_exception(code)
            # Prefix the exception message with the thread position
            prefix = "tid=%s ctaid=%s" % (tid, ctaid)
            if exc_args:
                exc_args = ("%s: %s" % (prefix, exc_args[0]),) + exc_args[1:]
            else:
                exc_args = prefix,
            raise exccls(*exc_args)

    # retrieve auto converted arrays
    for wb in retr:
        wb()
def bind(self):
    """
    Bind kernel to device.

    Returns ``(ctx, symbol, kernargs, kernarg_region)``; *kernargs* is
    ``None`` when the kernel declares an empty kernarg segment.
    """
    ctx, entry = self._cacheprog.get()
    seg_size = entry.symbol.kernarg_segment_size
    if seg_size > 0:
        nbytes = ctypes.sizeof(ctypes.c_byte) * seg_size
        kernargs = entry.kernarg_region.allocate(nbytes)
    else:
        kernargs = None
    return ctx, entry.symbol, kernargs, entry.kernarg_region
def make_array_ctype(ndim):
    """Create a array header type for a given dimension.

    The struct holds a parent pointer, a data pointer, and ``ndim``
    ssize_t entries each for shape and strides.
    """
    intp_t = ctypes.c_ssize_t

    class c_array(ctypes.Structure):
        _fields_ = [('parent', ctypes.c_void_p),
                    ('data', ctypes.c_void_p),
                    ('shape', intp_t * ndim),
                    ('strides', intp_t * ndim)]

    # Sanity check: the GPU header layout must mirror the CPU array struct.
    expected = _calc_array_sizeof(ndim)
    assert ctypes.sizeof(c_array) == expected, \
        "sizeof(CUDA array struct) != sizeof(CPU array struct)"
    return c_array
def ndarray_populate_head(gpu_head, gpu_data, shape, strides, stream=0):
    """
    Populate the array header

    Builds the ctypes header struct on the host (device data pointer plus
    shape/strides) and copies it into *gpu_head* on *stream*.
    """
    ndim = len(shape)
    assert ndim > 0, "0 or negative dimension"

    header_cls = make_array_ctype(ndim)
    header = header_cls(data=driver.device_pointer(gpu_data),
                        shape=shape, strides=strides)
    driver.host_to_device(gpu_head, header, ctypes.sizeof(header),
                          stream=stream)
    # Keep the data buffer alive at least as long as the header refers to it.
    driver.device_memory_depends(gpu_head, gpu_data)
def ndarray_populate_head(gpu_mem, gpu_data, shape, strides, stream=0):
    """
    Populate the array header

    Allocates a header block from *gpu_mem*, fills it with the device
    data pointer plus shape/strides, and returns the header allocation.
    """
    ndim = len(shape)
    assert ndim > 0, "0 or negative dimension"

    header_cls = make_array_ctype(ndim)
    header = header_cls(data=driver.device_pointer(gpu_data),
                        shape=shape, strides=strides)

    gpu_head = gpu_mem.allocate(ndim)
    # View the ctypes struct as raw bytes so it can be written to the device.
    raw_bytes = np.ndarray(shape=ctypes.sizeof(header), dtype=np.byte,
                           buffer=header)
    gpu_mem.write(raw_bytes, gpu_head, stream=stream)
    # Keep the data buffer alive at least as long as the header refers to it.
    driver.device_memory_depends(gpu_head, gpu_data)
    return gpu_head
def ndarray_populate_head(gpu_mem, gpu_data, shape, strides, stream=0):
    """
    Populate the array header

    Allocates a header block from *gpu_mem*, fills it (no parent, device
    data pointer, shape/strides), and returns the header allocation.
    """
    ndim = len(shape)
    assert ndim > 0, "0 or negative dimension"

    header_cls = make_array_ctype(ndim)
    header = header_cls(parent=None,
                        data=driver.device_pointer(gpu_data),
                        shape=shape, strides=strides)

    gpu_head = gpu_mem.allocate(ndim)
    # View the ctypes struct as raw bytes so it can be written to the device.
    raw_bytes = np.ndarray(shape=ctypes.sizeof(header), dtype=np.byte,
                           buffer=header)
    gpu_mem.write(raw_bytes, gpu_head, stream=stream)
    # Keep the data buffer alive at least as long as the header refers to it.
    driver.device_memory_depends(gpu_head, gpu_data)
    return gpu_head
def _allocate_head(nd):
    """Allocate the metadata structure

    Allocates device memory sized for an *nd*-dimensional array header.
    """
    header_type = make_array_ctype(nd)
    context = devices.get_context()
    return context.memalloc(ctypes.sizeof(header_type))
def _test_array(self, ndim):
    """Check the GPU header struct size matches the CPU-side size."""
    struct_type = make_array_ctype(ndim)
    actual = ctypes.sizeof(struct_type)
    self.assertEqual(actual, self._cpu_array_sizeof(ndim))