def llvm_cpython_wrapper_name(self):
    """
    Return the name registered with LLVM for the CPython-compatible
    wrapper around the raw function (i.e. a PyCFunctionWithKeywords).

    The name is this object's ``mangled_name`` with the ``cpython``
    namespace prepended by the Itanium mangler.
    """
    raw_name = self.mangled_name
    return itanium_mangler.prepend_namespace(raw_name, ns="cpython")
def prepare_cuda_kernel(self, codelib, func_name, argtypes, debug,
                        nvvm_options, max_registers=None):
    """
    Build a kernel library around an already-compiled device function.

    A fresh code library is created, ``codelib`` is linked into it, and a
    wrapper function usable as the kernel entry point is generated inside
    it. The wrapper calls the device function named ``func_name`` with
    arguments of types ``argtypes``.

    Parameters:

    codelib:       The CodeLibrary containing the device function to wrap
                   in a kernel call.
    func_name:     The mangled name of the device function.
    argtypes:      An iterable of the types of arguments to the kernel.
    debug:         Whether to compile with debug.
    nvvm_options:  Dict of NVVM options used when compiling the new library.
    max_registers: The max_registers argument for the code library.

    Returns a ``(library, wrapper)`` tuple: the new code library and the
    wrapper function.
    """
    # The kernel entry point is the device function's name placed in the
    # ``cudapy`` namespace.
    entry_point = itanium_mangler.prepend_namespace(func_name, ns='cudapy')

    creation_options = {
        'entry_name': entry_point,
        'nvvm_options': nvvm_options,
        'max_registers': max_registers,
    }
    codegen = self.codegen()
    kernel_lib = codegen.create_library(f'{codelib.name}_kernel_',
                                        **creation_options)

    # The device function lives in ``codelib``; link it so that the wrapper
    # can reach it.
    kernel_lib.add_linking_library(codelib)

    # NOTE(review): five positional arguments are passed here; confirm that
    # the generate_kernel_wrapper of this class accepts
    # (library, kernel_name, func_name, argtypes, debug).
    kernel_wrapper = self.generate_kernel_wrapper(
        kernel_lib, entry_point, func_name, argtypes, debug)
    return kernel_lib, kernel_wrapper
def generate_kernel_wrapper(self, library, fname, argtypes, debug):
    """
    Generate the kernel wrapper in the given ``library``.

    The function being wrapped has the name ``fname`` and argument types
    ``argtypes``. The wrapper is emitted into a new module, named after
    ``fname`` with the ``cudapy`` namespace prepended, marked as a CUDA
    kernel and added to ``library``, which is then finalized.

    Parameters:

    library:  The code library that receives the wrapper module. It is
              finalized by this method.
    fname:    Name of the function to wrap; it is declared (not defined)
              in the wrapper module.
    argtypes: Types of the arguments of the wrapped function.
    debug:    When true, the wrapper inspects the status returned by the
              wrapped function and records the first error code together
              with the thread and block indices in global variables.
              When false, the status is ignored.

    Returns the wrapper function as looked up in the finalized library.
    """
    # ``arginfo`` supplies the LLVM-level argument types of the wrapper and
    # is used below to turn the wrapper's arguments into call arguments.
    arginfo = self.get_arg_packer(argtypes)
    argtys = list(arginfo.argument_types)
    # The wrapper (kernel entry point) returns void.
    wrapfnty = ir.FunctionType(ir.VoidType(), argtys)
    wrapper_module = self.create_module("cuda.kernel.wrapper")
    # Declaration of the wrapped function: it returns an i32 and takes the
    # calling convention's return-value slot ahead of the real arguments.
    fnty = ir.FunctionType(
        ir.IntType(32),
        [self.call_conv.get_return_type(types.pyobject)] + argtys)
    func = ir.Function(wrapper_module, fnty, fname)

    prefixed = itanium_mangler.prepend_namespace(func.name, ns='cudapy')
    wrapfn = ir.Function(wrapper_module, wrapfnty, prefixed)
    builder = ir.IRBuilder(wrapfn.append_basic_block(''))

    # Define error handling variables
    def define_error_gv(postfix):
        # One i32 global per item, named after the wrapper plus ``postfix``
        # and initialised with the null constant of its pointee type.
        name = wrapfn.name + postfix
        gv = cgutils.add_global_variable(wrapper_module, ir.IntType(32),
                                         name)
        gv.initializer = ir.Constant(gv.type.pointee, None)
        return gv

    # Error code, plus one global per dimension for the thread index (tid)
    # and the block index (ctaid).
    gv_exc = define_error_gv("__errcode__")
    gv_tid = []
    gv_ctaid = []
    for i in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % i))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))

    # Call the wrapped function; its return value is discarded and only the
    # status is (optionally) examined.
    callargs = arginfo.from_arguments(builder, wrapfn.args)
    status, _ = self.call_conv.call_function(builder, func, types.void,
                                             argtypes, callargs)

    if debug:
        # Check error status
        with cgutils.if_likely(builder, status.is_ok):
            builder.ret_void()

        with builder.if_then(builder.not_(status.is_python_exc)):
            # User exception raised
            old = ir.Constant(gv_exc.type.pointee, None)

            # Use atomic cmpxchg to prevent rewriting the error status
            # Only the first error is recorded

            if nvvm.NVVM().is_nvvm70:
                # Element 1 of the cmpxchg result is the success flag
                # (per LLVM's cmpxchg semantics).
                xchg = builder.cmpxchg(gv_exc, old, status.code,
                                       'monotonic', 'monotonic')
                changed = builder.extract_value(xchg, 1)
            else:
                # Otherwise declare and call a function named
                # ``___numba_atomic_i32_cas_hack`` instead of emitting a
                # cmpxchg instruction; the exchange counts as successful
                # when the returned value equals ``old``.
                # NOTE(review): the definition of that function is not
                # visible here - presumably it is provided at link time.
                casfnty = ir.FunctionType(
                    old.type, [gv_exc.type, old.type, old.type])

                cas_hack = "___numba_atomic_i32_cas_hack"
                casfn = ir.Function(wrapper_module, casfnty,
                                    name=cas_hack)
                xchg = builder.call(casfn, [gv_exc, old, status.code])
                changed = builder.icmp_unsigned('==', xchg, old)

            # If the xchange is successful, save the thread ID.
            sreg = nvvmutils.SRegBuilder(builder)
            with builder.if_then(changed):
                for dim, ptr, in zip("xyz", gv_tid):
                    val = sreg.tid(dim)
                    builder.store(val, ptr)

                for dim, ptr, in zip("xyz", gv_ctaid):
                    val = sreg.ctaid(dim)
                    builder.store(val, ptr)

    builder.ret_void()

    nvvm.set_cuda_kernel(wrapfn)
    library.add_ir_module(wrapper_module)
    library.finalize()
    # Return the function object owned by the finalized library rather than
    # the one built in the temporary wrapper module.
    wrapfn = library.get_function(wrapfn.name)
    return wrapfn
# file axpy.cubin # SASS # come back to this once toolkit downloaded # cuobjdump -sass axpy.cubin # Load module from numba.cuda.cudadrv.driver import load_module_image # noqa from numba.core import itanium_mangler # noqa ctx = cuda.get_current_device().get_primary_context() module = load_module_image(ctx, cubin) mangled_name = itanium_mangler.prepend_namespace(fname, ns='cudapy') cufunc = module.get_function(mangled_name) type(cufunc) # Launch kernel # Copy our arrays to the device - normally Numba does this for us d_r = cuda.to_device(r) d_x = cuda.to_device(x) d_y = cuda.to_device(y) # A couple of helpers from Numba's CUDA driver implementation from numba.cuda.cudadrv.driver import (device_pointer, is_device_memory, device_ctypes_pointer) # noqa