def convert_types(restype, argtypes):
    """Normalize a (restype, argtypes) pair.

    If *restype* is itself a full signature (e.g. a type string such as
    ``"void(int32[:])"``), split it into separate return and argument
    types; in that case *argtypes* must not also be supplied.
    """
    if not sigutils.is_signature(restype):
        return restype, argtypes
    # restype actually holds a complete signature; argtypes must be unused.
    assert argtypes is None
    argtypes, restype = sigutils.normalize_signature(restype)
    return restype, argtypes
def jit(signature=None, device=False):
    """JIT compile a python function conforming to the HSA-Python
    specification.

    May be used bare (``@jit``), called with a signature, or called with
    only keyword arguments to obtain a configured decorator.
    """
    if signature is None:
        # Called as ``jit()`` / ``jit(device=...)``: hand back a decorator.
        return autojit(device=device)
    if sigutils.is_signature(signature):
        # An explicit signature was supplied; dispatch on function kind.
        return _device_jit(signature) if device else _kernel_jit(signature)
    # ``signature`` is actually the function being decorated (bare @jit).
    return autojit(device=device)(signature)
def jit(func_or_sig=None, argtypes=None, device=False, inline=False, bind=True,
        link=None, debug=None, **kws):
    """
    JIT compile a python function conforming to the CUDA Python
    specification.  If a signature is supplied, then a function is returned
    that takes a function to compile and returns the compiled kernel.

    :param func_or_sig: A function to JIT compile, or a signature of a function
       to compile. If a function is supplied, then an
       :class:`AutoJitCUDAKernel` is returned. If a signature is supplied, then
       a function which takes a function to compile and returns an
       :class:`AutoJitCUDAKernel` is returned.

       .. note:: A kernel cannot have any return value.
    :type func_or_sig: function or numba.typing.Signature
    :param device: Indicates whether this is a device function.
    :type device: bool
    :param bind: Force binding to CUDA context immediately
    :type bind: bool
    :param link: A list of files containing PTX source to link with the
       function
    :type link: list
    :param debug: If True, check for exceptions thrown when executing the
       kernel. Since this degrades performance, this should only be used for
       debugging purposes.  Defaults to False.  (The default value can be
       overridden by setting environment variable ``NUMBA_CUDA_DEBUGINFO=1``.)
    :param fastmath: If true, enables flush-to-zero and fused-multiply-add,
       disables precise division and square root. This parameter has no effect
       on device function, whose fastmath setting depends on the kernel
       function from which they are called.
    """
    # ``link`` previously defaulted to a shared mutable list; use a fresh
    # list per call instead (passing ``link=[]`` explicitly is equivalent).
    link = [] if link is None else link
    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug

    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    fastmath = kws.get('fastmath', False)
    if argtypes is None and not sigutils.is_signature(func_or_sig):
        if func_or_sig is None:
            # Called with only keyword options: return a decorator.
            if config.ENABLE_CUDASIM:
                def autojitwrapper(func):
                    return FakeCUDAKernel(func, device=device,
                                          fastmath=fastmath, debug=debug)
            else:
                def autojitwrapper(func):
                    return jit(func, device=device, bind=bind, debug=debug,
                               **kws)

            return autojitwrapper
        # func_or_sig is a function
        else:
            if config.ENABLE_CUDASIM:
                return FakeCUDAKernel(func_or_sig, device=device,
                                      fastmath=fastmath, debug=debug)
            elif device:
                return jitdevice(func_or_sig, debug=debug, **kws)
            else:
                targetoptions = kws.copy()
                targetoptions['debug'] = debug
                return AutoJitCUDAKernel(func_or_sig, bind=bind,
                                         targetoptions=targetoptions)

    else:
        # An explicit signature (or deprecated argtypes) was provided.
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device,
                                      fastmath=fastmath, debug=debug)
            return jitwrapper

        restype, argtypes = convert_types(func_or_sig, argtypes)

        if restype and not device and restype != types.void:
            raise TypeError("CUDA kernel must have void return type.")

        def kernel_jit(func):
            kernel = compile_kernel(func, argtypes, link=link, debug=debug,
                                    inline=inline, fastmath=fastmath)
            # Force compilation for the current context
            if bind:
                kernel.bind()
            return kernel

        def device_jit(func):
            return compile_device(func, restype, argtypes, inline=inline,
                                  debug=debug)

        if device:
            return device_jit
        else:
            return kernel_jit
def jit(signature_or_function=None, argtypes=None, restype=None, locals=None,
        target='cpu', **targetoptions):
    """jit([signature_or_function, [locals={}, [target='cpu',
            [**targetoptions]]]])

    The function can be used as the following versions:

    1) jit(signature, [target='cpu', [**targetoptions]]) -> jit(function)

        Equivalent to:

            d = dispatcher(function, targetoptions)
            d.compile(signature)

        Create a dispatcher object for a python function and default
        target-options.  Then, compile the function with the given signature.

        Example:

            @jit("void(int32, float32)")
            def foo(x, y):
                return x + y

    2) jit(function) -> dispatcher

        Same as old autojit.  Create a dispatcher function object that
        specialize at call site.

        Example:

            @jit
            def foo(x, y):
                return x + y

    3) jit([target='cpu', [**targetoptions]]) -> configured_jit(function)

        Same as old autojit and 2).  But configure with target and default
        target-options.

        Example:

            @jit(target='cpu', nopython=True)
            def foo(x, y):
                return x + y

    Target Options
    ---------------
    The CPU (default target) defines the following:

    - nopython: [bool]

        Set to True to disable the use of PyObjects and Python API
        calls.  The default behavior is to allow the use of PyObjects
        and Python API.  Default value is False.

    - forceobj: [bool]

        Set to True to force the use of PyObjects for every value.
        Default value is False.
    """
    # Avoid a shared mutable default for ``locals``; create a fresh dict per
    # call (passing ``locals={}`` explicitly behaves identically).
    locals = {} if locals is None else locals

    # Handle deprecated argtypes and restype keyword arguments
    if argtypes is not None:
        assert signature_or_function is None, "argtypes used but " \
                                              "signature is provided"
        warnings.warn("Keyword argument 'argtypes' is deprecated",
                      DeprecationWarning)
        if restype is None:
            signature_or_function = tuple(argtypes)
        else:
            signature_or_function = restype(*argtypes)

    # Handle signature
    if signature_or_function is None:
        # Used as autojit: return a decorator carrying the configuration.
        def configured_jit(arg):
            return jit(arg, locals=locals, target=target, **targetoptions)
        return configured_jit
    elif sigutils.is_signature(signature_or_function):
        # Function signature is provided
        sig = signature_or_function
        return _jit(sig, locals=locals, target=target,
                    targetoptions=targetoptions)
    else:
        # No signature is provided; build a call-site-specializing dispatcher.
        pyfunc = signature_or_function
        dispatcher = registry.target_registry[target]
        dispatcher = dispatcher(py_func=pyfunc, locals=locals,
                                targetoptions=targetoptions)
        # NOTE This affects import time for large function
        # # Compile a pure object mode
        # if target == 'cpu' and not targetoptions.get('nopython', False):
        #     dispatcher.compile((), locals=locals, forceobj=True)
        return dispatcher
def jit(func_or_sig=None, argtypes=None, device=False, inline=False, bind=True,
        link=None, debug=False, **kws):
    """
    JIT compile a python function conforming to the CUDA Python
    specification.  If a signature is supplied, then a function is returned
    that takes a function to compile and returns the compiled kernel.

    :param func_or_sig: A function to JIT compile, or a signature of a function
       to compile. If a function is supplied, then an
       :class:`AutoJitCUDAKernel` is returned. If a signature is supplied, then
       a function which takes a function to compile and returns an
       :class:`AutoJitCUDAKernel` is returned.

       .. note:: A kernel cannot have any return value.
    :type func_or_sig: function or numba.typing.Signature
    :param device: Indicates whether this is a device function.
    :type device: bool
    :param bind: Force binding to CUDA context immediately
    :type bind: bool
    :param link: A list of files containing PTX source to link with the
       function
    :type link: list
    :param debug: If True, check for exceptions thrown when executing the
       kernel. Since this degrades performance, this should only be used for
       debugging purposes.
    :param fastmath: If true, enables flush-to-zero and fused-multiply-add,
       disables precise division and square root. This parameter has no effect
       on device function, whose fastmath setting depends on the kernel
       function from which they are called.
    """
    # ``link`` previously defaulted to a shared mutable list; use a fresh
    # list per call instead (passing ``link=[]`` explicitly is equivalent).
    link = [] if link is None else link
    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    # BUG FIX: ``fastmath`` was previously assigned only in the signature
    # branch below, so the simulator code paths in the first branch raised
    # NameError.  Hoist it so both branches can use it.
    fastmath = kws.get('fastmath', False)

    if argtypes is None and not sigutils.is_signature(func_or_sig):
        if func_or_sig is None:
            # Called with only keyword options: return a decorator.
            if config.ENABLE_CUDASIM:
                def autojitwrapper(func):
                    return FakeCUDAKernel(func, device=device,
                                          fastmath=fastmath, debug=debug)
            else:
                def autojitwrapper(func):
                    # Propagate ``debug`` explicitly; it was previously
                    # dropped on this recursive call.
                    return jit(func, device=device, bind=bind, debug=debug,
                               **kws)

            return autojitwrapper
        # func_or_sig is a function
        else:
            if config.ENABLE_CUDASIM:
                return FakeCUDAKernel(func_or_sig, device=device,
                                      fastmath=fastmath, debug=debug)
            elif device:
                return jitdevice(func_or_sig, debug=debug, **kws)
            else:
                targetoptions = kws.copy()
                targetoptions['debug'] = debug
                return AutoJitCUDAKernel(func_or_sig, bind=bind,
                                         targetoptions=targetoptions)
    else:
        # An explicit signature (or deprecated argtypes) was provided.
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device,
                                      fastmath=fastmath, debug=debug)
            return jitwrapper

        restype, argtypes = convert_types(func_or_sig, argtypes)

        if restype and not device and restype != types.void:
            raise TypeError("CUDA kernel must have void return type.")

        def kernel_jit(func):
            kernel = compile_kernel(func, argtypes, link=link, debug=debug,
                                    inline=inline, fastmath=fastmath)
            # Force compilation for the current context
            if bind:
                kernel.bind()
            return kernel

        def device_jit(func):
            return compile_device(func, restype, argtypes, inline=inline,
                                  debug=debug)

        if device:
            return device_jit
        else:
            return kernel_jit
def jit(restype=None, argtypes=None, device=False, inline=False, bind=True,
        link=None, debug=False, **kws):
    """JIT compile a python function conforming to the CUDA-Python
    specification.

    To define a CUDA kernel that takes two int 1D-arrays::

        @cuda.jit('void(int32[:], int32[:])')
        def foo(aryA, aryB):
            ...

    .. note:: A kernel cannot have any return value.

    To launch the cuda kernel::

        griddim = 1, 2
        blockdim = 3, 4
        foo[griddim, blockdim](aryA, aryB)

    ``griddim`` is the number of thread-block per grid.
    It can be:

    * an int;
    * tuple-1 of ints;
    * tuple-2 of ints.

    ``blockdim`` is the number of threads per block.
    It can be:

    * an int;
    * tuple-1 of ints;
    * tuple-2 of ints;
    * tuple-3 of ints.

    The above code is equivalent to the following CUDA-C.

    .. code-block:: c

        dim3 griddim(1, 2);
        dim3 blockdim(3, 4);
        foo<<<griddim, blockdim>>>(aryA, aryB);

    To access the compiled PTX code::

        print(foo.ptx)

    To define a CUDA device function that takes two ints and returns a int::

        @cuda.jit('int32(int32, int32)', device=True)
        def bar(a, b):
            ...

    To force inline the device function::

        @cuda.jit('int32(int32, int32)', device=True, inline=True)
        def bar_forced_inline(a, b):
            ...

    A device function can only be used inside another kernel.
    It cannot be called from the host.

    Using ``bar`` in a CUDA kernel::

        @cuda.jit('void(int32[:], int32[:], int32[:])')
        def use_bar(aryA, aryB, aryOut):
            i = cuda.grid(1) # global position of the thread for a 1D grid.
            aryOut[i] = bar(aryA[i], aryB[i])

    When the function signature is not given, this decorator behaves like
    autojit.
    """
    # ``link`` previously defaulted to a shared mutable list; use a fresh
    # list per call instead (passing ``link=[]`` explicitly is equivalent).
    link = [] if link is None else link

    if argtypes is None and not sigutils.is_signature(restype):
        if restype is None:
            # Called with only keyword options: behave like autojit.
            return autojit(device=device, bind=bind, link=link, debug=debug,
                           inline=inline, **kws)
        # restype is a function
        else:
            decor = autojit(device=device, bind=bind, link=link, debug=debug,
                            inline=inline, **kws)
            return decor(restype)
    else:
        restype, argtypes = convert_types(restype, argtypes)

        if restype and not device and restype != types.void:
            raise TypeError("CUDA kernel must have void return type.")

        def kernel_jit(func):
            kernel = compile_kernel(func, argtypes, link=link, debug=debug,
                                    inline=inline)
            # Force compilation for the current context
            if bind:
                kernel.bind()
            return kernel

        def device_jit(func):
            return compile_device(func, restype, argtypes, inline=inline,
                                  debug=debug)

        if device:
            return device_jit
        else:
            return kernel_jit
def jit(restype=None, argtypes=None, device=False, inline=False, bind=True,
        link=None, debug=False, **kws):
    """JIT compile a python function conforming to the CUDA-Python
    specification.

    To define a CUDA kernel that takes two int 1D-arrays::

        @cuda.jit('void(int32[:], int32[:])')
        def foo(aryA, aryB):
            ...

    .. note:: A kernel cannot have any return value.

    To launch the cuda kernel::

        griddim = 1, 2
        blockdim = 3, 4
        foo[griddim, blockdim](aryA, aryB)

    ``griddim`` is the number of thread-block per grid.
    It can be:

    * an int;
    * tuple-1 of ints;
    * tuple-2 of ints.

    ``blockdim`` is the number of threads per block.
    It can be:

    * an int;
    * tuple-1 of ints;
    * tuple-2 of ints;
    * tuple-3 of ints.

    The above code is equivalent to the following CUDA-C.

    .. code-block:: c

        dim3 griddim(1, 2);
        dim3 blockdim(3, 4);
        foo<<<griddim, blockdim>>>(aryA, aryB);

    To access the compiled PTX code::

        print(foo.ptx)

    To define a CUDA device function that takes two ints and returns a int::

        @cuda.jit('int32(int32, int32)', device=True)
        def bar(a, b):
            ...

    To force inline the device function::

        @cuda.jit('int32(int32, int32)', device=True, inline=True)
        def bar_forced_inline(a, b):
            ...

    A device function can only be used inside another kernel.
    It cannot be called from the host.

    Using ``bar`` in a CUDA kernel::

        @cuda.jit('void(int32[:], int32[:], int32[:])')
        def use_bar(aryA, aryB, aryOut):
            i = cuda.grid(1) # global position of the thread for a 1D grid.
            aryOut[i] = bar(aryA[i], aryB[i])

    When the function signature is not given, this decorator behaves like
    autojit.

    The following additional options are available for kernel functions only.
    They are ignored in device function.

    - fastmath: bool
        Enables flush-to-zero for denormal float;
        Enables fused-multiply-add;
        Disables precise division;
        Disables precise square root.
    """
    # ``link`` previously defaulted to a shared mutable list; use a fresh
    # list per call instead (passing ``link=[]`` explicitly is equivalent).
    link = [] if link is None else link

    if link and config.ENABLE_CUDASIM:
        raise NotImplementedError('Cannot link PTX in the simulator')

    if argtypes is None and not sigutils.is_signature(restype):
        if restype is None:
            # Called with only keyword options: behave like autojit.
            return autojit(device=device, bind=bind, link=link, debug=debug,
                           inline=inline, **kws)
        # restype is a function
        else:
            decor = autojit(device=device, bind=bind, link=link, debug=debug,
                            inline=inline, **kws)
            return decor(restype)
    else:
        fastmath = kws.get('fastmath', False)
        if config.ENABLE_CUDASIM:
            def jitwrapper(func):
                return FakeCUDAKernel(func, device=device,
                                      fastmath=fastmath, debug=debug)
            return jitwrapper

        restype, argtypes = convert_types(restype, argtypes)

        if restype and not device and restype != types.void:
            raise TypeError("CUDA kernel must have void return type.")

        def kernel_jit(func):
            kernel = compile_kernel(func, argtypes, link=link, debug=debug,
                                    inline=inline, fastmath=fastmath)
            # Force compilation for the current context
            if bind:
                kernel.bind()
            return kernel

        def device_jit(func):
            return compile_device(func, restype, argtypes, inline=inline,
                                  debug=debug)

        if device:
            return device_jit
        else:
            return kernel_jit