def env():
    """Build a DeviceInterface plus a vector_add kernel instance for tests.

    Returns a (dev, instance) pair: the device interface (7 timing
    iterations, device 0, platform 0) and the kernel instance created
    from the CUDA vector_add source with block_size_x fixed at 128.
    """
    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_size_x + threadIdx.x; if (i<n) { c[i] = a[i] + b[i]; } } """
    args = get_vector_add_args()
    params = {"block_size_x": 128}
    lang = "CUDA"
    kernel_source = core.KernelSource(kernel_string, lang)
    verbose = True

    # the problem size is the last element of the argument list
    kernel_options = Options(
        kernel_name="vector_add",
        kernel_string=kernel_string,
        problem_size=args[-1],
        arguments=args,
        lang=lang,
        grid_div_x=None,
        grid_div_y=None,
        grid_div_z=None,
        cmem_args=None,
        texmem_args=None,
        block_size_names=None,
    )
    device_options = Options(
        device=0,
        platform=0,
        lang=lang,
        quiet=False,
        compiler=None,
        compiler_options=None,
    )

    dev = core.DeviceInterface(kernel_source, iterations=7, **device_options)
    instance = dev.create_kernel_instance(kernel_source, kernel_options, params, verbose)
    return dev, instance
def run_kernel(kernel_name, kernel_string, problem_size, arguments, params, grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None, device=0, platform=0, cmem_args=None, compiler_options=None, block_size_names=None, quiet=False):
    """Compile and run a single kernel configuration, returning its outputs.

    The kernel is instantiated with the given params, compiled, executed
    once on the selected device, and every non-scalar argument is copied
    back from the GPU. The result is a list with one entry per argument:
    scalars are passed through unchanged, arrays hold the GPU contents.
    """
    # split the caller-supplied options into kernel and device dicts
    all_opts = locals()
    kernel_options = Options([(name, all_opts[name]) for name in _kernel_options.keys()])
    device_options = Options([(name, all_opts[name]) for name in _device_options.keys()])

    # detect language and create the right device function interface
    dev = core.DeviceInterface(kernel_string, iterations=1, **device_options)

    # validate the arguments and move the data to the GPU
    util.check_argument_list(arguments)
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        # instantiate the kernel for this parameter configuration
        instance = dev.create_kernel_instance(kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # compile the instantiated kernel
        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # attach constant memory arguments to the compiled module, if any
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
    finally:
        # always remove the temporary files the instance produced
        if instance is not None:
            for temp_file in instance.temp_files.values():
                util.delete_temp_file(temp_file)

    # launch the kernel once
    if not dev.run_kernel(func, gpu_args, instance):
        raise Exception("runtime error occured, too many resources requested")

    # copy data in GPU memory back to the host
    results = []
    for index, argument in enumerate(arguments):
        if numpy.isscalar(argument):
            results.append(argument)
        else:
            results.append(numpy.zeros_like(argument))
            dev.memcpy_dtoh(results[-1], gpu_args[index])

    return results
def __init__(self, kernel_name, kernel_string, problem_size, arguments, params, inputs=None, outputs=None, device=0, platform=0, block_size_names=None, grid_div_x=None, grid_div_y=None, grid_div_z=None, verbose=True, lang=None):
    """ Construct Python helper object to compile and call the kernel from Python

        This object compiles a GPU kernel parameterized using the parameters in params.
        GPU memory is allocated for each argument using its size and type as listed in arguments.
        The object can be called directly as a function with the kernel arguments as function arguments.
        Kernel arguments marked as inputs will be copied to the GPU on every kernel launch.
        Only the kernel arguments marked as outputs will be returned, note that the result is always
        returned in a list, even when there is only one output.

        Most of the arguments to this function are the same as with tune_kernel or run_kernel
        in Kernel Tuner, and are therefore not duplicated here. The two new arguments are:

        :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel
        :type inputs: list(bool)

        :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel
        :type outputs: list(bool)

    """
    # construct the device interface for the requested device
    kernel_source = core.KernelSource(kernel_string, lang)
    self.dev = core.DeviceInterface(kernel_source, device=device)

    # snapshot the local namespace to collect the kernel options;
    # taken here so kernel_source is included among the candidates
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys() if k in opts.keys()])

    # instantiate the kernel given the parameters in params
    self.kernel_instance = self.dev.create_kernel_instance(kernel_source, kernel_options, params, verbose)

    # compile the instantiated kernel
    self.func = self.dev.compile_kernel(self.kernel_instance, verbose)

    # allocate GPU memory for the kernel arguments
    self.gpu_args = self.dev.ready_argument_list(arguments)

    # when no input/output masks are given, treat every argument as both
    self.inputs = inputs if inputs else [True for _ in arguments]
    self.outputs = outputs if outputs else [True for _ in arguments]
def test_check_kernel_output(dev_func_interface):
    """Verify check_kernel_output passes on matching output and raises otherwise."""
    dev_func_interface.configure_mock(**mock_config)

    dev = core.DeviceInterface(core.KernelSource("", lang="CUDA"))
    dfi = dev.dev

    answer = [np.zeros(4).astype(np.float32)]
    instance = core.KernelInstance("name", None, "kernel_string", "temp_files", (256, 1, 1), (1, 1, 1), {}, answer)
    wrong = [np.array([1, 2, 3, 4]).astype(np.float32)]
    atol = 1e-6

    # an all-zero answer matches the (mocked) all-zero device result
    result = dev.check_kernel_output('func', answer, instance, answer, atol, None, True)

    dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0])
    dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1))

    print(dfi.mock_calls)

    assert dfi.memcpy_dtoh.called == 1
    for call_name, call_args, _ in dfi.mock_calls:
        if call_name == 'memcpy_dtoh':
            assert all(call_args[0] == answer[0])
            assert all(call_args[1] == answer[0])
    assert result

    #the following call to check_kernel_output is expected to fail because
    #the answer is non-zero, while the memcpy_dtoh function on the Mocked object
    #obviously does not result in the result_host array containing anything
    #non-zero
    try:
        dev.check_kernel_output('func', wrong, instance, wrong, atol, None, True)
        print("check_kernel_output failed to throw an exception")
        assert False
    except Exception:
        assert True
def run_kernel(kernel_name, kernel_string, problem_size, arguments, params, grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None, device=0, platform=0, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None, block_size_names=None, quiet=False, log=None):
    """Compile and run one kernel configuration, returning its output arrays.

    The kernel source is instantiated with params, compiled, and executed
    once. Each non-scalar argument is copied back from the GPU; scalars
    are passed through unchanged. Optional constant and texture memory
    arguments are attached to the compiled module before launch. When log
    is set, Python logging is configured at that level with a timestamped
    file named after the kernel.
    """
    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # split the caller-supplied options into kernel and device dicts
    all_opts = locals()
    kernel_options = Options([(name, all_opts[name]) for name in _kernel_options.keys()])
    device_options = Options([(name, all_opts[name]) for name in _device_options.keys()])

    # detect language and create the right device function interface
    dev = core.DeviceInterface(kernel_source, iterations=1, **device_options)

    # move data to the GPU
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        # instantiate the kernel for this parameter configuration
        instance = dev.create_kernel_instance(kernel_source, kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # see if the kernel arguments have correct type
        util.check_argument_list(instance.name, instance.kernel_string, arguments)

        # compile the instantiated kernel
        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # attach constant memory arguments to the compiled module, if any
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)

        # attach texture memory arguments to the compiled module, if any
        if texmem_args is not None:
            dev.copy_texture_memory_args(texmem_args)
    finally:
        # always remove the temporary files the instance produced
        if instance is not None:
            instance.delete_temp_files()

    # launch the kernel once
    if not dev.run_kernel(func, gpu_args, instance):
        raise Exception("runtime error occured, too many resources requested")

    # copy data in GPU memory back to the host
    results = []
    for index, argument in enumerate(arguments):
        if numpy.isscalar(argument):
            results.append(argument)
        else:
            results.append(numpy.zeros_like(argument))
            dev.memcpy_dtoh(results[-1], gpu_args[index])

    #trying to make run_kernel work nicely with the Nvidia Visual Profiler
    del dev

    return results
def test_get_device_interface3():
    # an unsupported language string must raise during construction
    with raises(Exception):
        core.DeviceInterface("", 0, 0, lang="blabla")
def test_get_device_interface2():
    # requesting OpenCL should back the interface with OpenCLFunctions
    dev = core.DeviceInterface("", 0, 0, lang="OpenCL")
    assert isinstance(dev, core.DeviceInterface)
    assert isinstance(dev.dev, opencl.OpenCLFunctions)
def test_get_device_interface1():
    # requesting CUDA should back the interface with CudaFunctions
    dev = core.DeviceInterface("", 0, 0, lang="CUDA")
    assert isinstance(dev, core.DeviceInterface)
    assert isinstance(dev.dev, cuda.CudaFunctions)
def test_get_device_interface3():
    # an unsupported argument must raise while entering the context manager
    with raises(Exception):
        with core.DeviceInterface("blabla") as dev:
            pass
def test_get_device_interface2():
    # the interface is a context manager; OpenCL selects OpenCLFunctions
    source = core.KernelSource("", lang="OpenCL")
    with core.DeviceInterface(source) as dev:
        assert isinstance(dev, core.DeviceInterface)
        assert isinstance(dev.dev, opencl.OpenCLFunctions)
def test_get_device_interface1():
    # the interface is a context manager; CUDA selects CudaFunctions
    source = core.KernelSource("", lang="CUDA")
    with core.DeviceInterface(source) as dev:
        assert isinstance(dev, core.DeviceInterface)
        assert isinstance(dev.dev, cuda.CudaFunctions)
def test_get_device_interface3():
    # NOTE(review): a sibling test with the same name wraps this exact call in
    # `with raises(Exception):`, so constructing with an unsupported lang
    # presumably raises — as written here, that exception would fail the test.
    # An exception-expectation decorator (e.g. nose's @raises) may exist
    # outside this view; confirm before relying on this test as-is.
    lang = "blabla"
    core.DeviceInterface("", 0, 0, lang=lang)