def test_complies_fortran_function_with_module():
    """Compile and run a Fortran function that is defined inside a module.

    The kernel source declares ``my_test_function`` inside
    ``my_fancy_module``; the test compiles it with gfortran through
    ``CFunctions``, runs it without arguments and checks that the returned
    value is close to 42.0. The ``my_fancy_module.mod`` file is removed
    afterwards, whether or not the test passed.
    """
    # Free-form Fortran is line-oriented: every statement needs its own line,
    # otherwise the compiler rejects the source.
    kernel_string = """
    module my_fancy_module
    use iso_c_binding

    contains

    function my_test_function() result(time)
        use iso_c_binding
        real (c_float) :: time

        time = 42.0
    end function my_test_function

    end module my_fancy_module
    """
    kernel_name = "my_test_function"
    kernel_sources = KernelSource(kernel_name, kernel_string, "C")
    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])

    try:
        with CFunctions(compiler="gfortran") as cfunc:
            func = cfunc.compile(kernel_instance)
            result = cfunc.run_kernel(func, [], (), ())
            assert np.isclose(result, 42.0)
    finally:
        # presumably emitted by gfortran into the working directory -- clean it up
        util.delete_temp_file("my_fancy_module.mod")
def test_setup_device_targets_max(fake_results):
    """Check the generated device-target header when the objective is "GFLOP/s".

    A "GFLOP/s" metric is derived from every result's time, the results are
    stored and turned into a header, and the header is inspected for the
    expected ``#define`` lines. The scenario is then repeated with a second
    problem size in which the former best configuration has been slowed down.
    """
    results_filename = "temp_test_results_file.json"
    header_filename = "temp_test_header_file.h"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def read_header():
        # return the full text of the generated header file
        with open(header_filename, 'r') as fh:
            return fh.read()

    # add GFLOP/s as metric
    for entry in results:
        entry['GFLOP/s'] = 1e5 / entry['time']

    try:
        integration.store_results(results_filename, kernel_name, kernel_string, tune_params,
                                  problem_size, results, env, top=3, objective="GFLOP/s")
        integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s")

        output_str = read_header()
        for needle in ("TARGET_My_GPU", "#define a 1", "#define b 4"):
            assert needle in output_str

        # test output when more than one problem size is used, and best configuration is different
        for entry in results:
            if entry['a'] == 1 and entry['b'] == 4:
                entry['time'] += 100
                entry['GFLOP/s'] = 1e5 / entry['time']

        integration.store_results(results_filename, kernel_name, kernel_string, tune_params,
                                  1000, results, env, top=3, objective="GFLOP/s")
        integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s")

        output_str = read_header()
        expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"])
        assert expected in output_str
    finally:
        for leftover in (results_filename, header_filename):
            util.delete_temp_file(leftover)
def run_kernel(kernel_name, kernel_string, problem_size, arguments, params,
               grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None,
               device=0, platform=0, cmem_args=None, compiler_options=None,
               block_size_names=None, quiet=False):
    """Compile a single kernel instance, run it once and return its arguments.

    The call arguments are split into kernel options and device options, a
    device interface is created, the kernel is instantiated with ``params``,
    compiled and executed. Temporary files of the instance are deleted
    before the kernel is run.

    :returns: A list with one entry per element of ``arguments``: scalars are
        passed through unchanged, every other argument is replaced by a new
        array filled from the corresponding device argument.
    :rtype: list

    :raises Exception: when the kernel instance cannot be created, the kernel
        cannot be compiled, or running the kernel reports failure.
    """
    # snapshot of the call arguments; must happen before any other local is bound
    opts = locals()
    kernel_options = Options([(key, opts[key]) for key in _kernel_options.keys()])
    device_options = Options([(key, opts[key]) for key in _device_options.keys()])

    # detect language and create the right device function interface
    dev = core.DeviceInterface(kernel_string, iterations=1, **device_options)

    # validate the arguments and move the data to the device
    util.check_argument_list(arguments)
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        instance = dev.create_kernel_instance(kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # add constant memory arguments to compiled module
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
    finally:
        # temp files are removed whether or not compilation succeeded
        if instance is not None:
            for temp_path in instance.temp_files.values():
                util.delete_temp_file(temp_path)

    launched = dev.run_kernel(func, gpu_args, instance)
    if not launched:
        raise Exception("runtime error occured, too many resources requested")

    # copy data in device memory back to the host
    results = []
    for index, host_arg in enumerate(arguments):
        if numpy.isscalar(host_arg):
            results.append(host_arg)
            continue
        results.append(numpy.zeros_like(host_arg))
        dev.memcpy_dtoh(results[-1], gpu_args[index])

    return results
def compile_and_benchmark(self, gpu_args, params, kernel_options, tuning_options):
    """Compile and benchmark a kernel instance based on kernel strings and parameters.

    :param gpu_args: Arguments handed to the correctness check and the benchmark.
    :param params: Parameter values that define this kernel instance.
    :param kernel_options: Kernel options; ``cmem_args`` is read from it.
    :param tuning_options: Tuning options; ``verbose``, ``answer``, ``atol``
        and ``verify`` are read from it.

    :returns: The value returned by ``self.benchmark``, or None when the
        kernel instance could not be created or the kernel did not compile.

    :raises Exception: any exception from compiling, verifying or benchmarking
        is re-raised after the kernel source has been dumped to a temp file.
    """
    instance_string = util.get_instance_string(params)
    logging.debug('compile_and_benchmark ' + instance_string)
    mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
    logging.debug('Memory usage : %2.2f MB', mem_usage)

    verbose = tuning_options.verbose

    instance = self.create_kernel_instance(kernel_options, params, verbose)
    if instance is None:
        return None

    time = None
    try:
        # compile the kernel; a None result means this configuration is skipped,
        # but we still fall through to the clean-up below so no temp files leak
        func = self.compile_kernel(instance, verbose)
        if func is not None:
            # add constant memory arguments to compiled module
            if kernel_options.cmem_args is not None:
                self.dev.copy_constant_memory_args(kernel_options.cmem_args)

            # test kernel for correctness
            if tuning_options.answer is not None:
                self.check_kernel_correctness(func, gpu_args, instance, tuning_options.answer,
                                              tuning_options.atol, tuning_options.verify, verbose)

            # benchmark
            time = self.benchmark(func, gpu_args, instance, verbose)

    except Exception:
        # dump kernel_string to a temp file; the instance's temp files are
        # deliberately kept so the failing sources can be inspected
        temp_filename = util.get_temp_filename(suffix=".c")
        util.write_file(temp_filename, instance.kernel_string)
        source_files = [temp_filename] + list(instance.temp_files.values())
        print("Error while compiling or benchmarking, see source files: " + " ".join(source_files))
        raise

    # clean up any temporary files, if no error occurred
    for temp_path in instance.temp_files.values():
        util.delete_temp_file(temp_path)

    return time
def compile_and_benchmark(self, gpu_args, params, kernel_options, tuning_options):
    """Compile and benchmark a kernel instance based on kernel strings and parameters.

    :param gpu_args: Arguments handed to the output check and the benchmark.
    :param params: Parameter values that define this kernel instance.
    :param kernel_options: Kernel options; ``cmem_args`` and ``texmem_args``
        are read from it.
    :param tuning_options: Tuning options; ``verbose``, ``answer``, ``atol``,
        ``verify`` and ``times`` are read from it.

    :returns: The value returned by ``self.benchmark``, or None when the
        kernel instance could not be created or the kernel did not compile.

    :raises Exception: any exception from compiling, verifying or benchmarking
        is re-raised after the kernel source has been dumped to a temp file.
    """
    instance_string = util.get_instance_string(params)
    logging.debug('compile_and_benchmark ' + instance_string)
    mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
    logging.debug('Memory usage : %2.2f MB', mem_usage)

    verbose = tuning_options.verbose

    instance = self.create_kernel_instance(kernel_options, params, verbose)
    if instance is None:
        return None

    time = None
    try:
        # compile the kernel; a None result means this configuration is skipped,
        # but we still fall through to the clean-up below so no temp files leak
        func = self.compile_kernel(instance, verbose)
        if func is not None:
            # add constant memory arguments to compiled module
            if kernel_options.cmem_args is not None:
                self.dev.copy_constant_memory_args(kernel_options.cmem_args)

            # add texture memory arguments to compiled module
            if kernel_options.texmem_args is not None:
                self.dev.copy_texture_memory_args(kernel_options.texmem_args)

            # test kernel output
            if tuning_options.answer is not None:
                self.check_kernel_output(func, gpu_args, instance, tuning_options.answer,
                                         tuning_options.atol, tuning_options.verify, verbose)

            # benchmark
            time = self.benchmark(func, gpu_args, instance, tuning_options.times, verbose)

    except Exception:
        # dump kernel_string to a temp file; the instance's temp files are
        # deliberately kept so the failing sources can be inspected
        temp_filename = util.get_temp_filename(suffix=".c")
        util.write_file(temp_filename, instance.kernel_string)
        source_files = [temp_filename] + list(instance.temp_files.values())
        print("Error while compiling or benchmarking, see source files: " + " ".join(source_files))
        raise

    # clean up any temporary files, if no error occurred
    for temp_path in instance.temp_files.values():
        util.delete_temp_file(temp_path)

    return time
def test_PythonKernel_tuned(test_kernel):
    """Build a PythonKernel from a stored results file and call it.

    A fake results file with a single configuration is written, a
    ``PythonKernel`` is created from it, and the test checks both the
    retrieved ``block_size_x`` and the output of calling the kernel.
    """
    kernel_name, kernel_string, n, args, params = test_kernel
    output, left, right, n = args
    test_results_file = "test_results_file.json"

    # a single fake tuning result: the given parameters plus a time
    results = params.copy()
    results.update({'time': 1.0})
    env = {"device_name": "bogus GPU"}

    try:
        # create a fake results file
        integration.store_results(test_results_file, kernel_name, kernel_string,
                                  params, n, [results], env)

        # create a kernel using the results
        kernel_function = kernelbuilder.PythonKernel(kernel_name, kernel_string, n, args,
                                                     results_file=test_results_file)

        # test if params were retrieved correctly
        assert kernel_function.params["block_size_x"] == 384

        # see if it functions properly
        reference = kernel_function(output, left, right, n)
        assert np.allclose(reference[0], left + right)
    finally:
        util.delete_temp_file(test_results_file)
def delete_temp_files(self):
    """Remove every file listed as a value in ``self.temp_files``."""
    for temp_path in self.temp_files.values():
        util.delete_temp_file(temp_path)
def test_store_results(fake_results):
    """Exercise ``integration.store_results`` across sizes, devices and overwrites.

    Results are stored for the fixture's problem size, for a second problem
    size, and for a second device; finally faster results are stored to check
    that existing entries are overwritten.
    """
    filename = "temp_test_results_file.json"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def my_gpu_entries(entries, size):
        # entries stored for device "My_GPU" with the given problem size string
        return [d for d in entries
                if d["device_name"] == "My_GPU" and d["problem_size"] == size]

    try:
        # test basic operation
        integration.store_results(filename, kernel_name, kernel_string, tune_params,
                                  problem_size, results, env, top=3)
        _meta, stored_data = integration._read_results_file(filename)
        assert len(my_gpu_entries(stored_data, "100")) == 3

        # test if results for a different problem_size value are added
        integration.store_results(filename, kernel_name, kernel_string, tune_params,
                                  1000, results, env, top=3)
        _meta, stored_data = integration._read_results_file(filename)
        assert len(my_gpu_entries(stored_data, "100")) == 3
        assert len(my_gpu_entries(stored_data, "1000")) == 3

        # test if results for a different GPU can be added
        integration.store_results(filename, kernel_name, kernel_string, tune_params,
                                  problem_size, results, {"device_name": "Another GPU"}, top=3)
        _meta, stored_data = integration._read_results_file(filename)
        device_names = {d["device_name"] for d in stored_data}
        assert len(device_names) == 2

        # test if overwriting results works
        for offset, record in enumerate(results):
            record["time"] = 50.0 + offset
        integration.store_results(filename, kernel_name, kernel_string, tune_params,
                                  problem_size, results, env, top=0.1)
        _meta, stored_data = integration._read_results_file(filename)
        my_gpu_100_data = my_gpu_entries(stored_data, "100")
        assert len(my_gpu_100_data) == 1
        assert my_gpu_100_data[0]["time"] < 100
    finally:
        util.delete_temp_file(filename)
def test_setup_device_targets(fake_results):
    """Check the device-target header for several sizes and devices.

    Results are stored and converted into a header four times: once for the
    fixture's problem size, twice more after changing the timings for a second
    problem size, and once after switching to a second device name. After each
    step the header text is checked for the expected configuration.
    """
    results_filename = "temp_test_results_file.json"
    header_filename = "temp_test_header_file.h"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def read_header():
        # return the full text of the generated header file
        with open(header_filename, 'r') as fh:
            return fh.read()

    def shift_time(b_value, delta):
        # add delta to the time of every result with a == 1 and the given b
        for entry in results:
            if entry['a'] == 1 and entry['b'] == b_value:
                entry['time'] += delta

    def store_for_size_1000():
        integration.store_results(results_filename, kernel_name, kernel_string, tune_params,
                                  1000, results, env, top=3)
        integration.create_device_targets(header_filename, results_filename, objective="time")

    try:
        integration.store_results(results_filename, kernel_name, kernel_string, tune_params,
                                  problem_size, results, env, top=3)
        # results file
        # {'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}}

        integration.create_device_targets(header_filename, results_filename)
        output_str = read_header()
        for needle in ("#ifdef TARGET_My_GPU", "#define a 1", "#define b 4"):
            assert needle in output_str

        # test output when more than one problem size is used, and best configuration is different
        shift_time(4, 100)
        store_for_size_1000()
        output_str = read_header()
        expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"])
        assert expected in output_str

        # test output when more than one problem size is used, and best configuration depends on total time
        shift_time(6, -3)
        store_for_size_1000()
        output_str = read_header()
        expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 6"])
        assert expected in output_str

        # test output when more than one GPU is used
        shift_time(6, 3.1)
        env['device_name'] = "My_GPU2"
        store_for_size_1000()
        output_str = read_header()
        expected_blocks = (
            ["TARGET_My_GPU", "#define a 1", "#define b 6"],
            ["TARGET_My_GPU2", "#define a 1", "#define b 5"],
            ["#else /* default configuration */", "#define a 1", "#define b 5"],
        )
        for block_lines in expected_blocks:
            expected = "\n".join(block_lines)
            assert expected in output_str
    finally:
        for leftover in (results_filename, header_filename):
            util.delete_temp_file(leftover)
def compile(self, kernel_name, kernel_string):
    """call the C compiler to compile the kernel, return the function

    The source is written to a temporary file, compiled to an object file and
    linked into a shared library, which is then loaded. All temporary files
    are deleted again, also when compilation fails.

    :param kernel_name: The name of the kernel to be compiled, used to lookup
        the function after compilation.
    :type kernel_name: string

    :param kernel_string: The C code that contains the function `kernel_name`
    :type kernel_string: string

    :returns: An ctypes function that can be called directly.
    :rtype: ctypes._FuncPtr

    :raises subprocess.CalledProcessError: when the compiler exits with a
        non-zero status.
    """
    logging.debug('compiling ' + kernel_name)

    # only one library is kept loaded at a time
    if self.lib is not None:
        self.cleanup_lib()

    compiler_options = ["-fPIC"]

    # detect openmp
    if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
        logging.debug('set using_openmp to true')
        self.using_openmp = True
        if self.compiler == "pgfortran":
            compiler_options.append("-mp")
        else:
            compiler_options.append("-fopenmp")

    # detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
    if ("#include <cuda" in kernel_string) or ("__global__" in kernel_string):
        if self.compiler == "g++" and self.nvcc_available:
            self.compiler = "nvcc"

    # select right suffix based on compiler
    suffix = ".cc"
    if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
        suffix = ".F90"

    if self.compiler == "nvcc":
        # ".cc" becomes ".cu"; host-compiler flags are forwarded through nvcc
        suffix = suffix[:-1] + "u"
        compiler_options = ["-Xcompiler=" + c for c in compiler_options]

    # any non-Fortran source is wrapped in extern "C" unless it already has it
    if ".c" in suffix:
        if "extern \"C\"" not in kernel_string:
            kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

    # copy user specified compiler options to current list
    if self.compiler_options:
        compiler_options += self.compiler_options

    lib_args = []
    if "CL/cl.h" in kernel_string:
        lib_args = ["-lOpenCL"]

    logging.debug('using compiler ' + self.compiler)
    logging.debug('compiler_options ' + " ".join(compiler_options))
    logging.debug('lib_args ' + " ".join(lib_args))

    source_file = get_temp_filename(suffix=suffix)
    filename = ".".join(source_file.split(".")[:-1])

    # detect Fortran modules; identifiers may contain digits after the first
    # character, so the whole module name must be captured to build the symbol
    match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
    if match:
        module_name = match.group(1)
        if self.compiler == "gfortran":
            kernel_name = "__" + module_name + "_MOD_" + kernel_name
        elif self.compiler in ["ftn", "ifort"]:
            kernel_name = module_name + "_mp_" + kernel_name + "_"
        elif self.compiler == "pgfortran":
            kernel_name = module_name + "_" + kernel_name + "_"

    try:
        write_file(source_file, kernel_string)

        lib_extension = ".so"
        if platform.system() == "Darwin":
            lib_extension = ".dylib"

        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
        subprocess.check_call([self.compiler, filename + ".o"] + compiler_options +
                              ["-shared", "-o", filename + lib_extension] + lib_args)

        self.lib = numpy.ctypeslib.load_library(filename, '.')
        func = getattr(self.lib, kernel_name)
        # the compiled function is declared to return a C float
        func.restype = C.c_float

    finally:
        delete_temp_file(source_file)
        delete_temp_file(filename + ".o")
        delete_temp_file(filename + ".so")
        delete_temp_file(filename + ".dylib")

    return func
def compile(self, kernel_instance):
    """call the C compiler to compile the kernel, return the function

    The source is written to a temporary file, compiled to an object file and
    linked into a shared library, which is then loaded. All temporary files
    are deleted again, also when compilation fails.

    :param kernel_instance: An object representing the specific instance of the tunable kernel
        in the parameter space.
    :type kernel_instance: kernel_tuner.core.KernelInstance

    :returns: An ctypes function that can be called directly.
    :rtype: ctypes._FuncPtr

    :raises subprocess.CalledProcessError: when the compiler exits with a
        non-zero status.
    """
    logging.debug('compiling ' + kernel_instance.name)

    kernel_string = kernel_instance.kernel_string
    kernel_name = kernel_instance.name

    # only one library is kept loaded at a time
    if self.lib is not None:
        self.cleanup_lib()

    compiler_options = ["-fPIC"]

    # detect openmp
    if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
        logging.debug('set using_openmp to true')
        self.using_openmp = True
        if self.compiler == "pgfortran":
            compiler_options.append("-mp")
        else:
            compiler_options.append("-fopenmp")

    # if filename is known, use that one
    suffix = kernel_instance.kernel_source.get_user_suffix()

    # if code contains device code, suffix .cu is required
    device_code_signals = ["__global", "__syncthreads()", "threadIdx"]
    if any(snippet in kernel_string for snippet in device_code_signals):
        suffix = ".cu"

    # detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
    if (suffix == ".cu") or ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string):
        if self.compiler == "g++" and self.nvcc_available:
            self.compiler = "nvcc"

    if suffix is None:
        # select right suffix based on compiler
        suffix = ".cc"
        if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
            suffix = ".F90"

    if self.compiler == "nvcc":
        # host-compiler flags are forwarded through nvcc
        compiler_options = ["-Xcompiler=" + c for c in compiler_options]

    # this basically checks if we aren't compiling Fortran
    # at the moment any C, C++, or CUDA code is assumed to use extern "C" linkage
    if ".c" in suffix:
        if "extern \"C\"" not in kernel_string:
            kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

    # copy user specified compiler options to current list
    if self.compiler_options:
        compiler_options += self.compiler_options

    lib_args = []
    if "CL/cl.h" in kernel_string:
        lib_args = ["-lOpenCL"]

    logging.debug('using compiler ' + self.compiler)
    logging.debug('compiler_options ' + " ".join(compiler_options))
    logging.debug('lib_args ' + " ".join(lib_args))

    source_file = get_temp_filename(suffix=suffix)
    filename = ".".join(source_file.split(".")[:-1])

    # detect Fortran modules; identifiers may contain digits after the first
    # character, so the whole module name must be captured to build the symbol
    match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
    if match:
        module_name = match.group(1)
        if self.compiler == "gfortran":
            kernel_name = "__" + module_name + "_MOD_" + kernel_name
        elif self.compiler in ["ftn", "ifort"]:
            kernel_name = module_name + "_mp_" + kernel_name + "_"
        elif self.compiler == "pgfortran":
            kernel_name = module_name + "_" + kernel_name + "_"
    else:
        # for functions outside of modules
        if self.compiler in ["gfortran", "ftn", "ifort", "pgfortran"]:
            kernel_name = kernel_name + "_"

    try:
        write_file(source_file, kernel_string)

        lib_extension = ".so"
        if platform.system() == "Darwin":
            lib_extension = ".dylib"

        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
        subprocess.check_call([self.compiler, filename + ".o"] + compiler_options +
                              ["-shared", "-o", filename + lib_extension] + lib_args)

        self.lib = numpy.ctypeslib.load_library(filename, '.')
        func = getattr(self.lib, kernel_name)
        # the compiled function is declared to return a C float
        func.restype = C.c_float

    finally:
        delete_temp_file(source_file)
        delete_temp_file(filename + ".o")
        delete_temp_file(filename + ".so")
        delete_temp_file(filename + ".dylib")

    return func
def compile(self, kernel_name, kernel_string):
    """call the C compiler to compile the kernel, return the function

    The source is written to a temporary file, compiled to an object file and
    linked into a shared library, which is then loaded. All temporary files
    are deleted again, also when compilation fails.

    :param kernel_name: The name of the kernel to be compiled, used to lookup
        the function after compilation.
    :type kernel_name: string

    :param kernel_string: The C code that contains the function `kernel_name`
    :type kernel_string: string

    :returns: An ctypes function that can be called directly.
    :rtype: ctypes._FuncPtr

    :raises subprocess.CalledProcessError: when the compiler exits with a
        non-zero status.
    """
    logging.debug('compiling ' + kernel_name)

    # only one library is kept loaded at a time
    if self.lib is not None:
        self.cleanup_lib()

    suffix = ".cc"

    # the source is wrapped in extern "C" unless it already contains it
    if "extern \"C\"" not in kernel_string:
        kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

    compiler_options = ["-fPIC"]

    # detect openmp
    if "#include <omp.h>" in kernel_string:
        logging.debug('set using_openmp to true')
        self.using_openmp = True
        compiler_options.append("-fopenmp")

    # switch to nvcc when the source looks like CUDA and nvcc is available
    if ("#include <cuda" in kernel_string) or ("__global__" in kernel_string):
        if self.nvcc_available:
            self.compiler = "nvcc"

    if self.compiler == "nvcc":
        # ".cc" becomes ".cu"; host-compiler flags are forwarded through nvcc
        suffix = suffix[:-1] + "u"
        compiler_options = ["-Xcompiler=" + c for c in compiler_options]

    # copy user specified compiler options to current list
    if self.compiler_options:
        compiler_options += self.compiler_options

    lib_args = []
    if "CL/cl.h" in kernel_string:
        lib_args = ["-lOpenCL"]

    logging.debug('using compiler ' + self.compiler)
    logging.debug('compiler_options ' + " ".join(compiler_options))
    logging.debug('lib_args ' + " ".join(lib_args))

    source_file = get_temp_filename(suffix=suffix)
    filename = ".".join(source_file.split(".")[:-1])

    try:
        write_file(source_file, kernel_string)

        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
        subprocess.check_call([self.compiler, filename + ".o"] + compiler_options +
                              ["-shared", "-o", filename + ".so"] + lib_args)

        self.lib = numpy.ctypeslib.load_library(filename, '.')
        func = getattr(self.lib, kernel_name)
        # the compiled function is declared to return a C float
        func.restype = C.c_float

    finally:
        delete_temp_file(source_file)
        delete_temp_file(filename + ".o")
        delete_temp_file(filename + ".so")

    return func
def compile(self, kernel_name, kernel_string):
    """call the C compiler to compile the kernel, return the function

    The source is written to a temporary file, compiled to an object file and
    linked into a shared library, which is then loaded. All temporary files
    are deleted again, also when compilation fails.

    :param kernel_name: The name of the kernel to be compiled, used to lookup
        the function after compilation.
    :type kernel_name: string

    :param kernel_string: The C code that contains the function `kernel_name`
    :type kernel_string: string

    :returns: An ctypes function that can be called directly.
    :rtype: ctypes._FuncPtr

    :raises subprocess.CalledProcessError: when the compiler exits with a
        non-zero status.
    """
    logging.debug('compiling ' + kernel_name)

    # only one library is kept loaded at a time
    if self.lib is not None:
        self.cleanup_lib()

    compiler_options = ["-fPIC"]

    # detect openmp
    if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
        logging.debug('set using_openmp to true')
        self.using_openmp = True
        if self.compiler == "pgfortran":
            compiler_options.append("-mp")
        else:
            compiler_options.append("-fopenmp")

    # select right suffix based on compiler
    suffix = ".cc"

    # detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
    if ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string):
        if self.compiler == "g++" and self.nvcc_available:
            self.compiler = "nvcc"

    # if contains device code suffix .cu is required by nvcc
    if self.compiler == "nvcc" and "__global__" in kernel_string:
        suffix = ".cu"

    if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
        suffix = ".F90"

    if self.compiler == "nvcc":
        # host-compiler flags are forwarded through nvcc
        compiler_options = ["-Xcompiler=" + c for c in compiler_options]

    # any non-Fortran source is wrapped in extern "C" unless it already has it
    if ".c" in suffix:
        if "extern \"C\"" not in kernel_string:
            kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

    # copy user specified compiler options to current list
    if self.compiler_options:
        compiler_options += self.compiler_options

    lib_args = []
    if "CL/cl.h" in kernel_string:
        lib_args = ["-lOpenCL"]

    logging.debug('using compiler ' + self.compiler)
    logging.debug('compiler_options ' + " ".join(compiler_options))
    logging.debug('lib_args ' + " ".join(lib_args))

    source_file = get_temp_filename(suffix=suffix)
    filename = ".".join(source_file.split(".")[:-1])

    # detect Fortran modules; identifiers may contain digits after the first
    # character, so the whole module name must be captured to build the symbol
    match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
    if match:
        module_name = match.group(1)
        if self.compiler == "gfortran":
            kernel_name = "__" + module_name + "_MOD_" + kernel_name
        elif self.compiler in ["ftn", "ifort"]:
            kernel_name = module_name + "_mp_" + kernel_name + "_"
        elif self.compiler == "pgfortran":
            kernel_name = module_name + "_" + kernel_name + "_"

    try:
        write_file(source_file, kernel_string)

        lib_extension = ".so"
        if platform.system() == "Darwin":
            lib_extension = ".dylib"

        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
        subprocess.check_call([self.compiler, filename + ".o"] + compiler_options +
                              ["-shared", "-o", filename + lib_extension] + lib_args)

        self.lib = numpy.ctypeslib.load_library(filename, '.')
        func = getattr(self.lib, kernel_name)
        # the compiled function is declared to return a C float
        func.restype = C.c_float

    finally:
        delete_temp_file(source_file)
        delete_temp_file(filename + ".o")
        delete_temp_file(filename + ".so")
        delete_temp_file(filename + ".dylib")

    return func