def compile(self, extra_source_files=None, extra_cuda_flags=None, with_cuda=None, build_dir=None,
            compile_module_name=None):
    """Write the generated source to disk and JIT-compile it as a torch extension.

    The source text is ``str(self)``; its hash names the source file and, unless
    ``compile_module_name`` is given, the extension module as well.

    Args:
        extra_source_files: additional source files handed to
            ``torch.utils.cpp_extension.load`` together with the generated file.
        extra_cuda_flags: flags appended to the CUDA compiler flags.
        with_cuda: request a CUDA build even when ``self.is_cuda`` is false.
        build_dir: directory for sources and build products. Defaults to
            ``<object_cache>/<self.module_name>``.
        compile_module_name: name of the extension module. Defaults to the
            hash of the source code.

    Returns:
        Whatever ``torch.utils.cpp_extension.load`` returns for the built module.

    Side effects:
        Sets ``self.compiled_file`` and overwrites ``os.environ['CXX']``.
    """
    from torch.utils.cpp_extension import load

    # ``None`` defaults instead of shared mutable lists; copy so callers' lists
    # are never aliased.
    extra_source_files = list(extra_source_files or [])
    extra_cuda_flags = list(extra_cuda_flags or [])

    file_extension = '.cu' if self.is_cuda else '.cpp'
    source_code = str(self)
    code_hash = _hash(source_code.encode()).hexdigest()
    extension_name = compile_module_name or code_hash

    if not build_dir:
        build_dir = join(get_cache_config()['object_cache'], self.module_name)
    os.makedirs(build_dir, exist_ok=True)

    file_name = join(build_dir, f'{code_hash}{file_extension}')
    # Derive the library path from the extension name rather than from the
    # source path: the latter already contains build_dir (doubling a relative
    # build_dir) and only had '.cpp' stripped, yielding '<hash>.cu.so' for CUDA.
    self.compiled_file = join(build_dir, extension_name + '.so')

    if not exists(file_name):
        write_file(file_name, source_code)

    compiler_config = get_compiler_config()
    # The torch extension build picks up the compiler from the CXX variable.
    os.environ['CXX'] = compiler_config['command']

    torch_extension = load(
        extension_name,
        [file_name] + extra_source_files,
        with_cuda=self.is_cuda or with_cuda,
        # Remove a configured c++11 flag so it does not compete with c++14.
        extra_cflags=['--std=c++14', compiler_config['flags'].replace('--std=c++11', '')],
        extra_cuda_cflags=['-std=c++14', '-ccbin', compiler_config['command']] + extra_cuda_flags,
        build_directory=build_dir,
        extra_include_paths=[get_pycuda_include_path(),
                             get_pystencils_include_path(),
                             *get_cubic_interpolation_include_paths()])
    return torch_extension
def compile_module(code, code_hash, base_dir):
    """Compile and link ``code`` into a shared library inside ``base_dir``.

    All files are named after ``code_hash``. Nothing is done when the object
    file already exists; the library path is returned either way.

    Args:
        code: object providing ``write_to_file(restrict_qualifier, function_prefix, file)``.
        code_hash: base name (without suffix) for source, object and library files.
        base_dir: directory that receives all three files.

    Returns:
        Path of the shared library (``.pyd`` for a Windows configuration, ``.so`` otherwise).
    """
    config = get_compiler_config()
    include_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()]

    windows = config['os'].lower() == 'windows'
    if windows:
        function_prefix, lib_suffix, object_suffix = '__declspec(dllexport)', '.pyd', '.obj'
    else:
        function_prefix, lib_suffix, object_suffix = '', '.so', '.o'

    src_file = os.path.join(base_dir, code_hash + ".cpp")
    lib_file = os.path.join(base_dir, code_hash + lib_suffix)
    object_file = os.path.join(base_dir, code_hash + object_suffix)

    # An existing object file means this code was already built.
    if os.path.exists(object_file):
        return lib_file

    with file_handle_for_atomic_write(src_file) as source_handle:
        code.write_to_file(config['restrict_qualifier'], function_prefix, source_handle)

    configured_flags = config['flags'].split()

    if windows:
        run_compile_step(['cl.exe', '/c', '/EHsc', *configured_flags,
                          *include_flags, src_file, '/Fo' + object_file])

        # Link against the import library of the running Python installation.
        import sysconfig
        config_vars = sysconfig.get_config_vars()
        py_lib = os.path.join(config_vars["installed_base"], "libs",
                              "python{}.lib".format(config_vars["py_version_nodot"]))
        run_compile_step(['link.exe', py_lib, '/DLL', '/out:' + lib_file, object_file])
        return lib_file

    compiler = config['command']
    with atomic_file_write(object_file) as tmp_object:
        run_compile_step([compiler, '-c', *configured_flags,
                          *include_flags, '-o', tmp_object, src_file])

    # macOS needs unresolved Python symbols to be looked up at load time.
    if platform.system().lower() == 'darwin':
        platform_link_flags = ['-undefined', 'dynamic_lookup']
    else:
        platform_link_flags = []

    with atomic_file_write(lib_file) as tmp_lib:
        run_compile_step([compiler, '-shared', object_file, '-o', tmp_lib,
                          *platform_link_flags, *configured_flags])
    return lib_file
def compile(self):
    """Build this pybind11 module and return the imported extension.

    ``cppimport`` is tried first; if it is unavailable or its build fails,
    ``torch.utils.cpp_extension.load`` is used instead. The generated source
    is written to the object cache as ``cppimport_<hash>.cpp`` and the
    ``PYBIND11_MODULE`` name is rewritten to ``cppimport_<hash>`` to match.

    Returns:
        The imported extension module.

    Raises:
        ImportError: if neither ``cppimport`` nor torch can be imported.
        AssertionError: if ``self.is_cuda`` is set.

    Side effects:
        May append the cache directory to ``sys.path``; overwrites
        ``os.environ['CXX']``.
    """
    try:
        import cppimport
    except ImportError:
        cppimport = None
        try:
            from torch.utils.cpp_extension import load
        except Exception as err:
            # A real exception instead of ``assert False`` so the check also
            # fires when Python runs with -O.
            raise ImportError(
                'cppimport or torch ist required for compiling pybind11 modules') from err

    assert not self.is_cuda

    source_code = self.CPP_IMPORT_PREFIX + str(self)
    hash_str = _hash(source_code.encode()).hexdigest()
    extension_name = f'cppimport_{hash_str}'
    source_code_with_hash = source_code.replace(
        f'PYBIND11_MODULE({self.module_name}',
        f'PYBIND11_MODULE({extension_name}')

    cache_dir = join(get_cache_config()['object_cache'])
    file_name = join(cache_dir, f'{extension_name}.cpp')

    os.makedirs(cache_dir, exist_ok=True)
    if not exists(file_name):
        write_file(file_name, source_code_with_hash)
    # TODO: propagate extra headers

    # cppimport locates the source file through sys.path.
    if cache_dir not in sys.path:
        sys.path.append(cache_dir)

    compiler_config = get_compiler_config()
    # The torch extension build picks up the compiler from the CXX variable.
    os.environ['CXX'] = compiler_config['command']

    if cppimport is not None:
        try:
            return cppimport.imp(extension_name)
        except Exception as e:
            print(e)
            # cppimport was importable but failed to build: torch has not
            # been imported yet on this path, so do it now.
            from torch.utils.cpp_extension import load

    # The extension name must equal the PYBIND11_MODULE name written above;
    # previously the builtin ``hash`` function was passed here by mistake.
    torch_extension = load(
        extension_name,
        [file_name],
        with_cuda=self.is_cuda,
        # Remove a configured c++11 flag so it does not compete with c++14.
        extra_cflags=['--std=c++14', compiler_config['flags'].replace('--std=c++11', '')],
        extra_cuda_cflags=['-std=c++14', '-ccbin', compiler_config['command']],
        build_directory=cache_dir,
        extra_include_paths=[get_pycuda_include_path(), get_pystencils_include_path()])
    return torch_extension
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
    """Compile the kernel into a stand-alone C benchmark and time it.

    The benchmark source is written to ``bench.c`` and built into ``bench``,
    both in the current working directory.

    Args:
        ast: kernel AST passed to ``generate_benchmark``.
        inner_iterations: passed to the executable as its only argument;
            timings are recorded around this many iterations.
        outer_iterations: number of times the executable is run, i.e. number
            of timings recorded.

    Returns:
        list with one float per outer iteration, parsed from the executable's output.
    """
    import kerncraft

    source = generate_benchmark(ast, timing=True)
    with open('bench.c', 'w') as source_file:
        source_file.write(source)

    kerncraft_path = os.path.dirname(kerncraft.__file__)
    include_flags = ['-I' + get_pystencils_include_path(),
                     '-I' + os.path.join(kerncraft_path, 'headers')]

    compiler_config = get_compiler_config()
    # kerncraft ships the timing and dummy helpers that are compiled in
    # alongside the generated benchmark.
    helper_sources = [os.path.join(kerncraft_path, 'headers', 'timing.c'),
                      os.path.join(kerncraft_path, 'headers', 'dummy.c')]
    run_compile_step([compiler_config['command'],
                      *compiler_config['flags'].split(),
                      *include_flags,
                      *helper_sources,
                      'bench.c',
                      '-o', 'bench'])

    return [float(subprocess.check_output(['./bench', str(inner_iterations)]))
            for _ in range(outer_iterations)]
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
    """
    Creates a kernel function from an abstract syntax tree which
    was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`

    Args:
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
        custom_backend: forwarded unchanged to ``generate_c`` as ``custom_backend``

    Returns:
        compiled kernel as Python function; it accepts keyword arguments only and
        carries the attributes ``ast``, ``parameters`` and ``num_regs``
    """
    import pycuda.autoinit  # NOQA
    from pycuda.compiler import SourceModule

    if argument_dict is None:
        argument_dict = {}

    header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node))
    includes = "\n".join(f"#include {include_file}" for include_file in header_list)

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
    code += str(generate_c(kernel_function_node, dialect='cuda', custom_backend=custom_backend))

    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
    if USE_FAST_MATH:
        options.append("-use_fast_math")

    mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
    func = mod.get_function(kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    cache = {}
    cache_values = []

    def wrapper(**kwargs):
        # Key on the tuple itself rather than on its hash: two different
        # argument sets whose hashes collide must not share a cache entry.
        key = tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                    for k, v in kwargs.items())
        # Only the lookup is guarded, so a KeyError raised while launching
        # the kernel is not mistaken for a cache miss.
        try:
            args, block_and_thread_numbers = cache[key]
        except KeyError:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

            args = _build_numpy_argument_list(parameters, full_arguments)
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique

        func(*args, **block_and_thread_numbers)
        # NOTE: for debugging, calling pycuda.driver.Context.synchronize() here
        # makes errors show up right after the kernel was called.

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    wrapper.num_regs = func.num_regs
    return wrapper
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
    """
    Creates a **OpenCL** kernel function from an abstract syntax tree which
    was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`

    Args:
        opencl_queue: a valid :class:`pyopencl.CommandQueue`
        opencl_ctx: a valid :class:`pyopencl.Context`
        kernel_function_node: the abstract syntax tree; its ``function_name`` is
                              prefixed with ``opencl_`` in place
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
        custom_backend: forwarded unchanged to ``generate_c`` as ``custom_backend``

    Returns:
        compiled kernel as Python function; it accepts keyword arguments only and
        carries the attributes ``ast`` and ``parameters``
    """
    import pyopencl as cl
    assert opencl_ctx, "No valid OpenCL context"
    assert opencl_queue, "No valid OpenCL queue"

    if argument_dict is None:
        argument_dict = {}

    # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
    # The node is shared with the caller, so add the prefix only once; otherwise
    # every further call for the same node would stack another "opencl_".
    if not kernel_function_node.function_name.startswith("opencl_"):
        kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name

    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
    includes = "\n".join(f"#include {include_file}" for include_file in header_list)

    code = includes + "\n"
    code += "#define FUNC_PREFIX __kernel\n"
    code += "#define RESTRICT restrict\n\n"
    code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))

    options = []
    if USE_FAST_MATH:
        options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
    options.append("-I \"" + get_pystencils_include_path() + "\"")

    mod = cl.Program(opencl_ctx, code).build(options=options)
    func = getattr(mod, kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    cache = {}
    cache_values = []

    def wrapper(**kwargs):
        # Key on the tuple itself rather than on its hash: two different
        # argument sets whose hashes collide must not share a cache entry.
        key = tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                    for k, v in kwargs.items())
        # Only the lookup is guarded, so a KeyError raised while enqueueing
        # the kernel is not mistaken for a cache miss.
        try:
            args, block_and_thread_numbers = cache[key]
        except KeyError:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            # The value stored under 'grid' is block size times block count per dimension.
            block_and_thread_numbers['grid'] = tuple(
                int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
                                             block_and_thread_numbers['grid']))

            args = _build_numpy_argument_list(parameters, full_arguments)
            # Pass the ``data`` attribute of arguments that have one, everything else as is.
            args = [a.data if hasattr(a, 'data') else a for a in args]
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique

        func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    return wrapper
def compile_module(code, code_hash, base_dir, compile_flags=None):
    """Compile and link ``code`` into a shared library inside ``base_dir``.

    All files are named after ``code_hash``. Nothing is done when the object
    file already exists; the library path is returned either way.

    Args:
        code: object providing ``write_to_file(file)``.
        code_hash: base name (without suffix) for source, object and library files.
        base_dir: directory that receives all three files.
        compile_flags: optional list of extra flags appended after the include
            flags of the compile step.

    Returns:
        Path of the shared library (``.pyd`` for a Windows configuration, ``.so`` otherwise).
    """
    if compile_flags is None:
        compile_flags = []

    config = get_compiler_config()
    compile_extras = ['-I' + get_paths()['include'],
                      '-I' + get_pystencils_include_path()] + compile_flags

    windows = config['os'].lower() == 'windows'
    lib_suffix, object_suffix = ('.pyd', '.obj') if windows else ('.so', '.o')

    src_file = os.path.join(base_dir, code_hash + ".cpp")
    lib_file = os.path.join(base_dir, code_hash + lib_suffix)
    object_file = os.path.join(base_dir, code_hash + object_suffix)

    # An existing object file means this code was already built.
    if os.path.exists(object_file):
        return lib_file

    # Exclusive creation: a source file that is already there is kept as is.
    try:
        with open(src_file, 'x') as source_handle:
            code.write_to_file(source_handle)
    except FileExistsError:
        pass

    configured_flags = config['flags'].split()

    if windows:
        run_compile_step(['cl.exe', '/c', '/EHsc', *configured_flags,
                          *compile_extras, src_file, '/Fo' + object_file])

        # Link against the import library of the running Python installation.
        import sysconfig
        config_vars = sysconfig.get_config_vars()
        py_lib = os.path.join(config_vars["installed_base"], "libs",
                              f"python{config_vars['py_version_nodot']}.lib")
        run_compile_step(['link.exe', py_lib, '/DLL', '/out:' + lib_file, object_file])
        return lib_file

    compiler = config['command']
    with atomic_file_write(object_file) as tmp_object:
        run_compile_step([compiler, '-c', *configured_flags,
                          *compile_extras, '-o', tmp_object, src_file])

    # macOS needs unresolved Python symbols to be looked up at load time.
    if platform.system().lower() == 'darwin':
        platform_link_flags = ['-undefined', 'dynamic_lookup']
    else:
        platform_link_flags = []

    with atomic_file_write(lib_file) as tmp_lib:
        run_compile_step([compiler, '-shared', object_file, '-o', tmp_lib,
                          *platform_link_flags, *configured_flags])
    return lib_file