def get_function_definition(self, codegen_state, codegen_result,
        schedule_index, function_decl, function_body):
    """Wrap *function_body* in the generated host-side Python function.

    The generated function takes ``_lpy_cl_kernels`` and ``queue``, then the
    name of every entry of ``codegen_state.implemented_data_info`` whose
    ``arg_class`` is not a :class:`TemporaryVariable` subclass, and finally
    ``wait_for=None`` and ``allocator=None``. It returns ``_lpy_evt``.

    :arg schedule_index: not used by this implementation.
    :arg function_decl: not used by this implementation.
    :returns: a :class:`genpy.Function`.
    """
    from loopy.kernel.data import TemporaryVariable

    # Temporaries are not passed in by the caller, so they are left out
    # of the signature.
    passed_names = []
    for info in codegen_state.implemented_data_info:
        if issubclass(info.arg_class, TemporaryVariable):
            continue
        passed_names.append(info.name)

    signature = (
        ["_lpy_cl_kernels", "queue"]
        + passed_names
        + ["wait_for=None", "allocator=None"])

    from genpy import (For, Function, Suite, Return, Line, Statement as S)

    program_name = codegen_result.current_program(codegen_state).name

    statements = [Line(), Line(), function_body, Line()]

    if self._get_global_temporaries(codegen_state):
        # free global temporaries
        # NOTE(review): `_global_temporaries` is a name in the generated
        # code; presumably bound by `function_body` -- confirm.
        statements.append(
            For("_tv", "_global_temporaries", S("_tv.release()")))

    statements.append(Line())
    statements.append(Return("_lpy_evt"))

    return Function(program_name, signature, Suite(statements))
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
    """Generate the host-side Python code that enqueues one device kernel.

    The emitted suite fetches kernel *name* from ``_lpy_cl_kernels``,
    asserts its argument count, sets value and array arguments and calls
    ``enqueue_nd_range_kernel`` with positional arguments. The event is
    stored in ``_lpy_evt`` and ``wait_for`` is rebound to ``[_lpy_evt]``,
    so a later enqueue emitted by this method waits on this one.

    :arg codegen_state: supplies ``kernel`` and ``implemented_data_info``.
    :arg name: attribute name of the kernel on ``_lpy_cl_kernels``.
    :arg gsize: global size; replaced by ``(1,)`` when empty.
    :arg lsize: local size; replaced by ``(1,)`` when empty.
    :arg extra_args: appended to ``codegen_state.implemented_data_info``.
    :returns: a :class:`genpy.Suite`.
    """
    ecm = self.get_expression_to_code_mapper(codegen_state)

    # An empty size is replaced by a single-entry one.
    gsize = gsize or (1, )
    lsize = lsize or (1, )

    kernel = codegen_state.kernel
    all_args = codegen_state.implemented_data_info + extra_args

    value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
        generate_value_arg_setup(kernel, [self.target.device], all_args)
    arry_arg_code = generate_array_arg_setup(
        kernel, all_args, arg_idx_to_cl_arg_idx)

    from genpy import Suite, Assign, Assert, Line, Comment
    from pymbolic.mapper.stringifier import PREC_NONE

    # The generated call passes allow_empty_ndrange, which needs a
    # sufficiently recent PyOpenCL; warn rather than fail at generation.
    import pyopencl.version as cl_ver
    if cl_ver.VERSION < (2020, 2):
        from warnings import warn
        warn("Your kernel invocation will likely fail because your "
             "version of PyOpenCL does not support allow_empty_ndrange. "
             "Please upgrade to version 2020.2 or newer.")

    module_name = self.target.pyopencl_module_name
    gsize_code = ecm(gsize, prec=PREC_NONE, type_context="i")
    lsize_code = ecm(lsize, prec=PREC_NONE, type_context="i")

    enqueue_call = (
        "%s.enqueue_nd_range_kernel("
        "queue, _lpy_knl, "
        "%s, %s, "
        # using positional args because pybind is slow with kwargs
        "None, "  # offset
        "wait_for, "
        "True, "  # g_times_l
        "True, "  # allow_empty_ndrange
        ")" % (module_name, gsize_code, lsize_code))

    # TODO: Generate finer-grained dependency structure
    statements = [
        Comment("{{{ enqueue %s" % name),
        Line(),
        Assign("_lpy_knl", "_lpy_cl_kernels." + name),
        Assert("_lpy_knl.num_args == %d" % cl_arg_count),
        Line(),
        value_arg_code,
        arry_arg_code,
        Assign("_lpy_evt", enqueue_call),
        Assign("wait_for", "[_lpy_evt]"),
        Line(),
        Comment("}}}"),
        Line(),
    ]
    return Suite(statements)
def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx):
    """Generate one ``_lpy_knl.set_arg(...)`` statement per array argument.

    :arg kernel: not used by this implementation.
    :arg implemented_data_info: sequence of entries exposing ``arg_class``
        and ``name``; entries whose ``arg_class`` is not an
        :class:`ArrayBase` subclass are skipped.
    :arg arg_idx_to_cl_arg_idx: maps a position in *implemented_data_info*
        to the kernel argument index passed to ``set_arg``.
    :returns: a :class:`genpy.Suite` of the generated statements.
    """
    from loopy.kernel.array import ArrayBase
    from genpy import Statement as S, Suite

    set_arg_stmts = [
        S("_lpy_knl.set_arg(%d, %s)"
            % (arg_idx_to_cl_arg_idx[pos], info.name))
        for pos, info in enumerate(implemented_data_info)
        if issubclass(info.arg_class, ArrayBase)]

    return Suite(set_arg_stmts)
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
    """Generate the host-side Python code that enqueues one device kernel.

    The emitted suite fetches kernel *name* from ``_lpy_cl_kernels``,
    asserts its argument count, sets value and array arguments and calls
    ``enqueue_nd_range_kernel`` with ``wait_for=wait_for`` and
    ``g_times_l=True``. The event is stored in ``_lpy_evt`` and
    ``wait_for`` is rebound to ``[_lpy_evt]``.

    :arg codegen_state: supplies ``kernel`` and ``implemented_data_info``.
    :arg name: attribute name of the kernel on ``_lpy_cl_kernels``.
    :arg gsize: global size; replaced by ``(1,)`` when empty.
    :arg lsize: local size; replaced by ``(1,)`` when empty.
    :arg extra_args: appended to ``codegen_state.implemented_data_info``.
    :returns: a :class:`genpy.Suite`.
    """
    ecm = self.get_expression_to_code_mapper(codegen_state)

    # An empty size is replaced by a single-entry one.
    gsize = gsize or (1,)
    lsize = lsize or (1,)

    kernel = codegen_state.kernel
    all_args = codegen_state.implemented_data_info + extra_args

    value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
        generate_value_arg_setup(kernel, [self.target.device], all_args)
    arry_arg_code = generate_array_arg_setup(
        kernel, all_args, arg_idx_to_cl_arg_idx)

    from genpy import Suite, Assign, Assert, Line, Comment
    from pymbolic.mapper.stringifier import PREC_NONE

    module_name = self.target.pyopencl_module_name
    gsize_code = ecm(gsize, prec=PREC_NONE, type_context="i")
    lsize_code = ecm(lsize, prec=PREC_NONE, type_context="i")

    enqueue_call = (
        "%s.enqueue_nd_range_kernel("
        "queue, _lpy_knl, "
        "%s, %s, wait_for=wait_for, g_times_l=True)"
        % (module_name, gsize_code, lsize_code))

    # TODO: Generate finer-grained dependency structure
    statements = [
        Comment("{{{ enqueue %s" % name),
        Line(),
        Assign("_lpy_knl", "_lpy_cl_kernels."+name),
        Assert("_lpy_knl.num_args == %d" % cl_arg_count),
        Line(),
        value_arg_code,
        arry_arg_code,
        Assign("_lpy_evt", enqueue_call),
        Assign("wait_for", "[_lpy_evt]"),
        Line(),
        Comment("}}}"),
        Line(),
    ]
    return Suite(statements)
def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx):
    """Generate a single ``_lpy_knl._set_arg_multi(...)`` call for all arrays.

    The call receives one flat tuple of alternating kernel argument index
    and argument name. Nothing is generated when there are no array
    arguments.

    :arg kernel: not used by this implementation.
    :arg implemented_data_info: sequence of entries exposing ``arg_class``
        and ``name``; entries whose ``arg_class`` is not an
        :class:`ArrayBase` subclass are skipped.
    :arg arg_idx_to_cl_arg_idx: maps a position in *implemented_data_info*
        to the kernel argument index.
    :returns: a :class:`genpy.Suite` with zero or one statement.
    """
    from loopy.kernel.array import ArrayBase
    from genpy import Statement as S, Suite

    # Flat list: index, name, index, name, ...
    flat_pairs = []
    for pos, info in enumerate(implemented_data_info):
        if not issubclass(info.arg_class, ArrayBase):
            continue
        flat_pairs.append(arg_idx_to_cl_arg_idx[pos])
        flat_pairs.append(info.name)

    if not flat_pairs:
        return Suite([])

    assert len(flat_pairs) % 2 == 0

    tuple_body = ", ".join(map(str, flat_pairs))
    return Suite([S(f"_lpy_knl._set_arg_multi(({tuple_body},))")])
def get_function_definition(self, codegen_state, codegen_result,
        schedule_index, function_decl, function_body):
    """Wrap *function_body* in the generated host-side Python function.

    The generated function takes ``_lpy_cl_kernels`` and ``queue``, then the
    name of every entry of ``codegen_state.implemented_data_info`` whose
    ``arg_class`` is not a :class:`TemporaryVariable` subclass, and finally
    ``wait_for=None`` and ``allocator=None``. Its body starts with imports
    and a default for ``allocator``, runs *function_body*, releases every
    entry of ``_global_temporaries`` and returns ``_lpy_evt``.

    :arg schedule_index: not used by this implementation.
    :arg function_decl: not used by this implementation.
    :returns: a :class:`genpy.Function`.
    """
    from loopy.kernel.data import TemporaryVariable

    # Temporaries are not passed in by the caller, so they are left out
    # of the signature.
    passed_names = []
    for info in codegen_state.implemented_data_info:
        if issubclass(info.arg_class, TemporaryVariable):
            continue
        passed_names.append(info.name)

    signature = (
        ["_lpy_cl_kernels", "queue"]
        + passed_names
        + ["wait_for=None", "allocator=None"])

    from genpy import (For, Function, Suite, Import, ImportAs, Return,
            FromImport, If, Assign, Line, Statement as S)

    program_name = codegen_result.current_program(codegen_state).name

    # NOTE(review): `_lpy_cl_tools` is not bound by the imports emitted
    # below; presumably the surrounding generated module provides it --
    # confirm.
    prologue = [
        FromImport("struct", ["pack as _lpy_pack"]),
        ImportAs("pyopencl", "_lpy_cl"),
        Import("pyopencl.tools"),
        Line(),
        If(
            "allocator is None",
            Assign(
                "allocator",
                "_lpy_cl_tools.DeferredAllocator(queue.context)")),
        Line(),
    ]

    main_part = [Line(), function_body, Line()]

    # free global temporaries
    cleanup = [For("_tv", "_global_temporaries", S("_tv.release()"))]

    epilogue = [Line(), Return("_lpy_evt")]

    return Function(
        program_name,
        signature,
        Suite(prologue + main_part + cleanup + epilogue))
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Generate host code that sets every value (non-array) kernel argument.

    Also assigns a kernel argument index to every entry of
    *implemented_data_info*, array entries included.

    :arg kernel: supplies ``options.skip_arg_checks`` and ``name``.
    :arg devices: sequence of devices (entries may be *None*) used to
        decide whether the struct-argument-count workaround is applied.
    :arg implemented_data_info: sequence of entries exposing ``arg_class``,
        ``name`` and (for value arguments) ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_count)``:
        the generated :class:`genpy.Suite`, a dict from position in
        *implemented_data_info* to kernel argument index, and the total
        number of kernel arguments.
    :raises TypeError: for a complex dtype other than complex64/complex128.
    :raises LoopyError: for a value dtype that cannot be passed.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    try:
        from pyopencl.characterize import has_struct_arg_count_bug
    except ImportError:
        count_bug_per_dev = [False]*len(devices)
    else:
        count_bug_per_dev = [
            False if dev is None else has_struct_arg_count_bug(dev)
            for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed"
                .format(knl_name=kernel.name))

    # Work around the bug only if all devices have it; if only some do,
    # just warn (once complex arguments are actually encountered).
    some_dev_has_bug = any(count_bug_per_dev)
    work_around_arg_count_bug = some_dev_has_bug and all(count_bug_per_dev)
    warn_about_arg_count_bug = (
        some_dev_has_bug and not work_around_arg_count_bug)

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    statements = []
    emit = statements.append

    for pos, info in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[pos] = cl_arg_idx

        if not issubclass(info.arg_class, lp.ValueArg):
            assert issubclass(info.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1
            continue

        arg_name = info.name

        emit(Comment("{{{ process %s" % arg_name))
        emit(Line())

        if not options.skip_arg_checks:
            emit(If("%s is None" % arg_name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=arg_name))))

        if info.dtype.is_integral():
            emit(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            py_type = "long" if sys.version_info < (3,) else "int"
            emit(Assign(arg_name, "%s(%s)" % (py_type, arg_name)))
            emit(Line())

        if info.dtype.is_composite():
            emit(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, arg_name)))
            cl_arg_idx += 1

        elif info.dtype.is_complex():
            assert isinstance(info.dtype, NumpyType)

            dtype = info.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            pass_parts_separately = (
                work_around_arg_count_bug
                and dtype.numpy_dtype == np.complex128
                and fp_arg_count + 2 <= 8)

            if pass_parts_separately:
                # Real and imaginary part each become their own kernel
                # argument.
                for part in ("real", "imag"):
                    emit(Assign(
                        "_lpy_buf",
                        "_lpy_pack('%s', %s.%s)"
                        % (arg_char, arg_name, part)))
                    emit(S(
                        "_lpy_knl.set_arg(%d, _lpy_buf)" % cl_arg_idx))
                    cl_arg_idx += 1
            else:
                emit(Assign(
                    "_lpy_buf",
                    "_lpy_pack('%s%s', %s.real, %s.imag)"
                    % (arg_char, arg_char, arg_name, arg_name)))
                emit(S(
                    "_lpy_knl.set_arg(%d, _lpy_buf)" % cl_arg_idx))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(info.dtype, NumpyType):
            if info.dtype.dtype.kind == "f":
                fp_arg_count += 1

            emit(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, info.dtype.dtype.char, arg_name)))
            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % info.dtype)

        emit(Line())
        emit(Comment("}}}"))
        emit(Line())

    return Suite(statements), arg_idx_to_cl_arg_idx, cl_arg_idx
def generate_value_arg_setup(kernel, implemented_data_info):
    """Generate host code that sets every value (non-array) kernel argument.

    Value arguments are collected into two flat lists and passed to the
    kernel in at most two generated calls:

    * ``_lpy_knl._set_arg_buf_multi`` receives ``(index, buffer_expr)``
      pairs,
    * ``_lpy_knl._set_arg_buf_pack_multi`` receives
      ``(index, typechar_bytes, value_expr)`` triples.

    Also assigns a kernel argument index to every entry of
    *implemented_data_info*, array entries included.

    :arg kernel: supplies ``options.skip_arg_checks``.
    :arg implemented_data_info: sequence of entries exposing ``arg_class``,
        ``name`` and (for value arguments) ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_count)``:
        the generated :class:`genpy.Suite`, a dict from position in
        *implemented_data_info* to kernel argument index, and the total
        number of kernel arguments.
    :raises TypeError: for a complex dtype other than complex64/complex128.
    :raises LoopyError: for a value dtype that cannot be passed.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    from genpy import If, Raise, Statement as S, Suite

    result = []
    gen = result.append

    # Flat lists of generated-code fragments: pairs for "_buf",
    # triples for "_buf_pack" (see the final loop below).
    buf_indices_and_args = []
    buf_pack_indices_and_args = []

    from pyopencl.invoker import BUF_PACK_TYPECHARS

    def add_buf_arg(arg_idx, typechar, expr_str):
        if typechar in BUF_PACK_TYPECHARS:
            buf_pack_indices_and_args.append(arg_idx)
            buf_pack_indices_and_args.append(repr(typechar.encode()))
            buf_pack_indices_and_args.append(expr_str)
        else:
            # Pack in the generated code itself. Use the same
            # '_lpy_pack' alias as the complex branch below; a bare
            # 'pack' is not a name the generated code imports.
            buf_indices_and_args.append(arg_idx)
            buf_indices_and_args.append(
                f"_lpy_pack('{typechar}', {expr_str})")

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1
            continue

        if not options.skip_arg_checks:
            gen(
                If(
                    "%s is None" % idi.name,
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_composite():
            # Composite values are handed over as-is.
            buf_indices_and_args.append(cl_arg_idx)
            buf_indices_and_args.append(f"{idi.name}")
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            # Real and imaginary part are packed into one buffer.
            buf_indices_and_args.append(cl_arg_idx)
            buf_indices_and_args.append(
                f"_lpy_pack('{arg_char}{arg_char}', "
                f"{idi.name}.real, {idi.name}.imag)")
            cl_arg_idx += 1

        elif isinstance(idi.dtype, NumpyType):
            add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name)
            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                             % idi.dtype)

    for arg_kind, args_and_indices, entry_length in [
            ("_buf", buf_indices_and_args, 2),
            ("_buf_pack", buf_pack_indices_and_args, 3),
            ]:
        assert len(args_and_indices) % entry_length == 0
        if args_and_indices:
            gen(
                S(f"_lpy_knl._set_arg{arg_kind}_multi("
                  f"({', '.join(str(i) for i in args_and_indices)},), "
                  ")"))

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Generate host code that sets every value (non-array) kernel argument.

    Value arguments are collected into two flat lists and passed to the
    kernel in at most two generated calls:

    * ``_lpy_knl._set_arg_buf_multi`` receives ``(index, buffer_expr)``
      pairs,
    * ``_lpy_knl._set_arg_buf_pack_multi`` receives
      ``(index, typechar_bytes, value_expr)`` triples.

    Also assigns a kernel argument index to every entry of
    *implemented_data_info*, array entries included.

    :arg kernel: supplies ``options.skip_arg_checks`` and ``name``.
    :arg devices: sequence of devices (entries may be *None*) used to
        decide whether the struct-argument-count workaround is applied.
    :arg implemented_data_info: sequence of entries exposing ``arg_class``,
        ``name`` and (for value arguments) ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_count)``:
        the generated :class:`genpy.Suite`, a dict from position in
        *implemented_data_info* to kernel argument index, and the total
        number of kernel arguments.
    :raises TypeError: for a complex dtype other than complex64/complex128.
    :raises LoopyError: for a value dtype that cannot be passed.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False] * len(devices)

    else:
        count_bug_per_dev = [
            has_struct_arg_count_bug(dev) if dev is not None else False
            for dev in devices
        ]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
             "workarounds for broken OpenCL implementations "
             "(such as those relating to complex numbers) "
             "may not be enabled when needed. To avoid this, "
             "pass target=lp.PyOpenCLTarget(dev) when creating "
             "the kernel.".format(knl_name=kernel.name))

    # Work around the bug only if all devices have it; if only some do,
    # just warn (once complex arguments are actually encountered).
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Running count of floating-point scalars passed so far; only read
    # by the complex128 workaround condition below.
    fp_arg_count = 0

    from genpy import If, Raise, Statement as S, Suite

    result = []
    gen = result.append

    # Flat lists of generated-code fragments: pairs for "_buf",
    # triples for "_buf_pack" (see the final loop below).
    buf_indices_and_args = []
    buf_pack_indices_and_args = []

    from pyopencl.invoker import BUF_PACK_TYPECHARS

    def add_buf_arg(arg_idx, typechar, expr_str):
        if typechar in BUF_PACK_TYPECHARS:
            buf_pack_indices_and_args.append(arg_idx)
            buf_pack_indices_and_args.append(repr(typechar.encode()))
            buf_pack_indices_and_args.append(expr_str)
        else:
            # Pack in the generated code itself. Use the same
            # '_lpy_pack' alias as the complex branch below; a bare
            # 'pack' is not a name the generated code imports.
            buf_indices_and_args.append(arg_idx)
            buf_indices_and_args.append(
                f"_lpy_pack('{typechar}', {expr_str})")

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1
            continue

        if not options.skip_arg_checks:
            gen(
                If(
                    "%s is None" % idi.name,
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_composite():
            # Composite values are handed over as-is.
            buf_indices_and_args.append(cl_arg_idx)
            buf_indices_and_args.append(f"{idi.name}")
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                     "some (but not all) of the target devices mishandle "
                     "struct kernel arguments (hence the workaround is "
                     "disabled".format(knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: real and imaginary part each become their
                # own kernel argument.
                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.real")
                cl_arg_idx += 1
                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.imag")
                cl_arg_idx += 1
            else:
                # Real and imaginary part are packed into one buffer.
                buf_indices_and_args.append(cl_arg_idx)
                buf_indices_and_args.append(
                    f"_lpy_pack('{arg_char}{arg_char}', "
                    f"{idi.name}.real, {idi.name}.imag)")
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name)
            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                             % idi.dtype)

    for arg_kind, args_and_indices, entry_length in [
            ("_buf", buf_indices_and_args, 2),
            ("_buf_pack", buf_pack_indices_and_args, 3),
            ]:
        assert len(args_and_indices) % entry_length == 0
        if args_and_indices:
            gen(
                S(f"_lpy_knl._set_arg{arg_kind}_multi("
                  f"({', '.join(str(i) for i in args_and_indices)},), "
                  ")"))

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx