def test_custom_type_fill(ctx_factory):
    """Zero-fill a device array of a registered struct dtype and verify it.

    The test is skipped on devices for which
    ``has_struct_arg_count_bug`` reports a problem.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.skip("device has LLVM arg counting bug")

    fields = [
        ("cur_min", np.int32),
        ("cur_max", np.int32),
        ("pad", np.int32),
    ]
    dtype = np.dtype(fields)

    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    name = "mmc_type"
    # The C declaration returned alongside the matched dtype is not needed here.
    dtype, _ = match_dtype_to_c_struct(queue.device, name, dtype)
    dtype = get_or_register_dtype(name, dtype)

    n = 1000
    z_dev = cl.array.empty(queue, n, dtype=dtype)
    zero_record = np.zeros((), dtype)
    z_dev.fill(zero_record)

    z = z_dev.get()
    expected = np.zeros(n, dtype)
    assert np.array_equal(expected, z)
def __init__(self, ctx_getter=cl.create_some_context, enable_extents=False):
    """Build a tree and FMM traversal over 9,000,000 2D float32 sources.

    The device-side tree and traversal are built, the traversal is copied
    to the host, and the results are stored on the instance:

    * ``self.tree``  -- host tree (``host_trav.tree``)
    * ``self.trav``  -- host traversal
    * ``self.input`` -- ``[host_tree, weights, weights_sum, host_trav]``
    * ``self.pot``   -- initialised to ``None``

    :arg ctx_getter: zero-argument callable returning an OpenCL context.
    :arg enable_extents: accepted for interface compatibility. The tree
        is built from the sources only (no targets, target radii or
        stick-out factor are passed to the builder), so this flag has no
        effect on the stored results.

    Calls ``pytest.xfail`` on devices with the struct arg count issue.
    """
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.xfail(
            "won't work on devices with the struct arg count issue")

    logging.basicConfig(level=logging.INFO)

    dims = 2
    nsources = 9000000
    dtype = np.float32

    sources = p_normal(queue, nsources, dims, dtype, seed=15)

    # NOTE: target particles and target radii used to be generated here but
    # were never handed to the tree builder; that dead device work has been
    # dropped. Re-introduce it together with the corresponding builder
    # arguments (targets=, target_radii=, stick_out_factor=) if needed.

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    tree, _ = tb(
        queue,
        sources,
        max_particles_in_box=30,
        debug=True)

    from boxtree.traversal import FMMTraversalBuilder
    tbuild = FMMTraversalBuilder(ctx)
    trav, _ = tbuild(queue, tree, debug=True)

    weights = np.ones(nsources)
    weights_sum = np.sum(weights)

    host_trav = trav.get(queue=queue)
    host_tree = host_trav.tree

    self.tree = host_tree
    self.trav = host_trav
    self.input = [host_tree, weights, weights_sum, host_trav]
    self.pot = None
def test_fmm_float32(ctx_getter=cl.create_some_context, enable_extents=True):
    """Drive the FMM with a constant-one wrangler on 3,000,000 particles.

    Sources and targets are generated in 2D with ``float32`` coordinates.
    With unit weights, every entry of the potential is asserted to equal
    the sum of the weights.

    :arg ctx_getter: zero-argument callable returning an OpenCL context.
    :arg enable_extents: if true, targets get random radii in
        ``2**uniform(-10, 0)``; otherwise no radii are passed.

    The elapsed time of ``drive_fmm`` is printed for information only.
    """
    # perf_counter is monotonic and meant for interval timing; time.time()
    # can jump if the system clock is adjusted.
    from time import perf_counter

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.xfail("won't work on devices with the struct arg count issue")

    logging.basicConfig(level=logging.INFO)

    dims = 2
    nsources = 3000000
    ntargets = 3000000
    dtype = np.float32

    from boxtree.fmm import drive_fmm

    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    targets = p_normal(queue, ntargets, dims, dtype, seed=15)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=12)
    if enable_extents:
        target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)
    else:
        target_radii = None

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    tree, _ = tb(
        queue,
        sources,
        targets=targets,
        max_particles_in_box=30,
        target_radii=target_radii,
        stick_out_factor=0.25,
        debug=True)

    from boxtree.traversal import FMMTraversalBuilder
    tbuild = FMMTraversalBuilder(ctx)
    trav, _ = tbuild(queue, tree, debug=True)

    weights = np.ones(nsources)
    weights_sum = np.sum(weights)

    host_trav = trav.get(queue=queue)
    host_tree = host_trav.tree

    wrangler = ConstantOneExpansionWrangler(host_tree)

    start = perf_counter()
    pot = drive_fmm(host_trav, wrangler, weights)
    print(perf_counter() - start)

    assert (pot == weights_sum).all()
def test_dot(ctx_factory):
    """Compare device ``dot``/``vdot`` against numpy for all dtype pairs.

    Double-precision dtypes are only included when the device supports
    them; ``complex128`` is left out when the struct arg count bug check
    reports ``"apple"``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        dtypes.append(np.float64)
        if has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    vec_shape = (200000,)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, vec_shape, a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, vec_shape, b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
            dot_rel_err = abs(dot_ab_gpu - dot_ab) / abs(dot_ab)
            assert dot_rel_err < 1e-4

            try:
                vdot_ab = np.vdot(a, b)
            except NotImplementedError:
                import sys
                # Only PyPy is allowed to lack vdot; anything else is an error.
                if "__pypy__" not in sys.builtin_module_names:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()
            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
            assert rel_err < 1e-4, rel_err
def test_dot(ctx_factory):
    """Check ``cl_array.dot`` and ``cl_array.vdot`` against their numpy twins.

    Every ordered pair of supported dtypes is exercised on random vectors
    of 200000 entries, with a relative tolerance of ``1e-4``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        # complex128 is excluded when the "apple" struct arg bug is detected
        if has_struct_arg_count_bug(dev) == "apple":
            extra_dtypes = [np.float64]
        else:
            extra_dtypes = [np.float64, np.complex128]
        dtypes.extend(extra_dtypes)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            host_dot = np.dot(a, b)
            dev_dot = cl_array.dot(a_gpu, b_gpu).get()
            assert abs(dev_dot - host_dot) / abs(host_dot) < 1e-4

            try:
                host_vdot = np.vdot(a, b)
            except NotImplementedError:
                import sys
                running_on_pypy = '__pypy__' in sys.builtin_module_names
                if not running_on_pypy:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            dev_vdot = cl_array.vdot(a_gpu, b_gpu).get()
            rel_err = abs(dev_vdot - host_vdot) / abs(host_vdot)
            assert rel_err < 1e-4, rel_err
def test_pow_neg1_vs_inv(ctx_factory):
    """Check that ``a**(-1)`` and ``1/a`` on the device match the host inverse.

    Uses a random ``complex128`` array of 20000 entries. Skipped without
    double support; xfails when the "apple" struct arg bug is detected.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]

    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)

    if has_struct_arg_count_bug(device) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    via_power = (a_dev ** (-1)).get()
    via_division = (1/a_dev).get()
    ref = 1/a_dev.get()

    ref_norm = la.norm(ref)
    for candidate in (via_power, via_division):
        assert la.norm(candidate - ref, np.inf) / ref_norm < 1e-13
def test_pow_neg1_vs_inv(ctx_factory):
    """Verify device-side ``a**(-1)`` and ``1/a`` against ``1/a`` on the host.

    The relative infinity-norm error of both device results must stay
    below ``1e-13`` for a random ``complex128`` array.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]

    # Guard clauses: the test needs working complex128 kernel arguments.
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)
    if has_struct_arg_count_bug(device) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    res1 = (a_dev**(-1)).get()
    res2 = (1 / a_dev).get()
    ref = 1 / a_dev.get()

    scale = la.norm(ref)
    err_pow = la.norm(res1 - ref, np.inf) / scale
    err_inv = la.norm(res2 - ref, np.inf) / scale

    assert err_pow < 1e-13
    assert err_inv < 1e-13
def test(ctx_factory):
    """Compare the ``clmath`` function ``name`` with its numpy counterpart.

    ``name``, ``sizes``, ``a``, ``b``, ``threshold``, ``use_complex`` and
    ``numpy_func_names`` come from the enclosing scope. If ``use_complex``
    is a float, it replaces ``threshold`` for complex dtypes.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    dev = context.devices[0]
    double_ok = has_double_support(dev)

    # Assemble the dtype list incrementally instead of enumerating each case.
    real_dtypes = [np.float32]
    if double_ok:
        real_dtypes.append(np.float64)

    complex_dtypes = []
    if use_complex:
        complex_dtypes.append(np.complex64)
        if double_ok and has_struct_arg_count_bug(dev) != "apple":
            complex_dtypes.append(np.complex128)

    dtypes = real_dtypes + complex_dtypes

    for s in sizes:
        for dtype in dtypes:
            dtype = np.dtype(dtype)
            is_complex = dtype.kind == "c"

            args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
            if is_complex:
                args = args + args * dtype.type(1j)

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            if is_complex and isinstance(use_complex, float):
                my_threshold = use_complex
            else:
                my_threshold = threshold

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= my_threshold).all(), \
                    (max_err, name, dtype)
def test(ctx_factory):
    """Check one ``clmath`` function against numpy over several sizes/dtypes.

    The function is looked up by ``name`` (from the enclosing scope) on
    ``clmath`` and, after translation through ``numpy_func_names``, on
    numpy. The maximum absolute error must not exceed the threshold.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    numpy_name = numpy_func_names.get(name, name)
    cpu_func = getattr(np, numpy_name)

    dev = context.devices[0]
    have_double = has_double_support(dev)

    dtypes = [np.float32]
    if have_double:
        dtypes.append(np.float64)
    if use_complex:
        dtypes.append(np.complex64)
        # complex128 needs doubles and working struct kernel arguments
        if have_double and has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    for s in sizes:
        step = (b-a)/s
        for dtype in dtypes:
            dtype = np.dtype(dtype)

            args = cl_array.arange(queue, a, b, step, dtype=dtype)
            if dtype.kind == "c":
                args = args + args * dtype.type(1j)

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            my_threshold = threshold
            if dtype.kind == "c" and isinstance(use_complex, float):
                my_threshold = use_complex

            abs_diff = np.abs(cpu_results - gpu_results)
            max_err = np.max(abs_diff)
            assert (max_err <= my_threshold).all(), \
                    (max_err, name, dtype)
def test_mix_complex(ctx_factory):
    """Exercise arithmetic between operands of mixed real/complex dtypes.

    For each operator, dtype pair (in both orders) and array/scalar
    combination, the device result is compared with the host result using
    a relative tolerance of ``1e-4``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    dtypes = [
            (np.float32, np.complex64),
            #(np.int32, np.complex64),
            ]

    dev = context.devices[0]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            double_pairs = [
                (np.float32, np.float64),
                ]
        else:
            double_pairs = [
                (np.float32, np.float64),
                (np.float32, np.complex128),
                (np.float64, np.complex64),
                (np.float64, np.complex128),
                ]
        dtypes.extend(double_pairs)

    def make_operand(dtype, is_scalar):
        """Return ``(operand, host_operand)`` for a random array or scalar."""
        if is_scalar:
            value = make_random_array(queue, dtype, 1).get()[0]
            return value, value
        dev_ary = make_random_array(queue, dtype, size)
        return dev_ary, dev_ary.get()

    # (is_scalar_a, is_scalar_b); scalar-with-scalar is intentionally absent
    scalar_combos = (
            (False, False),
            (False, True),
            (True, False),
            )

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for dtype_a0, dtype_b0 in dtypes:
            for dtype_a, dtype_b in [
                    (dtype_a0, dtype_b0),
                    (dtype_b0, dtype_a0),
                    ]:
                for is_scalar_a, is_scalar_b in scalar_combos:
                    ary_a, host_ary_a = make_operand(dtype_a, is_scalar_a)
                    ary_b, host_ary_b = make_operand(dtype_b, is_scalar_b)

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
                                host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    difference_norm = la.norm(host_result-dev_result)
                    err = difference_norm/la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct
def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
    """Emit invoker code that sets the value (scalar) arguments of *cl_kernel*.

    Source lines are emitted through *gen*. Array arguments are not
    handled here; each of them is assumed to occupy exactly one OpenCL
    argument slot, which is skipped.

    :arg gen: callable accepting a string of generated Python source.
    :arg kernel: the kernel being processed; only ``kernel.name`` is used
        (in a warning message).
    :arg cl_kernel: OpenCL kernel object; its context's devices are checked
        for the struct arg count bug and ``num_args`` is used as a
        consistency check.
    :arg impl_arg_info: sequence of argument descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :arg options: object providing ``skip_arg_checks``.
    :returns: a dict mapping each index into *impl_arg_info* to the index
        of the first OpenCL kernel argument generated for it.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    """
    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    devices = cl_kernel.context.devices

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                for dev in devices]

    # The workaround is only usable if *all* devices are affected;
    # with a mix of devices, we can only warn.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating point scalars passed so far (complex counts twice)
    fp_arg_count = 0

    for arg_idx, arg in enumerate(impl_arg_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if arg.arg_class is not lp.ValueArg:
            assert issubclass(arg.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.skip_arg_checks:
            gen("""
                if {name} is None:
                    raise RuntimeError("input argument '{name}' must "
                        "be supplied")
                """.format(name=arg.name))

        if sys.version_info < (2, 7) and arg.dtype.kind == "i":
            gen("# cast to long to avoid trouble with struct packing")
            gen("%s = long(%s)" % (arg.name, arg.name))
            gen("")

        if arg.dtype.char == "V":
            # void ("V") dtypes are handed to set_arg without packing
            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
            cl_arg_idx += 1

        elif arg.dtype.kind == "c":
            if warn_about_arg_count_bug:
                from warnings import warn
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if arg.dtype == np.complex64:
                arg_char = "f"
            elif arg.dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % arg.dtype)

            if (work_around_arg_count_bug
                    and arg.dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary parts as two separate
                # kernel arguments.
                gen(
                        "buf = _lpy_pack('{arg_char}', {arg_var}.real)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1

                gen(
                        "buf = _lpy_pack('{arg_char}', {arg_var}.imag)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1
            else:
                gen(
                        "buf = _lpy_pack('{arg_char}{arg_char}', "
                        "{arg_var}.real, {arg_var}.imag)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1

            fp_arg_count += 2

        else:
            if arg.dtype.kind == "f":
                fp_arg_count += 1

            gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
                    % (cl_arg_idx, arg.dtype.char, arg.name))

            cl_arg_idx += 1

        gen("")

        gen("# }}}")
        gen("")

    assert cl_arg_idx == cl_kernel.num_args

    return arg_idx_to_cl_arg_idx
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build genpy code that sets the value (scalar) arguments of a kernel.

    Array arguments are not handled here; each of them is assumed to
    occupy exactly one OpenCL argument slot, which is skipped.

    :arg kernel: provides ``options`` (``skip_arg_checks``) and ``name``
        (used in warning messages).
    :arg devices: sequence of devices; entries may be *None*, in which
        case the struct arg count bug cannot be detected for them and a
        warning is issued.
    :arg implemented_data_info: sequence of descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)`` of the
        generated ``genpy.Suite``, a dict mapping each index into
        *implemented_data_info* to the first OpenCL argument index
        generated for it, and the total number of OpenCL arguments.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    :raises LoopyError: for value arguments of unsupported type.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed"
                .format(knl_name=kernel.name))

    # The workaround is only usable if *all* devices are affected;
    # with a mix of devices, we can only warn.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating point scalars passed so far (complex counts twice)
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            if sys.version_info < (3,):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # composite values are handed to set_arg without packing
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary parts as two separate
                # kernel arguments.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
def test_mix_complex(ctx_factory):
    """Compare device and host arithmetic on operands of mixed dtypes.

    Covers ``add``, ``sub``, ``mul``, ``truediv`` and ``pow`` for
    array/array, array/scalar and scalar/array operands, with each dtype
    pair tried in both orders.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    dtypes = [
        (np.float32, np.complex64),
        #(np.int32, np.complex64),
    ]

    dev = context.devices[0]
    if has_double_support(dev):
        # Without working struct arguments ("apple"), complex128 is left out.
        if has_struct_arg_count_bug(dev) == "apple":
            dtypes.extend([
                (np.float32, np.float64),
            ])
        else:
            dtypes.extend([
                (np.float32, np.float64),
                (np.float32, np.complex128),
                (np.float64, np.complex64),
                (np.float64, np.complex128),
            ])

    def random_operand(dtype, as_scalar):
        """Return a ``(device_side, host_side)`` pair of random operands."""
        if not as_scalar:
            on_device = make_random_array(queue, dtype, size)
            return on_device, on_device.get()
        scalar = make_random_array(queue, dtype, 1).get()[0]
        return scalar, scalar

    from operator import add, mul, sub, truediv
    operators = [add, sub, mul, truediv, pow]
    scalar_flags = [
        (False, False),
        (False, True),
        (True, False),
    ]

    for op in operators:
        for dtype_a0, dtype_b0 in dtypes:
            both_orders = [
                (dtype_a0, dtype_b0),
                (dtype_b0, dtype_a0),
            ]
            for dtype_a, dtype_b in both_orders:
                for is_scalar_a, is_scalar_b in scalar_flags:
                    ary_a, host_ary_a = random_operand(dtype_a, is_scalar_a)
                    ary_b, host_ary_b = random_operand(dtype_b, is_scalar_b)

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: {} DEV_DTYPE: {}".format(
                            host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    residual = la.norm(host_result - dev_result)
                    err = residual / la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build genpy code that sets the value (scalar) arguments of a kernel.

    Array arguments are not handled here; each of them is assumed to
    occupy exactly one OpenCL argument slot, which is skipped.

    :arg kernel: provides ``options`` (``skip_arg_checks``) and ``name``
        (used in warning messages).
    :arg devices: sequence of devices; entries may be *None*, in which
        case the struct arg count bug cannot be detected for them and a
        warning is issued.
    :arg implemented_data_info: sequence of descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)`` of the
        generated ``genpy.Suite``, a dict mapping each index into
        *implemented_data_info* to the first OpenCL argument index
        generated for it, and the total number of OpenCL arguments.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    :raises LoopyError: for value arguments of unsupported type.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed. To avoid this, "
                "pass target=lp.PyOpenCLTarget(dev) when creating "
                "the kernel."
                .format(knl_name=kernel.name))

    # The workaround is only usable if *all* devices are affected;
    # with a mix of devices, we can only warn.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating point scalars passed so far (complex counts twice)
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            if sys.version_info < (3,):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # composite values are handed to set_arg without packing
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary parts as two separate
                # kernel arguments.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build genpy code that sets the value (scalar) arguments of a kernel.

    Instead of one ``set_arg`` call per argument, the arguments are
    collected into two flat lists and passed in bulk via
    ``_lpy_knl._set_arg_buf_multi`` (entries of ``index, buffer``) and
    ``_lpy_knl._set_arg_buf_pack_multi`` (entries of
    ``index, typechar, value``).

    Array arguments are not handled here; each of them is assumed to
    occupy exactly one OpenCL argument slot, which is skipped.

    :arg kernel: provides ``options`` (``skip_arg_checks``) and ``name``
        (used in warning messages).
    :arg devices: sequence of devices; entries may be *None*, in which
        case the struct arg count bug cannot be detected for them and a
        warning is issued.
    :arg implemented_data_info: sequence of descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)`` of the
        generated ``genpy.Suite``, a dict mapping each index into
        *implemented_data_info* to the first OpenCL argument index
        generated for it, and the total number of OpenCL arguments.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    :raises LoopyError: for value arguments of unsupported type.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False] * len(devices)

    else:
        count_bug_per_dev = [
            has_struct_arg_count_bug(dev) if dev is not None else False
            for dev in devices
        ]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
             "workarounds for broken OpenCL implementations "
             "(such as those relating to complex numbers) "
             "may not be enabled when needed. To avoid this, "
             "pass target=lp.PyOpenCLTarget(dev) when creating "
             "the kernel.".format(knl_name=kernel.name))

    # The workaround is only usable if *all* devices are affected;
    # with a mix of devices, we can only warn.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating point scalars passed so far (complex counts twice)
    fp_arg_count = 0

    from genpy import If, Raise, Statement as S, Suite

    result = []
    gen = result.append

    # Flat lists of source fragments: (index, buffer expression) pairs and
    # (index, typechar, value expression) triples, respectively.
    buf_indices_and_args = []
    buf_pack_indices_and_args = []

    from pyopencl.invoker import BUF_PACK_TYPECHARS

    def add_buf_arg(arg_idx, typechar, expr_str):
        """Queue *expr_str* of struct type *typechar* for argument *arg_idx*."""
        if typechar in BUF_PACK_TYPECHARS:
            buf_pack_indices_and_args.append(arg_idx)
            buf_pack_indices_and_args.append(repr(typechar.encode()))
            buf_pack_indices_and_args.append(expr_str)
        else:
            buf_indices_and_args.append(arg_idx)
            # Use the same packing helper name (_lpy_pack) as the complex
            # path below, rather than a bare 'pack'.
            buf_indices_and_args.append(
                f"_lpy_pack('{typechar}', {expr_str})")

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        if not options.skip_arg_checks:
            gen(
                If(
                    "%s is None" % idi.name,
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_composite():
            # composite values are passed as buffers without packing
            buf_indices_and_args.append(cl_arg_idx)
            buf_indices_and_args.append(f"{idi.name}")

            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                     "some (but not all) of the target devices mishandle "
                     "struct kernel arguments (hence the workaround is "
                     "disabled)".format(knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary parts as two separate
                # kernel arguments.
                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.real")
                cl_arg_idx += 1

                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.imag")
                cl_arg_idx += 1
            else:
                buf_indices_and_args.append(cl_arg_idx)
                buf_indices_and_args.append(
                    f"_lpy_pack('{arg_char}{arg_char}', "
                    f"{idi.name}.real, {idi.name}.imag)")
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name)
            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'" %
                             idi.dtype)

    for arg_kind, args_and_indices, entry_length in [
        ("_buf", buf_indices_and_args, 2),
        ("_buf_pack", buf_pack_indices_and_args, 3),
    ]:
        assert len(args_and_indices) % entry_length == 0
        if args_and_indices:
            gen(
                S(f"_lpy_knl._set_arg{arg_kind}_multi("
                  f"({', '.join(str(i) for i in args_and_indices)},), "
                  ")"))

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx