def test_callees_with_gbarriers_are_inlined_with_nested_calls(ctx_factory):
    """Execute a nested call chain whose innermost callee has a global barrier.

    The entrypoint calls ``dummy_ones_and_zeros``, which in turn calls
    ``ones_and_zeros``; the six-entry result must equal
    ``[1, 1, 1, 0, 0, 0]``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    # Innermost callee: zero every entry, barrier, then set the first three.
    inner_callee = lp.make_function(
        "{[i, j]: 0<=i<6 and 0<=j<3}",
        """
        x[i] = 0.0f
        ...gbarrier
        x[j] = 1.0f
        """,
        seq_dependencies=True,
        name="ones_and_zeros")

    # Wrapper callee, so that the barrier-carrying callee sits two calls deep.
    outer_callee = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        [i]: y[i] = ones_and_zeros()
        """,
        name="dummy_ones_and_zeros")

    entrypoint = lp.make_kernel(
        "{ : }",
        """
        y[:] = dummy_ones_and_zeros()
        """,
        [lp.GlobalArg("y", shape=6, dtype=lp.auto)])

    merged = lp.merge([entrypoint, outer_callee, inner_callee])
    _, (result,) = merged(cq)

    reference = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
    assert (reference == result.get()).all()
def test_stride_depending_on_args(ctx_factory):
    """Call two callees on the even and odd rows of ``x``.

    The merged program is checked against itself with
    ``lp.auto_test_vs_ref`` for ``N = 4``.
    """
    cl_ctx = ctx_factory()

    doubler = lp.make_function(
        "{[i, j]: 0<=i, j < n}",
        """
        b[i, j] = 2*a[i, j]
        """,
        [lp.ValueArg("n"), lp.GlobalArg("a"), lp.GlobalArg("b")],
        name="twice")

    tripler = lp.make_function(
        "{[i, j]: 0<=i, j < n}",
        """
        b[i, j] = 3*a[i, j]
        """,
        [
            lp.ValueArg("n"),
            lp.GlobalArg("a", shape=lp.auto),
            lp.GlobalArg("b", shape=lp.auto),
        ],
        name="thrice")

    program = lp.make_kernel(
        "{[i0,i1,i2,i3,i4,i5,i6,i7]: 0<=i0, i1, i2, i3, i4, i5, i6, i7< N}",
        """
        [i0, i1]: y[i0, i1] = twice(N, [i2, i3]: x[2*i2, i3])
        [i4, i5]: z[i4, i5] = thrice(N, [i6, i7]: x[2*i6+1, i7])
        """,
        [
            lp.ValueArg("N", dtype=np.int32),
            lp.GlobalArg("x", shape=lp.auto, dtype=np.float64),
            ...,
        ])

    # Register the callees one at a time, in the same order as before.
    for callee in (doubler, tripler):
        program = lp.merge([program, callee])

    lp.auto_test_vs_ref(program, cl_ctx, program, parameters={"N": 4})
def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
    """Pass reshaped and diagonal sub-array references to three callees.

    Each callee scales its input (by 2, 3 and 5 respectively); the outputs
    are compared against the scaled inputs. When ``inline`` is true the
    callees are inlined before execution.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    x1 = cl.clrandom.rand(cq, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(cq, (6, ), dtype=np.float64)
    x3 = cl.clrandom.rand(cq, (6, 6), dtype=np.float64)

    flat_callee = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 2*abs(a[i])
        """,
        name="callee_fn1")

    matrix_callee = lp.make_function(
        "{[i, j]: 0<=i<3 and 0 <= j < 2}",
        """
        b[i, j] = 3*a[i, j]
        """,
        name="callee_fn2")

    diag_callee = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 5*a[i]
        """,
        name="callee_fn3")

    knl = lp.make_kernel(
        "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
        """
        [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
        [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
        [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
        """)

    for callee in (flat_callee, matrix_callee, diag_callee):
        knl = lp.merge([knl, callee])

    if inline:
        for callee_name in ("callee_fn1", "callee_fn2", "callee_fn3"):
            knl = lp.inline_callable_kernel(knl, callee_name)

    for option in ("write_cl", "return_dict"):
        knl = lp.set_options(knl, option)

    _, results = knl(cq, x1=x1, x2=x2, x3=x3)

    y1 = results["y1"].get()
    y2 = results["y2"].get()
    y3 = results["y3"].get()

    assert (np.linalg.norm(y1 - 2 * x1.get())) < 1e-15
    assert (np.linalg.norm(y2 - 3 * x2.get())) < 1e-15
    # Only the diagonal of y3 is written by the kernel.
    assert (np.linalg.norm(np.diag(y3 - 5 * x3.get()))) < 1e-15
def expression_kernel(expr, args):
    r"""Build a :class:`pyop2.Kernel` for a processed UFL expression.

    :arg expr: the processed UFL expression.
    :arg args: the corresponding arguments; the first one supplies the
        function space and every one contributes its ``arg`` attribute to
        the kernel data.
    :returns: the kernel, or ``None`` when ``expr`` is a ``Zero``.
    """
    # A Zero marks an empty slot (assignment to an indexed LHS): nothing to do.
    if type(expr) is Zero:
        return None

    import islpy as isl

    fs = args[0].function.function_space()

    # Single loop index "d" with 0 <= d < cdim.
    space = isl.make_zero_and_vars(["d"])
    zero, d = space[0], space["d"]
    domain = zero.le_set(d) & d.lt_set(zero + fs.dof_dset.cdim)

    context = Bag()
    context.within_inames = frozenset(["d"])
    context.indices = (p.Variable("d"), )

    insn = loopy_instructions(expr, context)
    data = [arg.arg for arg in args]
    knl = loopy.make_function(
        [domain],
        [insn],
        data,
        name="expression",
        silenced_warnings=["summing_if_branches_ops"])
    return op2.Kernel(knl, "expression")
def test_double_hw_axes_used_in_knl_call(inline):
    """Tagging caller and callee inames with the same ``l.0`` axis must fail.

    Code generation is expected to raise ``LoopyError``, with or without
    inlining the callee first.
    """
    from loopy.diagnostic import LoopyError

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        y[i] = 2*x[i]
        """,
        name="twice")
    callee = lp.tag_inames(callee, {"i": "l.0"})

    caller = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        y[:, i] = twice(x[:, i])
        """,
        [
            lp.GlobalArg("x", shape=(10, 10), dtype=float),
            lp.GlobalArg("y", shape=(10, 10)),
        ],
        name="outer")
    caller = lp.tag_inames(caller, {"i": "l.0"})

    t_unit = lp.merge([caller, callee])

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(t_unit)
def test_check_bounds_with_caller_assumptions(ctx_factory):
    """Bounds checking of a callee depends on assumptions made in the caller.

    Without an assumption on ``N`` code generation must raise
    ``LoopyIndexError``; after assuming ``N <= 20`` the program is run
    through ``lp.auto_test_vs_ref`` with ``N = 15``.
    """
    import islpy as isl
    from loopy.diagnostic import LoopyIndexError

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = i
        """,
        name="arange")

    t_unit = lp.make_kernel(
        "{[i]: 0<=i<20}",
        """
        [i]: Y[i] = arange(N)
        """,
        [lp.GlobalArg("Y", shape=(20, )), lp.ValueArg("N", dtype=np.int32)],
        name="epoint")

    t_unit = lp.merge([t_unit, callee])

    with pytest.raises(LoopyIndexError):
        lp.generate_code_v2(t_unit)

    # Constrain N on the entrypoint so that the callee's accesses are in bounds.
    n_bound = isl.BasicSet("[N] -> { : N <= 20}")
    constrained_entrypoint = lp.assume(t_unit.default_entrypoint, n_bound)
    t_unit = t_unit.with_kernel(constrained_entrypoint)

    lp.auto_test_vs_ref(t_unit, ctx_factory(), parameters={"N": 15})
def test_unused_hw_axes_in_callee(ctx_factory, inline):
    """Caller uses ``l.0`` while the callee uses ``l.1``.

    The merged (and optionally inlined) program is checked against itself
    with ``lp.auto_test_vs_ref``.
    """
    cl_ctx = ctx_factory()

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        y[i] = 2*x[i]
        """,
        name="twice")
    callee = lp.tag_inames(callee, {"i": "l.1"})

    caller = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        y[:, i] = twice(x[:, i])
        """,
        [
            lp.GlobalArg("x", shape=(10, 10), dtype=float),
            lp.GlobalArg("y", shape=(10, 10)),
        ],
        name="outer")
    caller = lp.tag_inames(caller, {"i": "l.0"})

    t_unit = lp.merge([caller, callee])

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "twice")

    lp.auto_test_vs_ref(t_unit, cl_ctx, t_unit)
def test_slices_with_negative_step(ctx_factory, inline):
    """Write a callee result through a slice with step ``-1``.

    The output's second axis is reversed before comparing it with
    ``2*x + 3*y``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    n = 4
    shape = (n, n, n, n, n)
    x = np.random.rand(*shape)
    y = np.random.rand(*shape)

    callee = lp.make_function(
        "{[i, j]:0<=i, j < 4}",
        """
        g[i, j] = 2*e[i, j] + 3*f[i, j]
        """,
        name="linear_combo")

    caller = lp.make_kernel(
        "{[i, k, m]: 0<=i, k, m<4}",
        """
        z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m])
        """,
        kernel_data=[
            lp.GlobalArg(name="x, y, z", dtype=np.float64, shape=shape),
            ...,
        ])

    t_unit = lp.merge([caller, callee])
    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "linear_combo")

    _, (out, ) = t_unit(cq, x=x, y=y)

    expected = 2 * x + 3 * y
    rel_err = (np.linalg.norm(expected - out[:, ::-1, :, :, :])
               / np.linalg.norm(expected))
    assert rel_err < 1e-15
def test_array_inputs_to_callee_kernels(ctx_factory, inline):
    """Pass whole arrays (no sub-array refs) as callee arguments.

    The result is compared with ``2*x + 3*y`` computed by NumPy.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    n = 2**3
    x = np.random.rand(n, n)
    y = np.random.rand(n, n)

    callee = lp.make_function(
        "{[i, j]:0<=i, j < 8}",
        """
        g[i, j] = 2*e[i, j] + 3*f[i, j]
        """,
        name="linear_combo")

    caller = lp.make_kernel(
        "{:}",
        """
        z[:, :] = linear_combo(x, y)
        """,
        kernel_data=[
            lp.GlobalArg(name="x, y, z", dtype=np.float64, shape=(n, n)),
            ...,
        ])

    t_unit = lp.merge([caller, callee])
    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "linear_combo")

    _, (out, ) = t_unit(cq, x=x, y=y)

    expected = 2 * x + 3 * y
    rel_err = np.linalg.norm(expected - out) / np.linalg.norm(expected)
    assert rel_err < 1e-15
def merge_loopy(slate_loopy, output_arg, builder, var2terminal):
    """Merge the TSFC loopy kernels and the Slate loopy kernel into a wrapper.

    :arg slate_loopy: loopy kernel of the Slate expression; it is called
        from the wrapper and registered on the returned program.
    :arg output_arg: argument placed first in the wrapper's argument list.
    :arg builder: kernel builder supplying the coefficients, terminal
        initialisations, TSFC calls, wrapper arguments and loop domains.
    :arg var2terminal: mapping whose values are scanned for ``sl.Tensor``
        terminals.
    :returns: the program made from the wrapper kernel, with every TSFC
        kernel and the Slate kernel registered as callables.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # The loopy tensors for the terminals are generated during initialisation;
    # they are then needed again for generating the TSFC calls.
    inits, tensor2temp = builder.initialise_terminals(var2terminal,
                                                      builder.bag.coefficients)
    terminal_tensors = list(filter(lambda x: isinstance(x, sl.Tensor),
                                   var2terminal.values()))
    calls_and_kernels = tuple(itertools.chain.from_iterable(
        (builder.generate_tsfc_calls(terminal, tensor2temp[terminal])
         for terminal in terminal_tensors)))
    if calls_and_kernels:
        tsfc_calls, tsfc_kernels = zip(*calls_and_kernels)
    else:
        # There may be no TSFC call at all; zip(*()) yields nothing, so
        # unpacking it into two names would raise ValueError.
        tsfc_calls = ()
        tsfc_kernels = ()

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp,
                                                               tsfc_kernels)

    # Munge instructions.
    # NOTE: ``insns`` aliases ``inits``, so the list returned by
    # ``initialise_terminals`` is extended in place.
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains, insns, args, name="slate_wrapper",
                                     seq_dependencies=True, target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    prg = make_program(slate_wrapper)
    for tsfc_loopy in tsfc_kernels:
        prg = register_callable_kernel(prg, tsfc_loopy)
    prg = register_callable_kernel(prg, slate_loopy)
    return prg
def test_inlining_with_indirections(ctx_factory):
    """Inline a callee that writes through an index map passed by the caller.

    With the identity map ``[0, 1, 2]`` the result must be
    ``[1, 1, 1, 0, 0, 0]``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    callee = lp.make_function(
        "{[i, j]: 0<=i<6 and 0<=j<3}",
        """
        x[i] = 0.0f
        ...gbarrier
        x[map[j]] = 1.0f
        """,
        seq_dependencies=True,
        name="ones_and_zeros")

    caller_args = [
        lp.GlobalArg("y", shape=6, dtype=lp.auto),
        lp.GlobalArg("mymap", dtype=np.int32, shape=3),
    ]
    t_unit = lp.make_kernel(
        "{ : }",
        """
        y[:] = ones_and_zeros(mymap[:])
        """,
        caller_args)

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.inline_callable_kernel(t_unit, "ones_and_zeros")

    identity_map = np.arange(3).astype(np.int32)
    _, (out, ) = t_unit(cq, mymap=identity_map)

    reference = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
    assert (reference == out).all()
def test_empty_sub_array_refs(ctx_factory, inline):
    """Call a callee through sub-array refs that sweep a length-1 iname.

    The caller subtracts ``y`` from ``x`` one entry at a time; the result
    must match ``x - y``.
    """
    # See: https://github.com/OP2/PyOP2/pull/559#discussion_r272208618
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    x = np.random.randn(10)
    y = np.random.randn(10)

    callee = lp.make_function(
        "{[d]:0<=d<1}",
        """
        c[d] = a[d] - b[d]
        """,
        name="wence_function")

    caller = lp.make_kernel(
        "{[i,k]: 0<=i<10 and 0<=k<1}",
        """
        [k]:z[i+k] = wence_function([k]:x[i+k], [k]:y[i+k])
        """,
        [lp.GlobalArg("x, y", dtype=np.float64, shape=(10, )), ...])

    t_unit = lp.merge([caller, callee])

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "wence_function")

    _, (out, ) = t_unit(cq, x=x, y=y)
    assert np.allclose(out, x - y)
def merge_loopy(slate_loopy, output_arg, builder, var2terminal, name):
    """Merge the TSFC loopy kernels and the Slate loopy kernel into a wrapper.

    :arg slate_loopy: loopy kernel of the Slate expression; it is called
        from the wrapper and merged into it.
    :arg output_arg: argument placed first in the wrapper's argument list.
    :arg builder: kernel builder supplying the coefficients, terminal
        initialisations, TSFC calls, wrapper arguments and loop domains.
    :arg var2terminal: mapping whose values are filtered for terminals that
        are not yet assembled.
    :arg name: name given to the wrapper kernel.
    :returns: the wrapper merged with every TSFC kernel and the Slate kernel.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # The loopy tensors for the terminals are generated during initialisation
    # and are needed again below when generating the TSFC calls.
    inits, tensor2temp = builder.initialise_terminals(var2terminal,
                                                      builder.bag.coefficients)
    terminal_tensors = [tensor for tensor in var2terminal.values()
                        if tensor.terminal and not tensor.assembled]
    calls_and_kernels = tuple(
        (call, kernel)
        for terminal in terminal_tensors
        for call, kernel in builder.generate_tsfc_calls(terminal,
                                                        tensor2temp[terminal]))

    # tsfc may not give a kernel back
    tsfc_calls, tsfc_kernels = (zip(*calls_and_kernels)
                                if calls_and_kernels else ((), ()))

    # Wrapper arguments: the output first, then whatever the builder derives.
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp)

    # Instructions: initialisations, TSFC calls, then the Slate call.
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    slate_wrapper = lp.make_function(domains, insns, args, name=name,
                                     seq_dependencies=True, target=lp.CTarget())

    from pyop2.codegen.loopycompat import _match_caller_callee_argument_dimension_
    from loopy.kernel.function_interface import CallableKernel

    # Merge the TSFC kernels first and the Slate kernel last; after each merge
    # the argument dimensions of every callable kernel it brought are matched.
    for callee in tuple(tsfc_kernels) + (slate_loopy,):
        slate_wrapper = merge([slate_wrapper, callee])
        for callee_name in callee.callables_table:
            if isinstance(slate_wrapper.callables_table[callee_name], CallableKernel):
                slate_wrapper = _match_caller_callee_argument_dimension_(
                    slate_wrapper, callee_name)
    return slate_wrapper
def test_packing_unpacking(ctx_factory, inline):
    """Pack and unpack call arguments whose shapes differ from the callee's.

    ``callee_fn1`` is one-dimensional but receives a (3, 2) array;
    ``callee_fn2`` is two-dimensional but receives a length-6 array.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    x1 = cl.clrandom.rand(cq, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(cq, (6, ), dtype=np.float64)

    flat_callee = lp.make_function(
        "{[i]: 0<=i<6}",
        """
        b[i] = 2*a[i]
        """,
        name="callee_fn1")

    matrix_callee = lp.make_function(
        "{[i, j]: 0<=i<2 and 0 <= j < 3}",
        """
        b[i, j] = 3*a[i, j]
        """,
        name="callee_fn2")

    knl = lp.make_kernel(
        "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
        """
        [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
        [k]: y2[k] = callee_fn2([k]: x2[k])
        """)

    callee_names = ("callee_fn1", "callee_fn2")

    for callee in (flat_callee, matrix_callee):
        knl = lp.merge([knl, callee])

    for callee_name in callee_names:
        knl = lp.pack_and_unpack_args_for_call(knl, callee_name)

    if inline:
        for callee_name in callee_names:
            knl = lp.inline_callable_kernel(knl, callee_name)

    for option in ("write_cl", "return_dict"):
        knl = lp.set_options(knl, option)

    _, results = knl(cq, x1=x1, x2=x2)

    y1 = results["y1"].get()
    y2 = results["y2"].get()

    assert np.linalg.norm(2 * x1.get() - y1) / np.linalg.norm(
        2 * x1.get()) < 1e-15
    assert np.linalg.norm(3 * x2.get() - y2) / np.linalg.norm(
        3 * x2.get()) < 1e-15
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=None):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names, in any form accepted by
        ``dict``; ``None`` (the default) means no pre-assigned names
    :returns: loopy kernel
    """
    # A ``None`` default avoids sharing one mutable list between calls.
    if index_names is None:
        index_names = ()

    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            data.append(lp.TemporaryVariable(name, shape=temp.shape,
                                             dtype=temp.array.dtype,
                                             initializer=temp.array,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=True))
        else:
            # Free indices of the temporary come first, then its own shape.
            shape = tuple(index.extent for index in ctx.indices[temp]) + temp.shape
            # NOTE(review): dtype is hard-coded to float64 here although a
            # ``scalar_type`` is passed in -- confirm this is intended.
            data.append(lp.TemporaryVariable(name, shape=shape,
                                             dtype=numpy.float64,
                                             initializer=None,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains: one set 0 <= idx < extent per index
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx])) & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name, target=lp.CTarget(),
                           seq_dependencies=True, silenced_warnings=["summing_if_branches_ops"])

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions:
    # earlier instructions get the higher priority.
    n_insns = len(knl.instructions)
    insn_new = [insn.copy(priority=n_insns - i)
                for i, insn in enumerate(knl.instructions)]
    knl = knl.copy(instructions=insn_new)

    return knl
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):
    """Build (or fetch from the cache) the PyOP2 kernel for a par_loop.

    :arg kernel_domains: loopy domain string; every ``<var>.dofs`` in it is
        replaced by the dof count computed for ``<var>``.
    :arg instructions: loopy instructions of the kernel.
    :arg measure: measure of the par_loop; only its ``integral_type()`` is
        consulted, to double the dof count on interior facets.
    :arg args: mapping from variable name to ``(function, access intent)``.
    :arg kwargs: forwarded to :class:`pyop2.Kernel` and part of the cache key.
    :returns: the :class:`pyop2.Kernel`; when ``kernel_cache`` is not
        ``None`` the cached instance for the same key is returned.
    :raises RuntimeError: if a Constant is accessed with anything but READ.
    :raises NotImplementedError: if a mixed function is passed unindexed.
    """
    kargs = []

    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            if measure.integral_type() == 'interior_facet':
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; this substitution is assumed to run for Constants too.
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"

    key = (kernel_domains, tuple(instructions), tuple(map(tuple, kwargs.items())))
    if kernel_cache is not None:
        try:
            return kernel_cache[key]
        except KeyError:
            # Cache miss: fall through and build the kernel outside the
            # handler, so a build failure is not chained to this KeyError.
            pass

    # The trailing Ellipsis is passed to loopy as the last kernel argument.
    kargs.append(...)
    knl = loopy.make_function(kernel_domains, instructions, kargs, seq_dependencies=True,
                              name="par_loop_kernel", silenced_warnings=["summing_if_branches_ops"],
                              target=loopy.CTarget())
    knl = pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
    if kernel_cache is not None:
        # setdefault keeps whichever kernel was stored first for this key.
        return kernel_cache.setdefault(key, knl)
    return knl
def test_register_knl(ctx_factory, inline):
    """Run a three-level call chain (parent -> child -> grandchild).

    The grandchild computes ``2*a + 3*b``; the parent's output is compared
    with ``2*x + 3*y`` computed by NumPy.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    n = 4
    shape = (n, n, n, n, n)
    x = np.random.rand(*shape)
    y = np.random.rand(*shape)

    grandchild = lp.make_function(
        "{[i, j]:0<= i, j< 4}",
        """
        c[i, j] = 2*a[i, j] + 3*b[i, j]
        """,
        name="linear_combo1")

    child = lp.make_function(
        "{[i, j]:0<=i, j < 4}",
        """
        [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j])
        """,
        name="linear_combo2")

    parent = lp.make_kernel(
        "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
        """
        [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m])
        """,
        kernel_data=[
            lp.GlobalArg(name="x, y", dtype=np.float64, shape=shape),
            ...,
        ])

    t_unit = lp.merge([grandchild, child, parent])

    if inline:
        # Inline the outer callee first, then the one it calls.
        for callee_name in ("linear_combo2", "linear_combo1"):
            t_unit = lp.inline_callable_kernel(t_unit, callee_name)

    _, (out, ) = t_unit(cq, x=x, y=y)

    expected = 2 * x + 3 * y
    rel_err = np.linalg.norm(expected - out) / np.linalg.norm(expected)
    assert rel_err < 1e-15
def test_register_knl_with_hw_axes(ctx_factory, inline):
    """Call a callee whose split iname is tagged with hardware axes.

    The callee uses ``g.0``/``l.0`` and the caller ``g.1``/``l.1``; the
    output ``z`` is compared with ``2*x + 3*y``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    n = 4
    shape = (n, n, n, n, n)
    x_dev = cl.clrandom.rand(cq, shape, np.float64)
    y_dev = cl.clrandom.rand(cq, shape, np.float64)

    callee = lp.make_function(
        "{[i, j]:0<=i, j < 4}",
        """
        g[i, j] = 2*e[i, j] + 3*f[i, j]
        """,
        name="linear_combo")
    callee = lp.split_iname(callee, "i", 1, inner_tag="l.0", outer_tag="g.0")

    caller = lp.make_kernel(
        "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
        """
        [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m])
        """,
        name="caller")
    caller = lp.split_iname(caller, "i", 4, inner_tag="l.1", outer_tag="g.1")

    t_unit = lp.merge([caller, callee])
    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "linear_combo")

    _, results = t_unit(cq, x=x_dev, y=y_dev)

    x_host = x_dev.get()
    y_host = y_dev.get()

    assert np.linalg.norm(2 * x_host + 3 * y_host - results["z"].get()
                          ) / np.linalg.norm(2 * x_host + 3 * y_host) < 1e-15
def create_loop_kernel(component_name, domains, instructions, edges, signature):
    """Create the loopy function for one component.

    The domain and assumption strings come from ``create_domain_string``,
    the kernel arguments from ``create_globals``, and instruction
    dependencies are added by ``add_instruction_deps``. For the component
    named ``'main'`` the generated device code is printed.

    :returns: the loopy function with instruction dependencies added.
    """
    domain_string, assumption_string = create_domain_string(domains, edges)
    kernel_args = create_globals(signature, edges)

    knl = lp.make_function(
        domain_string,
        instructions,
        # The trailing "..." string is passed to loopy as a final argument.
        kernel_args + ["..."],
        name=component_name,
        assumptions=assumption_string,
        target=lp.CTarget(),
    )
    knl = add_instruction_deps(knl)

    if component_name == 'main':
        device_code = lp.generate_code_v2(knl).device_code()
        print(device_code)

    return knl
def test_multi_arg_array_call(ctx_factory):
    """Call a callee with two outputs (minimum value and its index).

    The results are compared with ``np.min`` and ``np.argmin``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    import pymbolic.primitives as p

    n = 10
    acc_i = p.Variable("acc_i")
    i = p.Variable("i")
    index = p.Variable("index")
    a_i = p.Subscript(p.Variable("a"), p.Variable("i"))

    # NOTE(review): "214748367" looks like it was meant to be a large
    # sentinel such as INT32_MAX (2147483647) -- confirm.
    insns = [
        lp.Assignment(id="init2", assignee=index, expression=0),
        lp.Assignment(id="init1", assignee=acc_i, expression="214748367"),
        lp.Assignment(id="insn",
                      assignee=index,
                      expression=p.If(p.Expression.eq(acc_i, a_i), i, index),
                      depends_on="update"),
        lp.Assignment(id="update",
                      assignee=acc_i,
                      expression=p.Variable("min")(acc_i, a_i),
                      depends_on="init1,init2"),
    ]
    callee_args = [
        lp.GlobalArg("a"),
        lp.GlobalArg("acc_i, index", is_input=False, is_output=True,
                     shape=lp.auto),
        ...,
    ]
    argmin_kernel = lp.make_function("{[i]: 0 <= i < n}", insns, callee_args,
                                     name="custom_argmin")
    argmin_kernel = lp.fix_parameters(argmin_kernel, n=n)

    knl = lp.make_kernel(
        "{[i]:0<=i<n}",
        """
        []: min_val[()], []: min_index[()] = custom_argmin([i]:b[i])
        """)
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.set_options(knl, return_dict=True)
    knl = lp.merge([knl, argmin_kernel])

    b = np.random.randn(n)
    _, results = knl(cq, b=b)

    tol = 1e-15
    from numpy.linalg import norm
    assert (norm(results["min_val"] - np.min(b)) < tol)
    assert (norm(results["min_index"] - np.argmin(b)) < tol)
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):
    """Build the PyOP2 kernel for a par_loop.

    :arg kernel_domains: loopy domain string; every ``<var>.dofs`` in it is
        replaced by the dof count computed for ``<var>``.
    :arg instructions: loopy instructions of the kernel.
    :arg measure: measure of the par_loop; only its ``integral_type()`` is
        consulted, to double the dof count on interior facets.
    :arg args: mapping from variable name to ``(function, access intent)``.
    :arg kwargs: forwarded to :class:`pyop2.Kernel`.
    :returns: a :class:`pyop2.Kernel` named ``"par_loop_kernel"``.
    :raises RuntimeError: if a Constant is accessed with anything but READ.
    :raises NotImplementedError: if a mixed function is passed unindexed.
    """
    kargs = []

    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                # Pick out the indexed component's space and data.
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    # Real-space functions get a 1-D argument and skip the
                    # facet doubling and domain substitution below.
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            # Interior facets get twice the dof count.
            if measure.integral_type() == 'interior_facet':
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; this substitution is assumed to run for Constants too.
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    # An empty domain string is replaced by an explicit empty loopy domain.
    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"
    # The trailing Ellipsis is passed to loopy as the last kernel argument.
    kargs.append(...)
    knl = loopy.make_function(kernel_domains, instructions, kargs, seq_dependencies=True,
                              name="par_loop_kernel", silenced_warnings=["summing_if_branches_ops"])

    return pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
def test_argument_matching_for_inplace_update(ctx_factory):
    """Call a callee whose single argument is both read and written.

    The output must equal twice the original input.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        x[i] = 2*x[i]
        """,
        name="twice")

    t_unit = lp.make_kernel(
        "{:}",
        """
        x[:] = twice(x[:])
        """,
        [lp.GlobalArg("x", shape=(10, ), dtype=np.float64)])

    t_unit = lp.merge([t_unit, callee])

    x = np.random.randn(10)
    # Hand over a copy so that ``x`` keeps the original values for the check.
    _, (out, ) = t_unit(cq, x=np.copy(x))

    assert np.allclose(2 * x, out)
def test_inlining_with_callee_domain_param(ctx_factory):
    """Inline a callee whose domain parameter ``n`` is set by the call.

    Every entry of the result must equal 2.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = 2.0
        """,
        name="fill2")

    t_unit = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        [i]: res[i] = fill2(10)
        """)

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.inline_callable_kernel(t_unit, "fill2")

    _, (out, ) = t_unit(cq)
    assert (out == 2).all()
def test_non_zero_start_in_subarray_ref(ctx_factory):
    """Sweep a sub-array ref with an iname that starts at ``-5``.

    The output must equal twice the input.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        b[i] = 2*a[i]
        """,
        name="twice")

    t_unit = lp.make_kernel(
        "{[i, j]: -5<=i<5 and 0<=j<10}",
        """
        [i]:y[i+5] = twice([j]: x[j])
        """,
        [lp.GlobalArg("x, y", shape=(10, ), dtype=np.float64)])

    t_unit = lp.merge([t_unit, callee])

    x = np.random.randn(10)
    _, (out, ) = t_unit(cq, x=np.copy(x))

    assert np.allclose(2 * x, out)
def test_valueargs_being_mapped_in_inling(ctx_factory):
    """Inline a callee that uses its value argument ``n`` in an expression.

    The inlined program is checked against itself with
    ``lp.auto_test_vs_ref``.
    """
    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = n*x[i]
        """,
        [lp.ValueArg("n", dtype=np.int32), ...],
        name="doublify",
    )

    t_unit = lp.make_kernel(
        "{[i, j]: 0<=i, j<10}",
        """
        [i]: bar[i] = doublify(10, [j]: foo[j])
        """,
        [lp.GlobalArg("foo", dtype=float, shape=lp.auto), ...],
    )

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.inline_callable_kernel(t_unit, "doublify")

    lp.auto_test_vs_ref(t_unit, ctx_factory(), t_unit)
def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline):
    """Pass a scalar to a callee and receive a scalar back.

    The callee computes ``sin`` of its scalar input; the caller stores the
    result in a zero-dimensional array. The test only checks that the
    program executes, optionally after inlining the callee.
    """
    # Use the fixture-provided context like every other test here, instead
    # of cl.create_some_context(), which bypasses the fixture.
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    call_sin = lp.make_function(
        "{:}",
        """
        y = sin(x)
        """,
        name="call_sin")

    knl = lp.make_kernel(
        "{:}",
        """
        []: real_y[()] = call_sin(real_x)
        """)

    knl = lp.merge([knl, call_sin])
    knl = lp.set_options(knl, "write_cl")
    if inline:
        knl = lp.inline_callable_kernel(knl, "call_sin")

    # NOTE(review): the result is not compared with np.sin(3.0) -- consider
    # adding an assertion once the output container type is confirmed.
    evt, (out, ) = knl(cq, real_x=np.asarray(3.0, dtype=float))
def test_simplify_indices(ctx_factory):
    """``lp.simplify_indices`` must remove floor divisions from the RHS.

    After inlining, at least one instruction contains a floor division;
    after simplification none does. Both versions are then compared with
    ``lp.auto_test_vs_ref``.
    """
    cl_ctx = ctx_factory()

    callee = lp.make_function(
        "{[i, j]: 0<=i<10 and 0<=j<4}",
        """
        y[i,j] = 2*x[i,j]
        """,
        name="zerozerozeroonezeroify")

    t_unit = lp.make_kernel(
        "{:}",
        """
        Y[:,:] = zerozerozeroonezeroify(X[:,:])
        """,
        [lp.GlobalArg("X,Y", shape=(10, 4), dtype=np.float64)])

    class ContainsFloorDiv(lp.symbolic.CombineMapper):
        """Mapper that reports whether an expression has a floor division."""

        def combine(self, values):
            return any(values)

        def map_floor_div(self, expr):
            return True

        def map_variable(self, expr):
            return False

        def map_constant(self, expr):
            return False

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.inline_callable_kernel(t_unit, "zerozerozeroonezeroify")
    simplified = lp.simplify_indices(t_unit)

    has_floordiv = ContainsFloorDiv()

    def assignments(program):
        # Only (multi-)assignments carry an ``expression`` to inspect.
        return (insn for insn in program.default_entrypoint.instructions
                if isinstance(insn, lp.MultiAssignmentBase))

    assert any(has_floordiv(insn.expression) for insn in assignments(t_unit))
    assert all(not has_floordiv(insn.expression)
               for insn in assignments(simplified))

    lp.auto_test_vs_ref(t_unit, cl_ctx, simplified)
def test_callee_with_auto_offset(ctx_factory):
    """Call a callee with ``offset=lp.auto`` on an array with offset 3.

    The first three entries of ``y`` must stay untouched and the remaining
    seven must be doubled.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    doubling_callee = lp.make_function(
        "{[i]: 0<=i<7}",
        """
        y[i] = 2*y[i]
        """,
        [lp.GlobalArg("y", offset=lp.auto)],
        name="dosify")

    t_unit = lp.make_kernel(
        "{[i]: 0<=i<7}",
        """
        [i]: y[i] = dosify([i]: y[i])
        """,
        [lp.GlobalArg("y", offset=3, shape=10)])

    t_unit = lp.merge([t_unit, doubling_callee])

    y = np.arange(10)
    t_unit(cq, y=y)

    np.testing.assert_allclose(y[:3], np.arange(3))
    np.testing.assert_allclose(y[3:], 2 * np.arange(3, 10))
def test_non1_step_slices(ctx_factory, start, inline):
    """Assign callee results through slices with steps ``3`` and ``-3``.

    ``X`` and ``Y`` are first filled with 42 and 1729; the sliced entries
    are then overwritten with squares and compared with the same
    assignments done in NumPy.
    """
    # See https://github.com/inducer/loopy/pull/222#discussion_r645905188
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    # Number of entries selected by the forward and the backward slice.
    n_forward = len(range(start, 40, 3))
    n_backward = len(range(39, start, -3))

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = i**2
        """,
        [lp.ValueArg("n"), ...],
        name="squared_arange")

    t_unit = lp.make_kernel(
        "{[i_init, j_init]: 0<=i_init, j_init<40}",
        f"""
        X[i_init] = 42
        X[{start}:40:3] = squared_arange({n_forward})
        Y[j_init] = 1729
        Y[39:{start}:-3] = squared_arange({n_backward})
        """,
        [lp.GlobalArg("X,Y", shape=40)],
        seq_dependencies=True)

    expected_x = 42 * np.ones(40, dtype=np.int64)
    expected_x[start:40:3] = np.arange(n_forward)**2

    expected_y = 1729 * np.ones(40, dtype=np.int64)
    expected_y[39:start:-3] = np.arange(n_backward)**2

    t_unit = lp.merge([t_unit, callee])
    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")

    _, results = t_unit(cq)

    np.testing.assert_allclose(results["X"].get(), expected_x)
    np.testing.assert_allclose(results["Y"].get(), expected_y)
def test_kc_with_floor_div_in_expr(ctx_factory, inline):
    """Write a callee result through an index with ``//`` and ``%``.

    The program is checked against itself with ``lp.auto_test_vs_ref``,
    optionally after inlining the callee.
    """
    # See https://github.com/inducer/loopy/issues/366
    import loopy as lp

    cl_ctx = ctx_factory()

    in_place_callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        x[i] = 2*x[i]
        """,
        name="callee_with_update")

    t_unit = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i])
        """)

    t_unit = lp.merge([t_unit, in_place_callee])

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "callee_with_update")

    lp.auto_test_vs_ref(t_unit, cl_ctx, t_unit)
def test_callee_with_parameter_and_grid(ctx_factory):
    """Split and ``g.0``-tag an iname inside a callee with a domain parameter.

    The result must equal ``np.arange(10)``.
    """
    cl_ctx = ctx_factory()
    cq = cl.CommandQueue(cl_ctx)

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = i
        """,
        name="arange")

    caller = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        [i]: y[i] = arange(10)
        """)

    t_unit = lp.merge([callee, caller])
    # ``within`` restricts the split to instructions in the "arange" kernel.
    t_unit = lp.split_iname(t_unit, "i", 2,
                            outer_tag="g.0",
                            within="in_kernel:arange")

    _, (out, ) = t_unit(cq)
    np.testing.assert_allclose(out.get(), np.arange(10))
def expression_kernel(expr, args):
    r"""Create the :class:`pyop2.Kernel` evaluating a processed UFL expression.

    :arg expr: the processed UFL expression.
    :arg args: the corresponding arguments; ``args[0]`` provides the
        function space and each entry's ``arg`` becomes kernel data.
    :returns: the kernel, or ``None`` if ``expr`` is a ``Zero``.
    """
    if type(expr) is Zero:
        # Empty slot indicating assignment to indexed LHS: nothing to build.
        return None

    function_space = args[0].function.function_space()

    import islpy as isl

    # Loop domain: 0 <= d < cdim.
    inames = isl.make_zero_and_vars(["d"])
    lower_bound = inames[0].le_set(inames["d"])
    upper_bound = inames["d"].lt_set(inames[0] + function_space.dof_dset.cdim)
    domain = lower_bound & upper_bound

    context = Bag()
    context.within_inames = frozenset(["d"])
    context.indices = (p.Variable("d"),)

    instruction = loopy_instructions(expr, context)
    kernel_data = [arg.arg for arg in args]

    loopy_kernel = loopy.make_function(
        [domain],
        [instruction],
        kernel_data,
        name="expression",
        silenced_warnings=["summing_if_branches_ops"])
    return op2.Kernel(loopy_kernel, "expression")