def generate_loopy_kernel(slate_expr, tsfc_parameters=None):
    """Compile a Slate expression into a single loopy-backed kernel.

    The expression is lowered to GEM, turned into a loopy program, merged
    with the kernels collected by :class:`LocalLoopyKernelBuilder`, and
    wrapped in an ``op2.Kernel`` with the ``INVCallable`` and
    ``SolveCallable`` callables registered on it.

    :arg slate_expr: the Slate expression to compile. It must provide
        ``ufl_domains()``, ``coefficients()`` and ``rank``.
    :arg tsfc_parameters: mapping of compiler parameters. It must contain
        the key ``"scalar_type"``. The ``None`` default exists only for
        signature compatibility; a mapping is required.
    :returns: a 1-tuple holding one ``SplitKernel`` whose index is
        ``(None,) * slate_expr.rank``.
    :raises NotImplementedError: if the expression lives on more than one
        domain.
    :raises TypeError: if ``tsfc_parameters`` is ``None``.
    """
    cpu_time = time.time()

    if len(slate_expr.ufl_domains()) > 1:
        raise NotImplementedError("Multiple domains not implemented.")

    if tsfc_parameters is None:
        # Fail fast with a readable message. Without this check the lookup
        # of "scalar_type" below raises an opaque
        # "'NoneType' object is not subscriptable" TypeError, and only after
        # the expression has already been lowered to GEM.
        raise TypeError(
            "tsfc_parameters must be a mapping providing 'scalar_type', "
            "not None"
        )

    Citations().register("Gibson2018")

    # Create a loopy builder for the Slate expression,
    # e.g. contains the loopy kernels coming from TSFC
    gem_expr, var2terminal = slate_to_gem(slate_expr)

    scalar_type = tsfc_parameters["scalar_type"]
    slate_loopy, output_arg = gem_to_loopy(gem_expr, var2terminal, scalar_type)

    builder = LocalLoopyKernelBuilder(expression=slate_expr,
                                      tsfc_parameters=tsfc_parameters)

    name = "slate_wrapper"
    loopy_merged = merge_loopy(slate_loopy, output_arg, builder,
                               var2terminal, name)
    loopy_merged = loopy.register_callable(loopy_merged, INVCallable.name,
                                           INVCallable())
    loopy_merged = loopy.register_callable(loopy_merged, SolveCallable.name,
                                           SolveCallable())

    loopykernel = op2.Kernel(loopy_merged, name,
                             include_dirs=BLASLAPACK_INCLUDE.split(),
                             ldargs=BLASLAPACK_LIB.split())

    kinfo = KernelInfo(
        kernel=loopykernel,
        # slate can only do things as contributions to the cell integrals
        integral_type="cell",
        oriented=builder.bag.needs_cell_orientations,
        subdomain_id="otherwise",
        domain_number=0,
        coefficient_map=tuple(range(len(slate_expr.coefficients()))),
        needs_cell_facets=builder.bag.needs_cell_facets,
        pass_layer_arg=builder.bag.needs_mesh_layers,
        needs_cell_sizes=builder.bag.needs_cell_sizes)

    # Slate kernels are never split, so indicate that with None in the
    # index slot.
    idx = (None,) * slate_expr.rank

    logger.info(GREEN % "compile_slate_expression finished in %g seconds.",
                time.time() - cpu_time)
    return (SplitKernel(idx, kinfo), )
def test_register_function_lookup(ctx_factory):
    """Check that a callable registered under ``log2`` is used by a kernel.

    Builds a ten-element kernel computing ``y[i] = log2(x[i])``, registers
    ``Log2Callable`` for the name ``log2``, runs it on random input and
    compares the result against ``numpy.log2`` by relative 2-norm error.

    :arg ctx_factory: zero-argument callable returning the context that is
        handed to ``cl.CommandQueue``.
    """
    ctx = ctx_factory()
    # One queue is all the test needs; creating a second one on the same
    # context only allocated a resource that was immediately discarded.
    queue = cl.CommandQueue(ctx)

    from testlib import Log2Callable

    x = np.random.rand(10)

    prog = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """ y[i] = log2(x[i]) """)
    prog = lp.register_callable(prog, "log2", Log2Callable("log2"))

    evt, (out, ) = prog(queue, x=x)

    # Evaluate the reference once and reuse it for both norms.
    expected = np.log2(x)
    assert np.linalg.norm(expected - out) / np.linalg.norm(expected) < 1e-15
def make_kernel(self, map_instructions, tmp_instructions, args, domains, **kwargs):
    """Build the loopy translation unit for the given instructions.

    Both instruction lists are passed through ``index_fields``. Entries
    that are already ``lp.InstructionBase`` objects are used unchanged;
    every other entry is unpacked as an ``(assignee, expression)`` pair and
    converted with ``self._assignment``.

    :arg map_instructions: the main instructions.
    :arg tmp_instructions: instructions that are emitted first. The first
        pair-form write to each variable or subscripted aggregate gets
        ``temp_var_type=lp.Optional(None)``; later ones get
        ``lp.Optional()``.
    :arg args: explicit kernel arguments; arguments found by
        ``get_field_args`` are merged in via ``append_new_args``.
    :arg domains: forwarded to ``lp.make_kernel``.
    :arg kwargs: forwarded to ``lp.make_kernel``. An ``options`` entry, if
        present, is popped and may have ``check_dep_resolution`` switched
        off on it.
    :returns: the translation unit, with arguments lacking a dtype set to
        ``self.dtype``, unused arguments removed, and ``round`` registered
        as a ``UnaryOpenCLCallable``.
    """
    from pystella.field import index_fields

    def _written_name(lhs):
        # The object a left-hand side writes to: the variable itself, the
        # aggregate of a subscript, or None for anything else.
        if isinstance(lhs, pp.Variable):
            return lhs
        if isinstance(lhs, pp.Subscript):
            return lhs.aggregate
        return None

    tmp_insns = index_fields(tmp_instructions)
    map_insns = index_fields(map_instructions)

    leading = []
    declared = []
    for insn in tmp_insns:
        if isinstance(insn, lp.InstructionBase):
            leading.append(insn)
            continue

        lhs, rhs = insn
        target = _written_name(lhs)
        # only declare temporary variables once
        if target is None or target in declared:
            tvt = lp.Optional()
        else:
            declared.append(target)
            tvt = lp.Optional(None)
        leading.append(self._assignment(lhs, rhs, temp_var_type=tvt))

    trailing = []
    for insn in map_insns:
        if isinstance(insn, lp.InstructionBase):
            trailing.append(insn)
        else:
            lhs, rhs = insn
            # NOTE(review): pair-form map instructions are appended to the
            # leading list, so they precede any InstructionBase map
            # instructions in the final ordering. This may be unintended;
            # confirm before changing.
            leading.append(self._assignment(lhs, rhs))

    options = kwargs.pop("options", lp.Options())

    # ignore lack of supposed dependency for single-instruction kernels
    if len(map_instructions) + len(tmp_instructions) == 1:
        options.check_dep_resolution = False

    from pystella import get_field_args
    inferred_args = get_field_args([map_instructions, tmp_instructions])
    all_args = append_new_args(args, inferred_args)

    t_unit = lp.make_kernel(
        domains,
        leading + trailing,
        all_args + [lp.ValueArg("Nx, Ny, Nz", dtype="int"), ...],
        options=options,
        **kwargs,
    )

    # Give every argument that still has no dtype this object's dtype.
    knl = t_unit.default_entrypoint
    typed_args = [
        arg.copy(dtype=self.dtype)
        if isinstance(arg, lp.KernelArgument) and arg.dtype is None
        else arg
        for arg in knl.args
    ]
    t_unit = t_unit.with_kernel(knl.copy(args=typed_args))

    t_unit = lp.remove_unused_arguments(t_unit)
    return lp.register_callable(t_unit, "round", UnaryOpenCLCallable("round"))
# NOTE(review): this chunk opens inside a list literal of a method whose
# beginning is not visible here; the leading lines are kept exactly as found.
            ecm(vec).expr, 1,
            ecm(result).expr, 1
        ]
        return (
            var(self.name_in_target)(*c_parameters),
            False  # cblas_gemv does not return anything
        )

    def generate_preambles(self, target):
        """Yield the preamble entry that pulls in the CBLAS header.

        Asserts that ``target`` is a ``CTarget`` and yields a single
        ``("99_cblas", "#include <cblas.h>")`` pair.
        """
        assert isinstance(target, CTarget)
        yield ("99_cblas", "#include <cblas.h>")
        return

# }}}


# Demo: build a kernel that calls ``gemv`` on a 10x10 matrix and a
# length-10 vector, register CBLASGEMV under that name, and print the
# generated device code.
n = 10

knl = lp.make_kernel(
    "{:}",
    """ y[:] = gemv(A[:, :], x[:]) """, [
        lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
        lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
        # No dtype is given for the output argument "y".
        lp.GlobalArg("y", shape=(n, )), ...
    ],
    target=CTarget())

knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
print(lp.generate_code_v2(knl).device_code())