import numpy as np
import islpy as isl
import loopy as lp
import pymbolic.primitives as pb

import utils  # local helper module providing replace_strings()


def build_loopy_kernel_A_text():
    """Build the element-matrix kernel from loopy kernel text and return its
    name, the C call snippet, the generated C source and the header."""
    knl_name = "kernel_tensor_A"

    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        """
            A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {
        "A": np.dtype(np.double),
        "B": np.dtype(np.double),
        "c": np.dtype(np.double)
    })
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    #print(knl)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    # C99 uses 'restrict' instead of the GCC extension '__restrict__'
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
def build_loopy_kernel_b_text():
    """Build the element-vector kernel from loopy kernel text and return its
    name, the C call snippet, the generated C source and the header."""
    knl_name = "kernel_tensor_b"

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """
            b[i] = c
        """,
        name=knl_name,
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {
        "b": np.dtype(np.double),
        "c": np.dtype(np.double)
    })
    knl = lp.fix_parameters(knl, n=3)
    #print(knl)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_b(b, Ae / 6.0);"

    return knl_name, knl_call, knl_c, knl_h
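# Hedged usage sketch (not part of the original module): it shows how the
# (name, call, source, header) tuples returned by the two builders above might
# be stitched into one C source string. The function name and the join order
# are illustrative assumptions only.
def assemble_kernel_source():
    _, call_A, c_A, h_A = build_loopy_kernel_A_text()
    _, call_b, c_b, h_b = build_loopy_kernel_b_text()

    # The call snippets (call_A, call_b) are meant to be pasted into the
    # element-assembly loop of the host code; here we only bundle the
    # generated declarations and definitions.
    return "\n".join([h_A, h_b, c_A, c_b])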
def generate_kernel():
    '''Generates and returns source and header for a kernel using loopy'''
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.double)})
    #knl = lp.split_iname(knl, "i", 4)
    #knl = lp.tag_inames(knl, dict(i_inner="unr"))

    return lp.generate_code_v2(knl).all_code(), str(lp.generate_header(knl)[0])
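# Hedged example (assumption, not in the original source): generate_kernel()
# returns plain strings, so persisting them as a .c/.h pair is straightforward.
# The default file names are illustrative.
def write_kernel_files(c_path="kernel.c", h_path="kernel.h"):
    knl_c, knl_h = generate_kernel()
    with open(c_path, "w") as f:
        f.write(knl_c)
    with open(h_path, "w") as f:
        f.write(knl_h)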
def get_header(knl):
    """
    Returns header definition code for a :class:`loopy.LoopKernel`

    Parameters
    ----------
    knl : :class:`loopy.LoopKernel`
        The kernel to generate a header definition for

    Returns
    -------
    Generated device header code

    Notes
    -----
    The kernel's Target and name should be set for proper functioning
    """

    return str(lp.generate_header(knl)[0])
def get_header(knl, codegen_result=None):
    """
    Returns header definition code for a :class:`loopy.LoopKernel`

    Parameters
    ----------
    knl : :class:`loopy.LoopKernel`
        The kernel to generate a header definition for
    codegen_result : :class:`loopy.CodeGenerationResult`
        If supplied, the pre-generated code-gen result for this kernel
        (speeds up header generation)

    Returns
    -------
    Generated device header code

    Notes
    -----
    The kernel's Target and name should be set for proper functioning
    """

    return str(lp.generate_header(knl, codegen_result=codegen_result)[0])
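# Hedged usage sketch: when device code has already been generated, the same
# CodeGenerationResult can be passed to get_header() so code generation is not
# repeated. The kernel built here is a minimal illustrative example, not part
# of the original module.
def _get_header_example():
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        name="double_it",
        target=lp.CTarget(),
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.double)})

    codegen_result = lp.generate_code_v2(knl)
    device_code = codegen_result.device_code()
    header = get_header(knl, codegen_result=codegen_result)
    return device_code, header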
def __generate_loopy(self, knl_name: str, verbose: bool = False, **kwargs):
    """Generate cell kernel for the Laplace operator using Loopy"""

    n_dof, n_dim = self.n_dof, self.n_dim

    # Inputs to the kernel
    arg_names = ["A_T", "A0", "G_T"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '<' restrictions
        isl_domains.append(
            (vs[0].le_set(vs[idx])) & (vs[idx].lt_set(vs[0] + vs[extent])))

    if verbose:
        print("ISL loop domains:")
        print(isl_domains)
        print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    n, m = params["n"], params["m"]
    lp_args = {
        "A_T": lp.GlobalArg("A_T", dtype=np.double, shape=(n, n)),
        "A0": lp.GlobalArg("A0", dtype=np.double, shape=(n, n, m)),
        "G_T": lp.GlobalArg("G_T", dtype=np.double, shape=(m,))
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in param_names]

    # Build the kernel instruction: computation and assignment of the element matrix
    def build_ass():
        # A_T[i,j] = sum(k, A0[i,j,k] * G_T[k])

        # Get variable symbols for all required variables
        i, j, k = inames["i"], inames["j"], inames["k"]
        A_T, A0, G_T = args["A_T"], args["A0"], args["G_T"]

        # The target of the assignment
        target = pb.Subscript(A_T, (i, j))

        # The rhs expression: Frobenius inner product <A0[i,j],G_T>
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(A0, (i, j, k)) * pb.Subscript(G_T, (k,))
        expr = lp.Reduction(reduce_op, k, reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()

    if verbose:
        print("Assignment expression:")
        print(ass)
        print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains,
                         instructions,
                         data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, n=n_dof, m=n_dim**2)
    knl = lp.prioritize_loops(knl, "i,j")

    if verbose:
        print("")
        print(knl)
        print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    if verbose:
        print(knl_c)
        print("")

    # Postprocess kernel code
    knl_c = knl_c.replace("__restrict__", "restrict")
    knl_h = knl_h.replace("__restrict__", "restrict")

    return knl_c, knl_h
def build_loopy_kernel_A_auto():
    """Build the element-matrix kernel programmatically (ISL domains and
    pymbolic expressions) instead of from loopy kernel text."""
    knl_name = "kernel_tensor_A"

    # Inputs to the kernel
    arg_names = ["A", "B", "c"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '<' restrictions
        isl_domains.append(
            (vs[0].le_set(vs[idx])) & (vs[idx].lt_set(vs[0] + vs[extent])))

    print("ISL loop domains:")
    print(isl_domains)
    print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    lp_args = {
        "A": lp.GlobalArg("A", dtype=np.double,
                          shape=(params["n"], params["n"])),
        "B": lp.GlobalArg("B", dtype=np.double,
                          shape=(params["m"], params["n"])),
        "c": lp.ValueArg("c", dtype=np.double)
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in ["n", "m"]]

    # Build the kernel instruction: computation and assignment of the element matrix
    def build_ass():
        """
        A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """
        # The target of the assignment
        target = pb.Subscript(args["A"], (inames["i"], inames["j"]))

        # The rhs expression: a sum reduction over the matrix columns
        # Maybe replace with manual increment?
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(args["B"],
                                   (inames["k"], inames["i"])) * pb.Subscript(
                                       args["B"], (inames["k"], inames["j"]))
        expr = args["c"] * lp.Reduction(reduce_op, inames["k"], reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()
    print("Assignment expression:")
    print(ass)
    print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains,
                         instructions,
                         data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    print(knl)
    print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])
    print(knl_c)
    print("")

    # Postprocess kernel code
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
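# Hedged consistency check (illustrative, not part of the original module):
# the programmatic variant above is intended to mirror the text-based
# build_loopy_kernel_A_text(), so comparing the generated C sources makes that
# assumption explicit.
def _compare_kernel_A_variants():
    _, _, knl_c_text, _ = build_loopy_kernel_A_text()
    _, _, knl_c_auto, _ = build_loopy_kernel_A_auto()
    return knl_c_text == knl_c_auto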