def test_dependent_loop_bounds_4():
    """Generate code for loops whose bounds are temporaries set in outer loops.

    See https://gitlab.tiker.net/inducer/loopy/issues/23
    """
    import loopy as lp

    domains = [
        "{[a]: 0<=a<10}",
        "{[b]: b_start<=b<b_end}",
        "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
    ]
    instructions = """
        for a
         <> b_start = 1
         <> b_end = 2
         for b
          <> c_start = 1
          <> c_end = 2

          for c
           ... nop
          end

          <>t[idim] = 1
         end
        end
        """

    knl = lp.make_kernel(domains, instructions, "...", seq_dependencies=True)
    knl = lp.fix_parameters(knl, dim=3)

    # Code generation runs inside lp.CacheMode(False).
    with lp.CacheMode(False):
        lp.generate_code_v2(knl)
def test_ispc_streaming_stores():
    """Generate ISPC code for a triad kernel tagged ``!streaming_store``."""
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        target=lp.ISPCTarget(),
        index_dtype=index_dtype,
        name="stream_triad",
    )
    knl = lp.assume(knl, "n>0")

    # Two-level split: chunks of 2**18 on g.0, then pieces of 8 on l.0.
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    # Every named argument gets the same stream dtype.
    arg_names = ["a", "b", "c", "scalar"]
    knl = lp.add_and_infer_dtypes(
        knl, {name: stream_dtype for name in arg_names})
    knl = lp.set_argument_order(knl, arg_names + ["n"])

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    lp.generate_code_v2(scheduled).all_code()
def test_check_bounds_with_caller_assumptions(ctx_factory):
    """Bounds checking of a callee must honour assumptions on the caller.

    Without an assumption on ``N`` code generation is expected to raise
    ``LoopyIndexError``; after assuming ``N <= 20`` on the entrypoint the
    program is run through ``lp.auto_test_vs_ref``.
    """
    import islpy as isl
    from loopy.diagnostic import LoopyIndexError

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """ y[i] = i """,
        name="arange")

    caller_args = [
        lp.GlobalArg("Y", shape=(20, )),
        lp.ValueArg("N", dtype=np.int32),
    ]
    prog = lp.make_kernel(
        "{[i]: 0<=i<20}",
        """ [i]: Y[i] = arange(N) """,
        caller_args,
        name="epoint")
    prog = lp.merge([prog, callee])

    with pytest.raises(LoopyIndexError):
        lp.generate_code_v2(prog)

    bounded_entrypoint = lp.assume(
        prog.default_entrypoint,
        isl.BasicSet("[N] -> { : N <= 20}"))
    prog = prog.with_kernel(bounded_entrypoint)

    lp.auto_test_vs_ref(prog, ctx_factory(), parameters={"N": 15})
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a domain dependency carried only by the ``row_len`` temporary.

    The point of this test is that it shows a dependency between domains
    that is exclusively mediated by the ``row_len`` temporary.  It also makes
    sure that ``row_len`` gets read before any conditionals use it.
    """
    float_dtype = np.dtype(np.float32)
    cl_ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", float_dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    knl = lp.make_kernel(
        domains, instructions, kernel_args,
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
        name="loopy_kernel")

    # Domain 1 (jj) must be nested inside domain 0 (i).
    assert knl["loopy_kernel"].parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    print(lp.generate_code_v2(knl).device_code())

    # Additionally mapping jj onto g.1/l.1 is expected to raise.
    knl_bad = lp.split_iname(
        knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")
    with pytest.raises(RuntimeError):
        list(lp.generate_code_v2(knl_bad))
def test_dependent_loop_bounds_4():
    """Generate code for loops whose bounds are temporaries set in outer loops.

    See https://gitlab.tiker.net/inducer/loopy/issues/23
    """
    import loopy as lp

    domains = [
        "{[a]: 0<=a<10}",
        "{[b]: b_start<=b<b_end}",
        "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
    ]
    instructions = """
        for a
         <> b_start = 1
         <> b_end = 2
         for b
          <> c_start = 1
          <> c_end = 2

          for c
           ... nop
          end

          <>t[idim] = 1
         end
        end
        """

    knl = lp.make_kernel(domains, instructions, "...", seq_dependencies=True)
    knl = lp.fix_parameters(knl, dim=3)

    # Code generation runs inside lp.CacheMode(False).
    with lp.CacheMode(False):
        lp.generate_code_v2(knl)
def test_double_hw_axes_used_in_knl_call(inline):
    """Tagging ``i`` as ``l.0`` in both caller and callee must raise.

    :param inline: when true, the callee is inlined into the caller before
        code generation is attempted.
    """
    from loopy.diagnostic import LoopyError

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """ y[i] = 2*x[i] """,
        name="twice")

    caller = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """ y[:, i] = twice(x[:, i]) """,
        [
            lp.GlobalArg("x", shape=(10, 10), dtype=float),
            lp.GlobalArg("y", shape=(10, 10)),
        ],
        name="outer")

    # The same hardware axis is claimed on both sides of the call.
    callee = lp.tag_inames(callee, {"i": "l.0"})
    caller = lp.tag_inames(caller, {"i": "l.0"})

    prog = lp.merge([caller, callee])
    if inline:
        prog = lp.inline_callable_kernel(prog, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(prog)
def test_idempotency(form):
    """Compiling the same form twice must produce identical generated code."""
    first = compile_form(form)[0]
    second = compile_form(form)[0]
    assert first.ast.gencode() == second.ast.gencode()

    # Test loopy backend
    import loopy
    first = compile_form(form, coffee=False)[0]
    second = compile_form(form, coffee=False)[0]
    first_code = loopy.generate_code_v2(first.ast).device_code()
    second_code = loopy.generate_code_v2(second.ast).device_code()
    assert first_code == second_code
def test_lpy_wide_array_splitter(opts):
    """Split the arrays of a kernel with pre-split ``j_outer``/``j_inner``.

    After ``array_splitter.split_loopy_arrays`` the argument shapes must match
    ``split_shape`` and the assignee indices must match the order-dependent
    index tuple built below.
    """
    from pymbolic.primitives import Subscript, Variable

    # create array split
    asplit = array_splitter(opts)

    # create a test kernel
    arg1 = lp.GlobalArg('a1', shape=(10, 10), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(
            int(np.ceil(10 / VECTOR_WIDTH))),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    k = lp.make_kernel(
        domains,
        """
        for i, j_outer, j_inner
            a1[j_outer, i] = 1 {id=a1}
            a2[j_outer, i] = 1 {id=a2}
        end
        """,
        [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    # keep the unsplit arguments around for the shape comparison
    a1_hold = k.arg_dict['a1'].copy()
    a2_hold = k.arg_dict['a2'].copy()

    k = asplit.split_loopy_arrays(k)
    k = lp.tag_inames(k, {'j_inner': 'vec' if opts.is_simd else 'l.0'})

    # ensure there's no loopy errors
    lp.generate_code_v2(k).device_code()

    def expected_index():
        if opts.order == 'C':
            return (Variable('j_outer'), Variable('i'), Variable('j_inner'))
        return (Variable('j_inner'), Variable('j_outer'), Variable('i'))

    # a1 first, then the evenly sized a2
    for name, held in (('a1', a1_hold), ('a2', a2_hold)):
        # check dim
        assert k.arg_dict[name].shape == asplit.split_shape(held)[0]
        # and indexing
        assignee = next(
            insn.assignee for insn in k.instructions if insn.id == name)
        assert isinstance(assignee, Subscript)
        assert assignee.index == expected_index()
def test_lpy_deep_array_splitter(opts):
    """Split the arrays of a kernel whose ``i`` iname is split beforehand.

    After ``array_splitter.split_loopy_arrays`` the argument shapes must match
    ``split_shape`` and the assignee indices must match the order-dependent
    index tuple built below.
    """
    from pymbolic.primitives import Subscript, Variable

    # create array split
    asplit = array_splitter(opts)

    # create a test kernel
    size = VECTOR_WIDTH * 3
    loop_bound = VECTOR_WIDTH * 2
    arg1 = lp.GlobalArg('a1', shape=(size, size), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    k = lp.make_kernel(
        '{{[i]: 0 <= i < {}}}'.format(loop_bound),
        """
        a1[0, i] = 1 {id=a1}
        a2[0, i] = 1 {id=a2}
        """,
        [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    inner_tag = 'vec' if opts.is_simd else 'l.0'
    k = lp.split_iname(k, 'i', VECTOR_WIDTH, inner_tag=inner_tag)

    # keep the unsplit arguments around for the shape comparison
    a1_hold = k.arg_dict['a1'].copy()
    a2_hold = k.arg_dict['a2'].copy()
    k = asplit.split_loopy_arrays(k)

    # ensure there's no loopy errors
    lp.generate_code_v2(k).device_code()

    def expected_index():
        if opts.order == 'C':
            return (0, Variable('i_outer'), Variable('i_inner'))
        return (Variable('i_inner'), 0, Variable('i_outer'))

    # a1 first, then the evenly sized a2
    for name, held in (('a1', a1_hold), ('a2', a2_hold)):
        # check dim
        assert k.arg_dict[name].shape == asplit.split_shape(held)[0]
        # and indexing
        assignee = next(
            insn.assignee for insn in k.instructions if insn.id == name)
        assert isinstance(assignee, Subscript)
        assert assignee.index == expected_index()
def test_global_temporary(ctx_factory):
    """A temporary placed in global scope must yield two device programs."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    knl = lp.add_and_infer_dtypes(knl, dtypes)
    knl = lp.set_temporary_scope(knl, "c", "global")

    # The unsplit kernel serves as the reference for the comparison below.
    ref_knl = knl
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    assert len(cgr.device_programs) == 2

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
def test_global_temporary(ctx_factory):
    """A temporary placed in global scope must yield two device programs."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    knl = lp.add_and_infer_dtypes(knl, dtypes)
    knl = lp.set_temporary_scope(knl, "c", "global")

    # The unsplit kernel serves as the reference for the comparison below.
    ref_knl = knl
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    assert len(cgr.device_programs) == 2

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
def test_c_execution_with_global_temporaries():
    """Run a C kernel that reads a read-only, initialized global temporary.

    The declaration ``int b[n]`` must not show up in the host code of a bare
    ``ExecutableCTarget``, and executing the kernel must copy the initializer
    values into ``a``.
    """
    from loopy.target.c import ExecutableCTarget

    n = 10
    initial_values = np.arange(n, dtype=np.int32)

    kernel_data = [
        lp.GlobalArg("a", shape=(n, ), dtype=np.int32),
        lp.TemporaryVariable(
            "b",
            shape=(n, ),
            initializer=initial_values,
            dtype=np.int32,
            read_only=True,
            address_space=lp.AddressSpace.GLOBAL),
    ]
    knl = lp.make_kernel(
        "{[i]: 0 <= i < n}",
        """ a[i] = b[i] """,
        kernel_data,
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ("int b[%d]" % n) not in host_code

    result = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(result, np.arange(10))
def test_missing_compilers():
    """Check C-target behaviour with missing, default and made-up compilers."""
    from loopy.target.c import ExecutableCTarget, CTarget
    from loopy.target.c.c_execution import CCompiler
    from codepy.toolchain import GCCToolchain

    def build_and_eval(evalfunc, target, **targetargs):
        # Build a fixed-size copy kernel for *target* and hand it to *evalfunc*.
        n = 10
        kernel_args = [
            lp.GlobalArg("a", shape=(n, ), dtype=np.int32),
            lp.GlobalArg("b", shape=(n, ), dtype=np.int32),
        ]
        knl = lp.make_kernel(
            "{[i]: 0 <= i < n}",
            """ a[i] = b[i] """,
            kernel_args,
            target=target(**targetargs))
        knl = lp.fix_parameters(knl, n=n)
        return evalfunc(knl)

    # Plain code generation works on the non-executable C target.
    assert build_and_eval(
        lambda knl: lp.generate_code_v2(knl).device_code(), CTarget)

    from pytools.prefork import ExecError

    def copies_correctly(knl):
        outputs = knl(a=np.zeros(10, dtype=np.int32),
                      b=np.arange(10, dtype=np.int32))[1]
        return np.allclose(outputs, np.arange(10))

    import os
    saved_path = os.environ["PATH"]
    compiler = None
    try:
        # test with path wiped out such that we can't find gcc
        with pytest.raises(ExecError):
            os.environ["PATH"] = ""
            compiler = CCompiler()
            build_and_eval(
                copies_correctly, ExecutableCTarget, compiler=compiler)
    finally:
        # make sure we restore the path
        os.environ["PATH"] = saved_path

    # and, with the path restored we should now be able to properly execute
    # with the default (non-guessed) toolchain!
    build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)

    # and test that we will fail if we remove a required attribute
    del compiler.toolchain.undefines
    with pytest.raises(AttributeError):
        build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)

    # next test that some made up compiler can be specified
    compiler = CCompiler(cc="foo")
    assert isinstance(compiler.toolchain, GCCToolchain)
    assert compiler.toolchain.cc == "foo"

    # and that said made up compiler errors out
    with pytest.raises(ExecError):
        build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)
def make_kernels(self, Vf, Vc):
    """
    Build interpolation and restriction kernels between arbitrary elements.

    Sets ``self.prolong_kernel`` and ``self.restrict_kernel``.  This is
    temporary while we wait for structure-preserving TSFC kernels.

    :param Vf: space passed first to ``prolongation_transfer_kernel_action``;
        its dimension gives ``dimf``.
    :param Vc: space the test function is taken from; its dimension gives
        ``dimc``.
    """
    self.prolong_kernel = self.prolongation_transfer_kernel_action(
        Vf, self.uc)

    # The way we transpose the prolongation kernel is suboptimal.
    # A local matrix is generated each time the kernel is executed.
    coarse_test = firedrake.TestFunction(Vc)
    transfer_matrix = self.prolongation_transfer_kernel_action(
        Vf, coarse_test)

    # Make the generated element kernel file-local so that it can be
    # embedded in front of the restriction function.
    generated = loopy.generate_code_v2(transfer_matrix.code).device_code()
    element_kernel = generated.replace(
        "void expression_kernel", "static void expression_kernel")

    dimc = Vc.finat_element.space_dimension() * Vc.value_size
    dimf = Vf.finat_element.space_dimension() * Vf.value_size

    restrict_code = f"""
    {element_kernel}

    void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w)
    {{
        {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}};
        expression_kernel(Afc);
        for ({IntType_c} i = 0; i < {dimf}; i++)
            for ({IntType_c} j = 0; j < {dimc}; j++)
                Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i];
    }}
    """
    self.restrict_kernel = op2.Kernel(restrict_code, "restriction")
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Print device code for a sum over ``isrc_box`` of a dependent inner sum.

    The inner reduction runs over ``isrc``, whose bound ``npart`` is a
    temporary read from ``nparticles_per_box`` inside the ``isrc_box`` loop.
    """
    int_dtype = np.dtype(np.int32)
    cl_ctx = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]
    instructions = """
        for itgt
            for isrc_box
                <> npart = nparticles_per_box[isrc_box]
                <> boxsum = sum(isrc, isrc+isrc_box+itgt)
            end
            a[itgt] = sum(isrc_box, boxsum)
        end
        """
    kernel_args = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n", )),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes", )),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    knl = lp.make_kernel(
        domains, instructions, kernel_args,
        assumptions="ntgts>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]))

    print(lp.generate_code_v2(knl).device_code())
def test_c_execution_with_global_temporaries():
    """Run a C kernel that reads a read-only, initialized global temporary.

    The declaration ``int b[n]`` must not show up in the host code of a bare
    ``ExecutableCTarget``, and executing the kernel must copy the initializer
    values into ``a``.
    """
    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes

    n = 10
    initial_values = np.arange(n, dtype=np.int32)

    kernel_data = [
        lp.GlobalArg('a', shape=(n, ), dtype=np.int32),
        lp.TemporaryVariable(
            'b',
            shape=(n, ),
            initializer=initial_values,
            dtype=np.int32,
            read_only=True,
            scope=scopes.GLOBAL),
    ]
    knl = lp.make_kernel(
        '{[i]: 0 <= i < n}',
        """ a[i] = b[i] """,
        kernel_data,
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ('int b[%d]' % n) not in host_code

    result = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(result, np.arange(10))
def test_automatic_scan_detection():
    """The generated device code for this sum must not mention "scan"."""
    domains = [
        "[n] -> {[i]: 0<=i<n}",
        "{[j]: 0<=j<=2*i}",
    ]
    knl = lp.make_kernel(domains, """ a[i] = sum(j, j**2) """)

    device_code = lp.generate_code_v2(knl).device_code()
    assert "scan" not in device_code
def generate_single_cell_wrapper(iterset, args, forward_args=(),
                                 kernel_name=None, wrapper_name=None):
    """Generate the wrapper for a single cell.

    The wrapper has no iteration loop, but cellwise data is extracted.  The
    cell is expected as an argument to the wrapper.  For extruded, the
    numbering of the cells is columnwise continuous, bottom to top.

    :param iterset: The iteration set
    :param args: :class:`Arg`s
    :param forward_args: To forward unprocessed arguments to the kernel via
        the wrapper, give an iterable of strings describing their C types.
    :param kernel_name: Kernel function name
    :param wrapper_name: Wrapper function name

    :return: string containing the C code for the single-cell wrapper
    """
    from pyop2.codegen.builder import WrapperBuilder
    from pyop2.codegen.rep2loopy import generate
    from loopy.types import OpaqueType

    # Each forwarded C type string is wrapped in an OpaqueType.
    builder = WrapperBuilder(
        iterset=iterset,
        single_cell=True,
        forward_arg_types=[OpaqueType(c_type) for c_type in forward_args])
    for argument in args:
        builder.add_argument(argument)

    # Only the kernel name is supplied; the kernel source is left empty.
    builder.set_kernel(Kernel("", kernel_name))

    wrapper_ast = generate(builder, wrapper_name)
    return loopy.generate_code_v2(wrapper_ast).device_code()
def test_kernel_splitting_with_loop(ctx_factory):
    """Schedule a two-instruction kernel and expect two device programs."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)

    dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    # The unsplit kernel serves as the reference for the comparison below.
    ref_knl = knl
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(preprocess_kernel(knl))

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)
    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
def build_loopy_kernel_A_text():
    """Build the ``kernel_tensor_A`` loopy kernel and its C source pieces.

    :return: tuple ``(knl_name, knl_call, knl_c, knl_h)``: the kernel name,
        a C statement calling it, the generated device code and the generated
        header, the latter two with ``__restrict__`` replaced by ``restrict``.
    """
    knl_name = "kernel_tensor_A"

    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        """ A[i,j] = c*sum(k, B[k,i]*B[k,j]) """,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    double = np.dtype(np.double)
    knl = lp.add_and_infer_dtypes(knl, {"A": double, "B": double, "c": double})
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")

    device_src = lp.generate_code_v2(knl).device_code()
    header_src = str(lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(device_src, replacements)
    knl_h = utils.replace_strings(header_src, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
def build_loopy_kernel_b_text():
    """Build the ``kernel_tensor_b`` loopy kernel and its C source pieces.

    :return: tuple ``(knl_name, knl_call, knl_c, knl_h)``: the kernel name,
        a C statement calling it, the generated device code and the generated
        header, the latter two with ``__restrict__`` replaced by ``restrict``.
    """
    knl_name = "kernel_tensor_b"

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        """ b[i] = c """,
        name=knl_name,
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    double = np.dtype(np.double)
    knl = lp.add_and_infer_dtypes(knl, {"b": double, "c": double})
    knl = lp.fix_parameters(knl, n=3)

    device_src = lp.generate_code_v2(knl).device_code()
    header_src = str(lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(device_src, replacements)
    knl_h = utils.replace_strings(header_src, replacements)

    knl_call = "kernel_tensor_b(b, Ae / 6.0);"

    return knl_name, knl_call, knl_c, knl_h
def test_generate_c_snippet():
    """Build a kernel from pymbolic expressions and print its generated code."""
    from pymbolic import var
    from functools import partial

    I, f, df, q_v, eN, k, u = (  # noqa
        var(name) for name in ("I", "f", "df", "q_v", "eN", "k", "u"))

    # "sum" reduction with allow_simultaneous=True pre-bound.
    simul_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    instructions = [
        lp.Assignment(f[I], simul_sum(k, q_v[k, I] * u)),
        lp.Assignment(df[I], simul_sum(k, q_v[k, I])),
    ]
    kernel_args = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]
    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_args,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    print(lp.generate_code_v2(knl))
def test_solve_callable(self, zero_vec, solve_mat, solve_vec):
    """Run a loopy kernel calling ``solve`` through an op2 par_loop.

    The kernel computes ``x[:] = solve(A[:,:], b[:])`` for a 2x2 system and
    the result is compared against ``np.linalg.solve``.

    :param zero_vec: op2 data receiving the solution (passed as ``x``).
    :param solve_mat: op2 data holding the matrix (passed as ``A``).
    :param solve_vec: op2 data holding the right-hand side (passed as ``b``).
    """
    loopy.set_caching_enabled(False)

    k = loopy.make_kernel(
        ["{[i,j] : 0 <= i,j < 2}"],
        """ x[:] = solve(A[:,:], b[:]) """,
        [
            loopy.GlobalArg('x', dtype=np.float64, shape=(2, )),
            loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2)),
            loopy.GlobalArg('b', dtype=np.float64, shape=(2, )),
        ],
        target=loopy.CTarget(),
        name="callable_kernel2",
        lang_version=(2018, 2))

    k = loopy.register_function_id_to_in_knl_callable_mapper(
        k, solve_fn_lookup)
    code = loopy.generate_code_v2(k).device_code()

    # str.replace returns a new string, so the result has to be re-bound;
    # previously it was discarded and the kernel was never made static.
    # The guard avoids emitting "static static" if the qualifier is
    # already present.
    if 'static void callable_kernel2' not in code:
        code = code.replace(
            'void callable_kernel2', 'static void callable_kernel2')

    loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])
    args = [zero_vec(op2.READ), solve_mat(op2.READ), solve_vec(op2.WRITE)]

    op2.par_loop(loopykernel, solve_mat.dataset.set, *args)

    expected = np.linalg.solve(solve_mat.data, solve_vec.data)
    assert np.allclose(expected, zero_vec.data)
def test_inverse_callable(self, zero_mat, inv_mat):
    """Run a loopy kernel calling ``inv`` through an op2 par_loop.

    The kernel computes ``B[:,:] = inv(A[:,:])`` for a 2x2 matrix and the
    result is compared against ``np.linalg.inv``.

    :param zero_mat: op2 data receiving the inverse (passed as ``B``).
    :param inv_mat: op2 data holding the matrix to invert (passed as ``A``).
    """
    loopy.set_caching_enabled(False)

    k = loopy.make_kernel(
        ["{[i,j] : 0 <= i,j < 2}"],
        """ B[:,:] = inv(A[:,:]) """,
        [
            loopy.GlobalArg('B', dtype=np.float64, shape=(2, 2)),
            loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2)),
        ],
        target=loopy.CTarget(),
        name="callable_kernel",
        lang_version=(2018, 2))

    k = loopy.register_function_id_to_in_knl_callable_mapper(
        k, inv_fn_lookup)
    code = loopy.generate_code_v2(k).device_code()

    # str.replace returns a new string, so the result has to be re-bound;
    # previously it was discarded and the kernel was never made static.
    # The guard avoids emitting "static static" if the qualifier is
    # already present.
    if 'static void callable_kernel' not in code:
        code = code.replace(
            'void callable_kernel', 'static void callable_kernel')

    loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])

    op2.par_loop(loopykernel, zero_mat.dataset.set,
                 zero_mat(op2.WRITE), inv_mat(op2.READ))

    expected = np.linalg.inv(inv_mat.data)
    assert np.allclose(expected, zero_mat.data)
def loopy_example():
    """Print the C device code of a kernel computing ``out[i] = 2*a[i]``."""
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    # Only the dtype of "a" is given; the rest is left to inference.
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.dtype(np.float32)})

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_numba_target():
    """Print device code for a pairwise-distance kernel on ``NumbaTarget``."""
    domain = "{[i,j,k]: 0<=i,j<M and 0<=k<N}"
    instruction = "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))"

    knl = lp.make_kernel(domain, instruction, target=lp.NumbaTarget())
    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
def make_domain(prog):
    """Print debugging information about the implemented domains of *prog*.

    For every entry of the code generation result's ``implemented_domains``
    the first element is printed via ``to_str()``, followed by its ``dir()``
    listing.
    """
    codegen_result = lp.generate_code_v2(prog)
    for _key, domains in codegen_result.implemented_domains.items():
        first_domain = domains[0]
        print(first_domain.to_str())
        print(dir(first_domain))
def write_ptx(ctx, knl, filename=None):
    """Build *knl* as an OpenCL program and write its first binary to a file.

    :param ctx: context handed to ``cl.Program``.
    :param knl: loopy kernel; its device code is compiled using
        ``knl.options.cl_build_options``.
    :param filename: output path.  When empty or ``None`` it defaults to
        ``"ptx_" + knl.name + ".ptx"``.
    """
    cl_program = cl.Program(
        ctx, lp.generate_code_v2(knl).device_code()
    ).build(options=knl.options.cl_build_options)

    # First entry of the program binaries; presumably PTX text on NVIDIA
    # platforms -- TODO confirm for other vendors.
    ptx_src = cl_program.binaries[0]

    if not filename:
        filename = "ptx_" + knl.name + ".ptx"

    # The context manager guarantees the handle is closed (and the data
    # flushed) even if decoding or writing raises; the file used to be left
    # open.
    with open(filename, 'w') as ptx_src_file:
        # Undecodable bytes are dropped rather than raising.
        ptx_src_file.write(ptx_src.decode('utf-8', 'ignore'))
def test_numba_target():
    """Print device code for a pairwise-distance kernel on ``NumbaTarget``."""
    domain = "{[i,j,k]: 0<=i,j<M and 0<=k<N}"
    instruction = "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))"

    knl = lp.make_kernel(domain, instruction, target=lp.NumbaTarget())
    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
def test_missing_compilers():
    """Check C-target behaviour with missing, default and made-up compilers."""
    from loopy.target.c import ExecutableCTarget, CTarget
    from loopy.target.c.c_execution import CCompiler
    from codepy.toolchain import GCCToolchain

    def build_and_eval(evalfunc, target, **targetargs):
        # Build a fixed-size copy kernel for *target* and hand it to *evalfunc*.
        n = 10
        kernel_args = [
            lp.GlobalArg('a', shape=(n,), dtype=np.int32),
            lp.GlobalArg('b', shape=(n,), dtype=np.int32),
        ]
        knl = lp.make_kernel(
            '{[i]: 0 <= i < n}',
            """ a[i] = b[i] """,
            kernel_args,
            target=target(**targetargs))
        knl = lp.fix_parameters(knl, n=n)
        return evalfunc(knl)

    # Plain code generation works on the non-executable C target.
    assert build_and_eval(
        lambda knl: lp.generate_code_v2(knl).device_code(), CTarget)

    from pytools.prefork import ExecError

    def copies_correctly(knl):
        outputs = knl(a=np.zeros(10, dtype=np.int32),
                      b=np.arange(10, dtype=np.int32))[1]
        return np.allclose(outputs, np.arange(10))

    import os
    saved_path = os.environ["PATH"]
    compiler = None
    try:
        # test with path wiped out such that we can't find gcc
        with pytest.raises(ExecError):
            os.environ["PATH"] = ''
            compiler = CCompiler()
            build_and_eval(
                copies_correctly, ExecutableCTarget, compiler=compiler)
    finally:
        # make sure we restore the path
        os.environ["PATH"] = saved_path

    # and, with the path restored we should now be able to properly execute
    # with the default (non-guessed) toolchain!
    build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)

    # and test that we will fail if we remove a required attribute
    del compiler.toolchain.undefines
    with pytest.raises(AttributeError):
        build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)

    # next test that some made up compiler can be specified
    compiler = CCompiler(cc='foo')
    assert isinstance(compiler.toolchain, GCCToolchain)
    assert compiler.toolchain.cc == 'foo'

    # and that said made up compiler errors out
    with pytest.raises(ExecError):
        build_and_eval(copies_correctly, ExecutableCTarget, compiler=compiler)
def test_triangle_domain(ctx_factory):
    """Print the kernel and device code for a domain restricted to i <= j."""
    cl_ctx = ctx_factory()
    target = lp.PyOpenCLTarget(cl_ctx.devices[0])

    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1",
        target=target)

    print(knl)
    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
def test_math_function(target, tp):
    """Check which C math function ``min``/``max`` are lowered to.

    Test that correct maths functions are generated for the C and OpenCL
    backends for different data types: ``fmin``/``fmax`` must always appear,
    while the ``f``-suffixed variants are expected only for ``"f32"`` on
    ``CTarget``.

    :param target: target class, instantiated without arguments.
    :param tp: ``"f32"`` or ``"f64"``.
    """
    data_type = {"f32": np.float32, "f64": np.float64}[tp]

    import pymbolic.primitives as p

    i = p.Variable("i")
    xi = p.Subscript(p.Variable("x"), i)
    yi = p.Subscript(p.Variable("y"), i)
    zi = p.Subscript(p.Variable("z"), i)

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [
        lp.GlobalArg(name, data_type, shape=(n, ))
        for name in ("x", "y", "z")
    ]

    cases = (
        ("min", "fmin", "fminf"),
        ("max", "fmax", "fmaxf"),
    )
    for func, generic_name, float_name in cases:
        inst = [lp.Assignment(xi, p.Variable(func)(yi, zi))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        code = lp.generate_code_v2(knl).device_code()

        assert generic_name in code
        if tp == "f32" and target == CTarget:
            assert float_name in code
        else:
            assert float_name not in code
def test_lpy_iname_presplit(opts):
    """
    Tests that accesses through pre-split inames into non-split loopy arrays
    are correctly handled
    """
    from pymbolic.primitives import Subscript, Variable

    # create array split
    asplit = array_splitter(opts)

    # create a test kernel
    arg1 = lp.GlobalArg('a1', shape=(20, 10), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(
            int(np.ceil(10 / VECTOR_WIDTH))),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    k = lp.make_kernel(
        domains,
        """
        a1[j_outer, i] = 1 {id=a1}
        a2[j_outer, i] = 1 {id=a2}
        """,
        [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    # Both arrays are explicitly excluded from splitting.
    k = asplit.split_loopy_arrays(k, dont_split=['a1', 'a2'])

    # ensure there's no loopy errors
    lp.generate_code_v2(k).device_code()

    def expected_index():
        flat_j = Variable('j_outer') * VECTOR_WIDTH + Variable('j_inner')
        return (flat_j, Variable('i'))

    # check indexing: a1 first, then the evenly sized a2
    for insn_id in ('a1', 'a2'):
        assignee = next(
            insn.assignee for insn in k.instructions if insn.id == insn_id)
        assert isinstance(assignee, Subscript)
        assert assignee.index == expected_index()
def test_math_function(target, tp):
    """Check which C math function ``min``/``max`` are lowered to.

    Test that correct maths functions are generated for the C and OpenCL
    backends for different data types: ``fmin``/``fmax`` must always appear,
    while the ``f``-suffixed variants are expected only for ``"f32"`` on
    ``CTarget``.

    :param target: target class, instantiated without arguments.
    :param tp: ``"f32"`` or ``"f64"``.
    """
    data_type = {"f32": np.float32, "f64": np.float64}[tp]

    import pymbolic.primitives as p

    i = p.Variable("i")
    xi = p.Subscript(p.Variable("x"), i)
    yi = p.Subscript(p.Variable("y"), i)
    zi = p.Subscript(p.Variable("z"), i)

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [
        lp.GlobalArg(name, data_type, shape=(n,))
        for name in ("x", "y", "z")
    ]

    cases = (
        ("min", "fmin", "fminf"),
        ("max", "fmax", "fmaxf"),
    )
    for func, generic_name, float_name in cases:
        inst = [lp.Assignment(xi, p.Variable(func)(yi, zi))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        code = lp.generate_code_v2(knl).device_code()

        assert generic_name in code
        if tp == "f32" and target == CTarget:
            assert float_name in code
        else:
            assert float_name not in code
def test_rob_stroud_bernstein(ctx_factory):
    """Print generated code for a kernel with nested loops and a running index.

    NOTE: ``tmp`` would have to be zero-filled beforehand.
    """
    cl_ctx = ctx_factory()

    kernel_args = [
        # Must declare coeffs to have "no" shape, to keep loopy
        # from trying to figure out the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "...",
    ]

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
        """
        for el,i2
            <> xi = qpts[1, i2]
            <> s = 1-xi
            <> r = xi/s
            <> aind = 0 {id=aind_init}

            for alpha1
                <> w = s**(deg-alpha1) {id=init_w}

                for alpha2
                    tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                    aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                end
            end
        end
        """,
        kernel_args,
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]))

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # el: 16-wide pieces on l.0, the remainder split again onto g.0 / ilp.
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(
        knl, "el_outer", 2,
        outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))
    knl = lp.tag_inames(knl, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})

    knl = lp.add_dtypes(
        knl, {"qpts": np.float32, "coeffs": np.float32, "tmp": np.float32})

    print(lp.generate_code_v2(knl))
def create_rand_args(ctx, knl, param_dict):
    """Build kernel arguments through ``lp.auto_test`` and return them.

    Reference arguments are created first via ``make_ref_args``; only their
    ``arg_data`` is reused for ``make_args``.  The intermediate containers are
    emptied and dropped before returning.

    :param ctx: context used to create the command queue.
    :param knl: loopy kernel the arguments are generated for.
    :param param_dict: parameters forwarded to the ``auto_test`` helpers.
    :return: the result of ``lp.auto_test.make_args``.
    """
    queue = cl.CommandQueue(ctx)
    data_info = lp.generate_code_v2(knl).implemented_data_info

    ref_args, ref_arg_data = lp.auto_test.make_ref_args(
        knl, data_info, queue, param_dict)

    # The reference arguments themselves are not needed.
    ref_args.clear()
    del ref_args

    rand_args = lp.auto_test.make_args(
        knl, data_info, queue, ref_arg_data, param_dict)

    # Empty the list in place before dropping the local name.
    del ref_arg_data[:]
    del ref_arg_data

    return rand_args
def test_automatic_scan_detection():
    """The generated device code for this sum must not mention "scan"."""
    domains = [
        "[n] -> {[i]: 0<=i<n}",
        "{[j]: 0<=j<=2*i}",
    ]
    knl = lp.make_kernel(domains, """ a[i] = sum(j, j**2) """)

    device_code = lp.generate_code_v2(knl).device_code()
    assert "scan" not in device_code
def test_cuda_short_vector():
    """Print CUDA device code for a kernel with a ``vec``-tagged inner iname."""
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())
    knl = lp.set_options(knl, write_code=True)

    # Split i by 4 and tag the inner part "vec"; the arrays are split by the
    # same count and their new axis is tagged "vec" as well.
    vector_width = 4
    knl = lp.split_iname(
        knl, "i", vector_width, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=vector_width)
    knl = lp.tag_array_axes(knl, "a,out", "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
def test_numba_cuda_target():
    """Print all code for a pairwise-distance kernel on ``NumbaCudaTarget``."""
    domain = "{[i,j,k]: 0<=i,j<M and 0<=k<N}"
    instruction = "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))"

    knl = lp.make_kernel(domain, instruction, target=lp.NumbaCudaTarget())
    knl = lp.assume(knl, "M>0")

    # i is mapped to g.0, the inner part of j to l.0.
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))

    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")
    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    all_code = lp.generate_code_v2(knl).all_code()
    print(all_code)
def code_to_compile(self):
    """Return the source code of the generated wrapper.

    A ``WrapperBuilder`` is populated from this object's iteration set,
    iteration region, arguments and kernel; the result is lowered through
    loopy.  For C++ kernels (``self._kernel._cpp``) the device programs are
    wrapped in an ``extern "C"`` block preceded by the processed preambles.

    :return: the code as a string.
    """
    from pyop2.codegen.builder import WrapperBuilder
    from pyop2.codegen.rep2loopy import generate

    builder = WrapperBuilder(
        iterset=self._iterset,
        iteration_region=self._iteration_region,
        pass_layer_to_kernel=self._pass_layer_arg)
    for argument in self._args:
        builder.add_argument(argument)
    builder.set_kernel(self._kernel)

    code = loopy.generate_code_v2(generate(builder))

    if not self._kernel._cpp:
        return code.device_code()

    from loopy.codegen.result import process_preambles

    # Fall back to no preambles when the attribute is absent.
    preambles = getattr(code, "device_preambles", [])
    preamble = "".join(process_preambles(preambles))
    device_code = "\n\n".join(
        str(program.ast) for program in code.device_programs)
    return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n"
def test_reduction_with_conditional():
    """The predicate must be emitted before the loop realizing the reduction.

    The purpose of the 'l' iname is to force the entire kernel (including the
    predicate) into device code.
    """
    knl = lp.make_kernel(
        "{ [l,i] : 0<=l,i<42 }",
        """
        if l > 0
            b[l] = sum(i, l*a[i])
        end
        """,
        [
            lp.ValueArg("n", dtype=np.int32),
            "...",
        ])

    knl = lp.tag_inames(knl, "l:g.0")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    code = lp.generate_code_v2(knl).device_code()
    print(code)

    # Check that the if appears before the loop that realizes the reduction.
    if_position = code.index("if")
    for_position = code.index("for")
    assert if_position < for_position
def test_ispc_target(occa_mode=False):
    """Print device and host code generated for the ISPC target.

    :param occa_mode: forwarded to ``ISPCTarget``.
    """
    from loopy.target.ispc import ISPCTarget

    kernel_args = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_args,
        target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(
        knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    # Preprocess and schedule explicitly before generating code.
    preprocessed = lp.preprocess_kernel(knl)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    codegen_result = lp.generate_code_v2(scheduled)

    print(codegen_result.device_code())
    print(codegen_result.host_code())
def test_c_execution_with_global_temporaries():
    """Run a C kernel that reads a read-only, initialized global temporary.

    The declaration ``int b[n]`` must not show up in the host code of a bare
    ``ExecutableCTarget``, and executing the kernel must copy the initializer
    values into ``a``.
    """
    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes

    n = 10
    initial_values = np.arange(n, dtype=np.int32)

    kernel_data = [
        lp.GlobalArg('a', shape=(n,), dtype=np.int32),
        lp.TemporaryVariable(
            'b',
            shape=(n,),
            initializer=initial_values,
            dtype=np.int32,
            read_only=True,
            scope=scopes.GLOBAL),
    ]
    knl = lp.make_kernel(
        '{[i]: 0 <= i < n}',
        """ a[i] = b[i] """,
        kernel_data,
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ('int b[%d]' % n) not in host_code

    result = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(result, np.arange(10))
def gen_code(knl):
    """Return the device code and host code of *knl*, joined by a newline.

    The kernel is preprocessed and scheduled before code generation.
    """
    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    result = lp.generate_code_v2(scheduled)

    device_part = result.device_code()
    host_part = result.host_code()
    return device_part + "\n" + host_part
import loopy as lp
import pyopencl as cl
import pyopencl.array
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401

# setup
# -----
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# input: the integers 0..n-1 as float32 on the device
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

# create
# ------
knl = lp.make_kernel(
    "{ [i]: 0<=i<n }",
    "out[i] = 2*a[i]",
)

# transform
# ---------
knl = lp.split_iname(
    knl, "i", 128,
    outer_tag="g.0", inner_tag="l.0",
)

# execute
# -------
evt, (out,) = knl(queue, a=a)
# ENDEXAMPLE

# Additionally print the device code generated for the transformed kernel.
typed_knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
knl = typed_knl
print(lp.generate_code_v2(knl).device_code())
# create: two assignments, each preceded by a global barrier
knl = lp.make_kernel(
    "{ [i,k]: 0<=i<n and 0<=k<3 }",
    """
    for i, k
        ... gbarrier
        c[k,i] = a[k, i + 1]
        ... gbarrier
        out[k,i] = c[k,i]
    end
    """,
    seq_dependencies=True,
)

# transform
knl = lp.split_iname(
    knl, "i", 128,
    outer_tag="g.0", inner_tag="l.0",
)
knl = lp.add_and_infer_dtypes(
    knl,
    {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    },
)

# schedule
from loopy.preprocess import preprocess_kernel
from loopy.schedule import get_one_scheduled_kernel

knl = get_one_scheduled_kernel(preprocess_kernel(knl))

# map schedule onto host or device
print(knl)

cgr = lp.generate_code_v2(knl)

print(cgr.device_code())
print(cgr.host_code())