def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that a kernel with only float32 arrays yields no ``double``.

    The kernel is preprocessed for the first device of the context, and the
    code generated for every loop schedule is checked for the substring
    ``"double"``.
    """
    cl_ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=("n", )),
        lp.GlobalArg("b", np.float32, shape=("n", )),
        lp.GlobalArg("c", np.float32, shape=("n", )),
        lp.ValueArg("n", np.int32),
    ]

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> bb = a[i] - b[i]
        c[i] = bb
        """,
        kernel_args,
        assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, cl_ctx.devices[0])

    for scheduled_knl in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(scheduled_knl)
        assert "double" not in generated
def test_sized_and_complex_literals(ctx_factory):
    """Run a kernel that uses the literals ``5jf``, ``5j`` and ``5f``.

    The kernel serves as its own reference in ``lp.auto_test_vs_ref`` and is
    run with ``n=5``.
    """
    cl_ctx = ctx_factory()

    float_vectors = [
        lp.GlobalArg(name, np.float32, shape=("n", ))
        for name in ("a", "b", "c")
    ]

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> aa = 5jf
        <> bb = 5j
        a[i] = imag(aa)
        b[i] = imag(bb)
        c[i] = 5f
        """,
        float_vectors + [lp.ValueArg("n", np.int32)],
        assumptions="n>=1")

    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters=dict(n=5))
def test_vector_types(ctx_factory, vec_len):
    """Compare a vector-tagged variant of ``out[i,j] = 2*a[i,j]`` to a reference.

    ``vec_len`` is fixed as a kernel parameter. The reference is the kernel
    right after that fix; the tested kernel additionally tags the axes of
    ``out`` as ``"c,vec"``, unrolls ``j`` and splits ``i`` by 128 onto
    ``g.0`` / ``l.0``.
    """
    cl_ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=lp.auto),
        lp.GlobalArg("out", np.float32, shape=lp.auto),
        "...",
    ]
    base_knl = lp.make_kernel(
        "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
        "out[i,j] = 2*a[i,j]",
        kernel_args)
    base_knl = lp.fix_parameters(base_knl, vec_len=vec_len)

    # The untransformed kernel is the reference.
    vec_knl = lp.tag_data_axes(base_knl, "out", "c,vec")
    vec_knl = lp.tag_inames(vec_knl, dict(j="unr"))
    vec_knl = lp.split_iname(vec_knl, "i", 128,
                             outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(base_knl, cl_ctx, vec_knl, parameters=dict(n=20000))
def test_conditional(ctx_factory):
    """Run a kernel whose updates of ``my_a`` are guarded by ``if=`` predicates.

    Two instructions carry ``if=a_less_than_zero``. The kernel is checked
    against itself with ``lp.auto_test_vs_ref`` for ``n=200``.
    """
    cl_ctx = ctx_factory()

    instructions = """
        <> my_a = a[i,j] {id=read_a}
        <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j}
        my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero}
        my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero}
        out[i,j] = 2*my_a {dep=aplus}
        """

    knl = lp.make_kernel(
        "{ [i,j]: 0<=i,j<n }",
        instructions,
        [
            lp.GlobalArg("a", np.float32, shape=lp.auto),
            lp.GlobalArg("out", np.float32, shape=lp.auto),
            "...",
        ])

    # Reference and tested kernel are the same object here.
    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters=dict(n=200))
def test_generate_c_snippet():
    """Build a two-reduction kernel for ``CTarget`` and print its body.

    The instructions are ``lp.Assignment`` objects built from pymbolic
    variables. ``k`` is split by 4 with the inner part unrolled, a loop
    priority is set, and the scheduled kernel's body is printed. Nothing is
    asserted.
    """
    from functools import partial

    from loopy.target.c import CTarget
    from pymbolic import var

    I, f, df, q_v, eN, k, u = (  # noqa
        var(name) for name in ("I", "f", "df", "q_v", "eN", "k", "u"))

    sum_over = partial(lp.Reduction, "sum", allow_simultaneous=True)
    assign = lp.Assignment

    instructions = [
        assign(f[I], sum_over(k, q_v[k, I]*u)),
        assign(df[I], sum_over(k, q_v[k, I])),
    ]
    kernel_args = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_args,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(knl))
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    """Compare a split and prefetched float32 matmul to the plain kernel.

    Matrices are ``n x n`` with ``n = 6 * 16 * 2``. ``i`` and ``j`` are each
    split into a group axis, a local axis and an ILP axis, ``k`` is split by
    16, and ``a`` is prefetched.
    """
    dtype = np.float32
    cl_ctx = ctx_factory()
    order = "C"

    n = 6 * 16 * 2

    matrices = [
        lp.GlobalArg(name, dtype, shape=(n, n), order=order)
        for name in ("a", "b", "c")
    ]
    seq_knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<%d}" % n,
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        matrices,
        name="matmul")

    i_reg = j_reg = 2
    i_chunks = j_chunks = 16

    knl = lp.split_iname(seq_knl, "i", i_reg * i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg,
                         outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg * j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg,
                         outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(
        knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(
        seq_knl, cl_ctx, knl,
        op_count=[2 * n**3 / 1e9],
        op_label=["GFlops"],
        parameters={})
def get_source_args(self):
    """Return this object's source arguments.

    The list starts with one ``KernelArgument`` wrapping a ``lp.GlobalArg``
    named ``self.dir_vec_name`` (shape ``(self.dim, "nsources")``, dim tags
    ``"sep,C"``) and continues with ``self.inner_kernel.get_source_args()``.
    """
    dir_vec_arg = KernelArgument(
        loopy_arg=lp.GlobalArg(
            self.dir_vec_name,
            None,
            shape=(self.dim, "nsources"),
            dim_tags="sep,C"),
    )
    return [dir_vec_arg] + self.inner_kernel.get_source_args()
def test_reduction_with_conditional():
    """Test that a realized reduction inherits the instruction's predicates.

    Tested with the CTarget, because the PyOpenCL target will hoist the
    conditional into the host code in this minimal example.
    """
    kernel_args = [
        lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
        lp.GlobalArg("n", dtype=np.float32, shape=()),
    ]
    knl = lp.make_kernel(
        "{ [i] : 0<=i<42 }",
        """
        if n > 0
            <>b = sum(i, a[i])
        end
        """,
        kernel_args,
        target=lp.CTarget())

    body = lp.generate_body(knl)

    # The "if" must come before the "for" loop that realizes the reduction.
    if_pos = body.index("if")
    for_pos = body.index("for")
    assert if_pos < for_pos
def test_write_parameter(ctx_factory):
    """Expect ``RuntimeError`` when a kernel assigns to its parameter ``n``.

    The error is expected from ``lp.CompiledKernel(...).get_code()``.
    """
    import pytest

    dtype = np.float32
    cl_ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", dtype, shape=()),
        lp.GlobalArg("b", dtype, shape=()),
        lp.ValueArg("n", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        """
        a = sum((i,j), i*j)
        b = sum(i, sum(j, i*j))
        n = 15
        """,
        kernel_args,
        assumptions="n>=1")

    with pytest.raises(RuntimeError):
        lp.CompiledKernel(cl_ctx, knl).get_code()
def test_memory_tools_defn():
    """Check the strings ``mem.define`` returns for each test configuration.

    For every option set from ``__test_cases()`` a ``CallgenResult`` and a
    memory manager are built. ``define`` is then called for three global
    arrays, two value arguments and one read-only initialized temporary, and
    its result is compared with the expected string for ``opts.lang``.

    Raises:
        NotImplementedError: if ``opts.lang`` is neither ``'opencl'`` nor
            ``'c'``.
    """
    wrapper = __test_cases()
    for opts in wrapper:
        # create a dummy callgen
        callgen = CallgenResult(order=opts.order, lang=opts.lang,
                                dev_mem_type=wrapper.state['dev_mem_type'],
                                type_map=type_map(opts.lang))
        # create a memory manager
        mem = get_memory(callgen, host_namer=HostNamer(),
                         device_namer=DeviceNamer())

        # A one-element shape needs the trailing comma to be a tuple; the
        # original `(arc.problem_size)` was just a parenthesised scalar.
        a1 = lp.GlobalArg('a1', shape=(arc.problem_size,), dtype=np.int32)
        a2 = lp.GlobalArg('a2', shape=(arc.problem_size, 10), dtype=np.int64)
        d3 = lp.GlobalArg('d3', shape=(arc.problem_size, 10, 10),
                          dtype=np.float64)
        a4 = lp.ValueArg('a4', dtype=np.int64)
        a5 = lp.ValueArg('a5', dtype=np.int32)
        a6 = lp.TemporaryVariable('a6', initializer=np.array([0, 1, 2]),
                                  read_only=True)

        if opts.lang == 'opencl':
            assert mem.define(True, a1) == 'cl_mem d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'cl_mem d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'cl_uint d_a5;'
            # NOTE(review): this assertion appears twice in the original;
            # kept in case the repeated call is intentional -- confirm.
            assert mem.define(True, a5) == 'cl_uint d_a5;'
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert (mem.define(False, a6, host_constant=True)
                    == 'const long int h_a6[3] = {0, 1, 2};')
        elif opts.lang == 'c':
            assert mem.define(True, a1) == 'int* d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'double* d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'int d_a5;'
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert (mem.define(False, a6, host_constant=True)
                    == 'const long int h_a6[3] = {0, 1, 2};')
        else:
            raise NotImplementedError
def test_lpy_iname_presplit(opts):
    """
    Tests that accesses through pre-split inames (``j_outer`` / ``j_inner``)
    to loopy arrays excluded from splitting are handled correctly.

    After ``split_loopy_arrays`` with ``dont_split=['a1', 'a2']``, code
    generation must succeed and both assignees must be subscripts indexed by
    ``(j_outer * VECTOR_WIDTH + j_inner, i)``.
    """
    from pymbolic.primitives import Subscript, Variable

    # create array split
    splitter = array_splitter(opts)

    # create a test kernel
    arrays = [
        lp.GlobalArg('a1', shape=(20, 10), order=opts.order),
        lp.GlobalArg('a2', shape=(16, 16), order=opts.order),
    ]
    outer_extent = int(np.ceil(10 / VECTOR_WIDTH))
    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(outer_extent),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    knl = lp.make_kernel(
        domains,
        """
        a1[j_outer, i] = 1 {id=a1}
        a2[j_outer, i] = 1 {id=a2}
        """,
        arrays,
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    knl = splitter.split_loopy_arrays(knl, dont_split=['a1', 'a2'])

    # ensure there's no loopy errors
    lp.generate_code_v2(knl).device_code()

    def expected_index():
        return (Variable('j_outer') * VECTOR_WIDTH + Variable('j_inner'),
                Variable('i'))

    # 'a1' first, then the evenly sized 'a2'
    for insn_id in ('a1', 'a2'):
        assignee = next(insn.assignee for insn in knl.instructions
                        if insn.id == insn_id)
        assert isinstance(assignee, Subscript) \
            and assignee.index == expected_index()
def test_math_function(target, tp):
    """Check which ``fmin`` / ``fmax`` spelling appears in the device code.

    For ``min`` and ``max`` a kernel ``x[i] = func(y[i], z[i])`` is generated
    for ``target``. The code must contain ``fmin`` / ``fmax``. The ``f``
    suffixed names (``fminf`` / ``fmaxf``) must appear exactly when ``tp`` is
    ``"f32"`` and ``target`` is ``CTarget``.
    """
    data_type = {"f32": np.float32, "f64": np.float64}[tp]

    import pymbolic.primitives as p

    idx = p.Variable("i")
    xi, yi, zi = (p.Subscript(p.Variable(name), idx)
                  for name in ("x", "y", "z"))

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [lp.GlobalArg(name, data_type, shape=(n, ))
            for name in ("x", "y", "z")]

    for func_name in ("min", "max"):
        inst = [lp.Assignment(xi, p.Variable(func_name)(yi, zi))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        code = lp.generate_code_v2(knl).device_code()

        c_name = "f" + func_name      # "fmin" / "fmax"
        c_name_f32 = c_name + "f"     # "fminf" / "fmaxf"

        assert c_name in code
        if tp == "f32" and target == CTarget:
            assert c_name_f32 in code
        else:
            assert c_name_f32 not in code
def __get_knl():
    """Return the ``cache_test`` kernel copying ``b`` into ``a``.

    The kernel has ten entries and targets ``ExecutableCTarget``. ``a`` is an
    int32 global array; ``b`` is a constant array with no explicit dtype.
    """
    return lp.make_kernel(
        '{[i]: 0 <= i < 10}',
        """
        a[i] = b[i]
        """,
        [
            lp.GlobalArg('a', shape=(10, ), dtype=np.int32),
            # `(10)` is the int 10, not a tuple; spell the 1-D shape the
            # same way as for `a`.
            lp.ConstantArg('b', shape=(10, )),
        ],
        target=ExecutableCTarget(),
        name='cache_test')
def test_np_bool_handling(ctx_factory):
    """Check that ``LogicalNot(np.bool_(False))`` assigned to ``y`` gives True.

    The kernel has the empty domain ``"{:}"`` and a single ``np.bool_``
    output ``y``; the fetched result must be the Python object ``True``.
    """
    import pymbolic.primitives as p
    from loopy.symbolic import parse

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    negated_false = p.LogicalNot(np.bool_(False))
    knl = lp.make_kernel(
        "{:}",
        [lp.Assignment(parse("y"), negated_false)],
        [lp.GlobalArg("y", dtype=np.bool_, shape=lp.auto)])

    _evt, (out, ) = knl(queue)
    result = out.get().item()
    assert result is True
def kernel_data(self) -> List[str]:
    """Return arguments / data to kernel.

    The keys of ``self.kernel_dtypes()`` are flattened into single names; a
    key may bundle several comma-separated names (e.g. ``'n,out'``). If the
    instance has an ``extra_data_shape`` mapping, each name in it is replaced
    in the list by an ``lp.GlobalArg`` whose shape is parsed from the
    comma-separated shape string.

    NOTE(review): replaced entries are ``lp.GlobalArg`` objects, so the
    ``List[str]`` annotation does not describe them.

    Raises:
        ValueError: if a name in ``extra_data_shape`` is not among the
            flattened names (raised by ``list.index``).
    """
    # Join then split so that bundled keys like 'n,out' become single names.
    names = ','.join(self.kernel_dtypes().keys()).split(',')

    if not hasattr(self, 'extra_data_shape'):
        return names

    for name, shape_spec in self.extra_data_shape.items():
        parsed_shape = tuple(pm.parse(dim) for dim in shape_spec.split(','))
        names[names.index(name)] = lp.GlobalArg(name, shape=parsed_shape)
    return names
def test_divisibility_assumption(ctx_factory):
    """Check that splitting by 16 needs no ``if`` when ``n`` is a multiple of 16.

    The assumption ``exists zz: n = 16*zz`` is given to the kernel. The code
    for every schedule of the split kernel must not contain ``"if"``, and the
    split kernel is compared with the unsplit one for ``n = 16**3``.
    """
    cl_ctx = ctx_factory()

    ref_knl = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        ["b[i] = 2*a[i]"],
        [
            lp.GlobalArg("a", np.float32, shape=("n", )),
            lp.GlobalArg("b", np.float32, shape=("n", )),
            lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1 and (exists zz: n = 16*zz)")

    knl = lp.split_iname(ref_knl, "i", 16)
    knl = lp.preprocess_kernel(knl, cl_ctx.devices[0])

    for scheduled_knl in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(scheduled_knl)
        assert "if" not in generated

    lp.auto_test_vs_ref(ref_knl, cl_ctx, knl, parameters={"n": 16**3})
def test_join_inames(ctx_factory):
    """Join the two prefetch inames of ``a`` and compare with the plain kernel.

    ``a`` is prefetched over ``i`` and ``j``; the resulting inames
    ``a_dim_0`` and ``a_dim_1`` are then joined.
    """
    cl_ctx = ctx_factory()

    square = (16, 16,)
    ref_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<16}",
        ["b[i,j] = 2*a[i,j]"],
        [
            lp.GlobalArg("a", np.float32, shape=square),
            lp.GlobalArg("b", np.float32, shape=square),
        ],
    )

    knl = lp.add_prefetch(ref_knl, "a", sweep_inames=["i", "j"],
                          default_tag="l.auto")
    knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])

    lp.auto_test_vs_ref(ref_knl, cl_ctx, knl, print_ref_code=True)
def expression_argument(expr, parameters):
    """Register a loopy argument for ``expr`` and return a variable for it.

    A ``loopy.ValueArg`` is created when ``expr.shape`` is ``()``, otherwise
    a ``loopy.GlobalArg`` with that shape. The argument is stored in
    ``parameters.kernel_data`` at the position of ``expr`` in
    ``parameters.wrapper_arguments``.

    Returns:
        ``pym.Variable(expr.name)``.
    """
    name, shape, dtype = expr.name, expr.shape, expr.dtype

    arg = (loopy.ValueArg(name, dtype=dtype)
           if shape == ()
           else loopy.GlobalArg(name, dtype=dtype, shape=shape))

    position = parameters.wrapper_arguments.index(expr)
    parameters.kernel_data[position] = arg
    return pym.Variable(name)
def vanilla():
    """Return a kernel incrementing ``a[i]`` by one for ``k <= i < n``.

    ``k`` and ``n`` are int32 value arguments and ``a`` is an int32 global
    array. The assumption ``k >= 0 and n >= k`` is attached before returning.
    """
    kernel_args = [
        lp.ValueArg("k", dtype="int32"),
        lp.ValueArg("n", dtype="int32"),
        lp.GlobalArg("a", shape=(None, ), dtype="int32"),
    ]
    knl = lp.make_kernel(
        "{ [i] : k <= i < n}",
        """
        a[i] = a[i] + 1
        """,
        kernel_args)
    return lp.assume(knl, "k >= 0 and n >= k")
def test_nonsense_reduction(ctx_factory):
    """Expect ``RuntimeError`` when preprocessing ``a[i] = sum(i, 2)``.

    The instruction reduces over ``i`` while also indexing its assignee by
    ``i``.
    """
    import pytest

    cl_ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<100}",
        """
        a[i] = sum(i, 2)
        """,
        [lp.GlobalArg("a", np.float32, shape=(100, ))])

    with pytest.raises(RuntimeError):
        lp.preprocess_kernel(knl, cl_ctx.devices[0])
def test_nested_dependent_reduction(ctx_factory):
    """Check a reduction whose bound depends on ``i`` and on data.

    ``j`` runs over ``0 <= j < i + sumlen`` with ``sumlen = ell[i]``. With
    ``ell = arange(n)`` the bound is ``2*i``, so the expected sum of ``j`` is
    ``(2*i - 1) * 2*i / 2``.
    """
    dtype = np.dtype(np.int32)
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    domains = ["{[i]: 0<=i<n}", "{[j]: 0<=j<i+sumlen}"]
    instructions = [
        "<> sumlen = ell[i]",
        "a[i] = sum(j, j)",
    ]
    kernel_args = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", dtype, ("n", )),
        lp.GlobalArg("ell", np.int32, ("n", )),
    ]
    knl = lp.make_kernel(domains, instructions, kernel_args)

    n = 330
    ell = np.arange(n, dtype=np.int32)
    _evt, (a, ) = knl(queue, ell=ell, n=n, out_host=True)

    tgt_result = (2 * ell - 1) * 2 * ell / 2
    assert (a == tgt_result).all()
def __get_knl():
    """Return the ``cache_test`` kernel copying ``b`` into ``a``.

    The kernel has ten entries and targets ``ExecutableCTarget``. ``a`` is an
    int32 global array; ``b`` is a constant array with no explicit dtype.
    """
    return lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """
        a[i] = b[i]
        """,
        [
            lp.GlobalArg("a", shape=(10, ), dtype=np.int32),
            # `(10)` is the int 10, not a tuple; spell the 1-D shape the
            # same way as for `a`.
            lp.ConstantArg("b", shape=(10, )),
        ],
        target=ExecutableCTarget(),
        name="cache_test")
def test_split_reduction(ctx_factory):
    """Apply ``lp.split_reduction_outward`` for ``"j,k"`` to a triple sum.

    Only checks that kernel creation and the transformation run;
    ``ctx_factory`` is not used and nothing is asserted.
    """
    kernel_args = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild,a",
            None, shape=None),
        "...",
    ]
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        """
        b = sum((i,j,k), a[i,j,k])
        """,
        kernel_args)

    knl = lp.split_reduction_outward(knl, "j,k")
def test_modulo_indexing(ctx_factory):
    """Print a kernel that indexes ``a`` with ``(i+j)%n``, and its code.

    The highlighted code is requested with ``a`` typed as ``np.float32``.
    Nothing is asserted.
    """
    cl_ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        b[i] = sum(j, a[(i+j)%n])
        """,
        [lp.GlobalArg("a", None, shape="n"), "..."])

    print(knl)

    compiled = lp.CompiledKernel(cl_ctx, knl)
    print(compiled.get_highlighted_code(dict(a=np.float32, )))
def pick_used_centers(self):
    """Return the ``pick_used_centers`` kernel.

    For each ``i`` in ``0 <= i < ntargets`` the kernel writes 1 to
    ``center_is_used[target_to_center[i]]`` when ``target_to_center[i] >= 0``.
    The write-race warning for that instruction is silenced, and ``i`` is
    split by 128 onto ``g.0`` / ``l.0``.
    """
    kernel_args = [
        lp.GlobalArg("target_to_center", shape="ntargets", offset=lp.auto),
        lp.GlobalArg("center_is_used", shape="ncenters"),
        lp.ValueArg("ncenters", np.int32),
        lp.ValueArg("ntargets", np.int32),
    ]

    knl = lp.make_kernel(
        """{[i]: 0<=i<ntargets}""",
        """
            <>target_has_center = (target_to_center[i] >= 0)
            center_is_used[target_to_center[i]] = 1 \
                {id=center_is_used_write,if=target_has_center}
        """,
        kernel_args,
        name="pick_used_centers",
        silenced_warnings="write_race(center_is_used_write)",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    return lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
def test_plain_matrix_mul(ctx_factory):
    """Compare a tiled, prefetched matmul to the plain kernel.

    Runs once with ``cl_array.vec.float4`` entries (checked with
    ``check_float4``) and once with ``np.float32`` entries (no custom check).
    The matrix size comes from ``get_suitable_size(ctx)``.
    """
    cl_ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(cl_ctx)

    variants = [
        (cl_array.vec.float4, check_float4, 4),
        (np.float32, None, 1),
    ]
    for dtype, check, vec_size in variants:
        matrices = [
            lp.GlobalArg(name, dtype, shape=(n, n), order=order)
            for name in ("a", "b", "c")
        ]
        ref_knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            matrices,
            name="matmul")

        knl = lp.split_iname(ref_knl, "i", 16,
                             outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16,
                             outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"],
                              default_tag="l.auto")
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ],
                              default_tag="l.auto")

        lp.auto_test_vs_ref(
            ref_knl, cl_ctx, knl,
            op_count=[vec_size * 2 * n**3 / 1e9],
            op_label=["GFlops"],
            parameters={"n": n},
            check_result=check)
def test_dependent_loop_bounds_3(ctx_factory):
    """Test a domain dependency mediated only by a temporary.

    The point of this test is that it shows a dependency between domains
    that is exclusively mediated by the ``row_len`` temporary. It also makes
    sure that ``row_len`` gets read before any conditionals use it.

    A variant that splits ``jj`` onto ``g.1`` / ``l.1`` is expected to raise
    ``RuntimeError`` when its loop schedules are generated.
    """
    dtype = np.dtype(np.float32)
    cl_ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    knl = lp.make_kernel(domains, instructions, kernel_args)

    # The jj domain (index 1) must have the i domain (index 0) as parent.
    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    rule = "---------------------------------------------------"
    cknl = lp.CompiledKernel(cl_ctx, knl)
    print(rule)
    print(cknl.get_highlighted_code())
    print(rule)

    knl_bad = lp.split_iname(knl, "jj", 128,
                             outer_tag="g.1", inner_tag="l.1")

    # NOTE(review): `knl` is preprocessed here while `knl_bad` is scheduled
    # below without preprocessing -- confirm this is intended.
    knl = lp.preprocess_kernel(knl, cl_ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))
def test_get_field_args(proc_shape):
    """Check ``get_field_args`` against a fixed list of expected arguments.

    The test is skipped unless ``proc_shape == (1, 1, 1)``. Four containers
    of field expressions (a dict, a single expression and two lists) are
    passed to ``get_field_args``. Each result must contain exactly the
    entries of ``true_args``, in any order.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test field only on one rank")

    from pystella import Field, DynamicField, get_field_args

    x = Field("x", offset=(1, 2, 3))
    y = Field("y", offset="h")
    z = DynamicField("z", shape=(2, "a"))

    import loopy as lp
    true_args = [
        lp.GlobalArg("x", shape="(Nx+2, Ny+4, Nz+6)", offset=lp.auto),
        lp.GlobalArg("y", shape="(Nx+2*h, Ny+2*h, Nz+2*h)", offset=lp.auto),
        lp.GlobalArg("z", shape="(2, a, Nx, Ny, Nz)", offset=lp.auto),
        lp.GlobalArg("dzdx", shape="(2, a, 3, Nx, Ny, Nz)", offset=lp.auto),
    ]

    def lists_equal(a, b):
        # Mutual containment, ignoring order. Returns a real bool; the
        # original multiplied bools into an int accumulator.
        return all(item in b for item in a) and all(item in a for item in b)

    expressions = {x: y, y: x * z + z.pd[0]}
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    expressions = x * y + z + z.pd[2]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    expressions = [x, y, y * z**2, 3 + z.pd[0] + z.pd[1]]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    expressions = [shift_fields(x, (1, 2, 3)), y + z.pd[0], y * z**2]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)
def left_W(ctx):
    """Return the kernel computing ``l`` from ``u`` and ``v``.

    ``l[alpha,alpha1]`` is the product of a sum over ``i`` of
    ``u[i,alpha]*u[i,alpha1]`` and a sum over ``j`` of
    ``v[j,alpha]*v[j,alpha1]``. Arrays are float64 in C order. ``alpha1`` and
    ``alpha`` are split onto group/local axes; ``j`` and ``i`` are split by 16.
    """
    order = 'C'
    dtype = np.float64

    kernel_args = [
        lp.GlobalArg("v", dtype, shape="n, r", order=order),
        lp.GlobalArg("u", dtype, shape="n, r", order=order),
        lp.GlobalArg("l", dtype, shape="r, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ]
    knl = lp.make_kernel(
        ctx.devices[0],
        ["{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}"],
        [
            "l[alpha,alpha1]=sum((i), u[i,alpha]*u[i,alpha1])"
            "*sum((j),v[j,alpha]*v[j,alpha1])",
        ],
        kernel_args,
        assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    for seq_iname in ("j", "i"):
        knl = lp.split_iname(knl, seq_iname, 16)
    return knl
def left_V(ctx):
    """Return the kernel computing ``l`` from ``u`` and ``w``.

    ``l[alpha,alpha1]`` is the product of a sum over ``i`` of
    ``u[alpha,i]*u[alpha1,i]`` and a sum over ``k`` of
    ``w[alpha,k]*w[alpha1,k]``. Arrays are float32 in C order. ``alpha1`` and
    ``alpha`` are split onto group/local axes; ``i`` and ``k`` are split by 16.
    """
    order = 'C'
    dtype = np.float32

    kernel_args = [
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.GlobalArg("l", dtype, shape="r, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ]
    knl = lp.make_kernel(
        ctx.devices[0],
        ["{[i,k,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=i,k<n}"],
        [
            "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])"
            "*sum((k),w[alpha,k]*w[alpha1,k])",
        ],
        kernel_args,
        assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    for seq_iname in ("i", "k"):
        knl = lp.split_iname(knl, seq_iname, 16)
    return knl