def get_2d_knl(context, dtype):
    """Build and compile a kernel that fills ``x`` and ``y`` over ``[i, j]``.

    Each index is scaled to ``4*idx/(n-1)``; the resulting pair is rotated
    by a fixed angle of 0.3 and shifted by -2 before being stored.

    :arg context: handed to :class:`lp.CompiledKernel` unchanged.
    :arg dtype: element type of the global arrays ``x`` and ``y``.
    :returns: the :class:`lp.CompiledKernel` built from the split kernel.
    """
    instructions = """
        <> xx = 4*i/(n-1)
        <> yy = 4*j/(n-1)
        <float64> angle = 0.3
        <> s = sin(angle)
        <> c = cos(angle)
        x[i,j] = c*xx + s*yy - 2
        y[i,j] = -s*xx + c*yy - 2
        """
    kernel_data = [
        lp.GlobalArg("x,y", dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        instructions,
        kernel_data,
        assumptions="n>0")

    # Split both inames into chunks of 16: "i" goes to axis 1, "j" to axis 0.
    for iname, axis in (("i", 1), ("j", 0)):
        kernel = lp.split_iname(
            kernel, iname, 16,
            outer_tag="g.%d" % axis, inner_tag="l.%d" % axis)

    return lp.CompiledKernel(context, kernel)
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Compile a kernel whose inner reduction bound depends on loaded data.

    ``npart`` is read from ``nparticles_per_box`` inside the ``isrc_box``
    loop and bounds the ``isrc`` domain; the per-box sum is itself summed
    over ``isrc_box``.  The test only prints the generated code.
    """
    int_dtype = np.dtype(np.int32)
    ctx = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}"
    ]
    instructions = """
        for itgt
            for isrc_box
                <> npart = nparticles_per_box[isrc_box]
                <> boxsum = sum(isrc, isrc+isrc_box+itgt)
            end
            a[itgt] = sum(isrc_box, boxsum)
        end
        """
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data, assumptions="ntgts>=1")

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_code())
def test_offsets_and_slicing(ctx_factory):
    """Run ``b[i,j] = 2*a[i,j]`` on sliced views of larger device arrays.

    The kernel is built with ``default_offset=lp.auto`` and automatic
    strides on the leading axis.  The result written through the ``b``
    slice is compared against the same assignment done on host copies.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m }",
        """
        b[i,j] = 2*a[i,j]
        """,
        assumptions="n>=1 and m>=1",
        default_offset=lp.auto)
    kernel = lp.tag_data_axes(kernel, "a,b", "stride:auto,stride:1")

    compiled = lp.CompiledKernel(ctx, kernel)

    # Full-size device arrays together with host-side copies of both.
    a_full = cl.clrandom.rand(queue, (n, n), np.float64)
    a_full_h = a_full.get()
    b_full = cl.clrandom.rand(queue, (n, n), np.float64)
    b_full_h = b_full.get()

    # The output window has the same extent as the input window but is
    # shifted by (3, 4) relative to it.
    row_shift, col_shift = 3, 4
    a_sub = (slice(3, 10), slice(5, 10))
    b_sub = (
        slice(3 + row_shift, 10 + row_shift),
        slice(5 + col_shift, 10 + col_shift),
    )

    a = a_full[a_sub]
    b = b_full[b_sub]

    # Host-side reference result.
    b_full_h[b_sub] = 2 * a_full_h[a_sub]

    print(compiled.get_highlighted_code({"a": a.dtype}))
    compiled(queue, a=a, b=b)

    import numpy.linalg as la
    assert la.norm(b_full.get() - b_full_h) < 1e-13
def get_3d_knl(context, dtype):
    """Build and compile a kernel that fills ``x``, ``y`` and ``z``.

    For each ``(i, j)`` the kernel forms the angles ``phi = 2*M_PI/n * i``
    and ``theta = 2*M_PI/n * j`` and stores
    ``5*cos(phi)*(3 + cos(theta))``, ``5*sin(phi)*(3 + cos(theta))`` and
    ``5*sin(theta)`` into ``x``, ``y`` and ``z`` respectively.

    :arg context: handed to :class:`lp.CompiledKernel` unchanged.
    :arg dtype: element type of the global arrays ``x``, ``y`` and ``z``.
    :returns: the :class:`lp.CompiledKernel` built from the split kernel.
    """
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        """
        <> phi = 2*M_PI/n * i
        <> theta = 2*M_PI/n * j
        x[i,j] = 5*cos(phi) * (3 + cos(theta))
        y[i,j] = 5*sin(phi) * (3 + cos(theta))
        z[i,j] = 5*sin(theta)
        """,
        [
            # The name list used to end in a stray comma ("x,y,z,"), which
            # produced an empty name token; list exactly the three arrays.
            lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
            lp.ValueArg("n", np.int32),
        ])

    # Split both inames into chunks of 16: "i" goes to axis 1, "j" to axis 0.
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")

    return lp.CompiledKernel(context, knl)
def test_dependent_loop_bounds_2(ctx_factory):
    """Print code for a row-wise sum whose bounds come from temporaries.

    ``row_start`` and ``row_len`` are both computed from ``a_rowstarts``;
    ``row_len`` bounds the ``jj`` domain.  The ``i`` iname is split by 128
    before compilation.
    """
    float_dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_start = a_rowstarts[i]",
        "<> row_len = a_rowstarts[i+1] - row_start",
        "ax[i] = sum(jj, a_values[[row_start+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", float_dtype, strides=(1, )),
        lp.GlobalArg("ax", float_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data,
        assumptions="n>=1 and row_len>=1")
    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    compiled = lp.CompiledKernel(ctx, kernel)

    separator = "---------------------------------------------------"
    print(separator)
    print(compiled.get_highlighted_code())
    print(separator)
def test_dependent_domain_insn_iname_finding(ctx_factory):
    """Check that ``set_strength`` picks up the ``isrc_box`` iname.

    The ``isrc`` domain is bounded by ``isrc_start``/``isrc_end``, which are
    temporaries computed from ``isrc_box``.  After the assertion the
    highlighted code is printed for a fixed set of argument dtypes.
    """
    ctx = ctx_factory()

    domains = [
        "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
        "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
    ]
    instructions = """
        <> src_ibox = source_boxes[isrc_box]
        <> isrc_start = box_source_starts[src_ibox]
        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
        <> strength = strengths[isrc] {id=set_strength}
        """
    kernel_data = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild",
            None, shape=None),
        lp.GlobalArg("strengths", None, shape="nsources"),
        "..."
    ]

    kernel = lp.make_kernel(domains, instructions, kernel_data)
    print(kernel)

    assert "isrc_box" in kernel.insn_inames("set_strength")

    arg_dtypes = {
        "source_boxes": np.int32,
        "box_source_starts": np.int32,
        "box_source_counts_nonchild": np.int32,
        "strengths": np.float64,
        "nsources": np.int32,
    }
    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code(arg_dtypes))
def test_independent_multi_domain(ctx_factory):
    """Run two unrelated single-iname domains within one kernel.

    ``a`` is filled with 1 over ``i`` and ``b`` with 2 over ``j``.  Neither
    domain may have a parent; shapes and contents of both outputs are
    checked for ``n = 50``.
    """
    float_dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    array_args = [
        lp.GlobalArg(name, float_dtype, shape="n", order="C")
        for name in ("a", "b")
    ]
    kernel = lp.make_kernel(
        [
            "{[i]: 0<=i<n}",
            "{[j]: 0<=j<n}",
        ],
        [
            "a[i] = 1",
            "b[j] = 2",
        ],
        array_args + [lp.ValueArg("n", np.int32)])

    # Both inames are split by 16 and tagged onto the same axis 0.
    for iname in ("i", "j"):
        kernel = lp.split_iname(
            kernel, iname, 16, outer_tag="g.0", inner_tag="l.0")

    assert kernel.parents_per_domain() == [None, None]

    n = 50
    compiled = lp.CompiledKernel(ctx, kernel)
    evt, (a, b) = compiled(queue, n=n, out_host=True)

    assert a.shape == (50, )
    assert b.shape == (50, )
    assert (a == 1).all()
    assert (b == 2).all()
def test_multi_nested_dependent_reduction(ctx_factory):
    """Compile a reduction over two inames, one with a data-dependent bound.

    ``npart`` is loaded from ``nparticles_per_box[isrc_box]`` and bounds the
    ``isrc`` domain, which is reduced together with ``isrc_box``.  The test
    only prints the generated code.
    """
    int_dtype = np.dtype(np.int32)
    ctx = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}"
    ]
    instructions = [
        "<> npart = nparticles_per_box[isrc_box]",
        "a[itgt] = sum((isrc_box, isrc), 1)",
    ]
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data, assumptions="ntgts>=1")

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_code())
def time_kernel(
        self,
        knl,
        param_dict,
        ):
    """Return the average wall time of executing *knl*, in seconds.

    The kernel is compiled once and run
    ``self.n_time_trials + self.n_warmup_time_trials`` times on random
    arguments produced by ``create_rand_args``; the first
    ``self.n_warmup_time_trials`` measurements are discarded before
    averaging.

    :arg knl: the kernel to time.
    :arg param_dict: kernel parameters forwarded to ``create_rand_args``.
    :returns: the mean of the retained per-run wall times.
    :raises ValueError: if *param_dict* is ``None``.
    """
    if param_dict is None:
        raise ValueError(
            "Wall time requires dictionary of kernel parameters.")

    # default_timer is the clock timeit itself uses for benchmarking;
    # time.time() can be adjusted by the system and may be coarse.
    from timeit import default_timer

    import numpy as np

    ctx = self.get_cl_context(knl)
    queue = cl.CommandQueue(ctx)

    arg_arrays = create_rand_args(ctx, knl, param_dict)
    knl = lp.set_options(knl, no_numpy=True)
    compiled = lp.CompiledKernel(ctx, knl)

    wtimes = []
    for _ in range(self.n_time_trials + self.n_warmup_time_trials):
        # Drain the queue before starting the clock so pending work is not
        # attributed to this run, and again before stopping it so the run
        # has actually completed.
        queue.finish()
        tstart = default_timer()
        compiled(queue, **arg_arrays)
        queue.finish()
        wtimes.append(default_timer() - tstart)

    return np.average(wtimes[self.n_warmup_time_trials:])
def test_nested_dependent_reduction(ctx_factory):
    """Run ``a[i] = sum(j, j)`` where the ``j`` bound is ``i + ell[i]``.

    With ``ell = arange(n)`` the result for each ``i`` is compared against
    the closed form ``(2*ell-1)*2*ell/2``.
    """
    int_dtype = np.dtype(np.int32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    domains = [
        "{[i]: 0<=i<n}",
        "{[j]: 0<=j<i+sumlen}"
    ]
    instructions = [
        "<> sumlen = ell[i]",
        "a[i] = sum(j, j)",
    ]
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("ell", np.int32, ("n",)),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_data)

    compiled = lp.CompiledKernel(ctx, kernel)

    n = 330
    ell = np.arange(n, dtype=np.int32)
    evt, (a,) = compiled(queue, ell=ell, n=n, out_host=True)

    # Sum of 0..(2*ell - 1).
    tgt_result = (2*ell-1)*2*ell/2
    assert (a == tgt_result).all()
def test_dependent_loop_bounds(ctx_factory):
    """Print code for a row-wise sum with a data-dependent inner bound.

    ``row_len`` is the difference of consecutive ``a_rowstarts`` entries
    and bounds the ``jj`` domain.
    """
    float_dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
        "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", float_dtype),
        lp.GlobalArg("a_sum", float_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data,
        assumptions="n>=1 and row_len>=1")

    compiled = lp.CompiledKernel(ctx, kernel)

    separator = "---------------------------------------------------"
    print(separator)
    print(compiled.get_highlighted_code())
    print(separator)
def test_triangle_domain(ctx_factory):
    """Print the kernel and its code for a domain restricted to ``i <= j``."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1")

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def solve_it(n, r, ctx, a, b):
    """Decompose *a* and solve against a copy of *b* using two kernels.

    The kernel from ``LU_decomposition`` is run with ``syst=a`` and ``n=r``;
    its first output, cast to float64, is passed together with a copy of
    *b* to the kernel from ``LU_solver`` (with ``n=r`` and ``r=n``).

    :returns: the first output of the solve kernel, fetched via ``get()``,
        transposed, cast to float64 and copied.
    """
    rhs = b.copy()
    profiling = cl.command_queue_properties.PROFILING_ENABLE

    # Stage 1: decomposition.
    decompose_knl = LU_decomposition(ctx)
    queue = cl.CommandQueue(ctx, properties=profiling)
    cknl = lp.CompiledKernel(ctx, decompose_knl)
    evt, decompose_out = cknl(queue, syst=a, n=r)
    lu = decompose_out[0].astype(np.float64)

    # Stage 2: solve, on a fresh queue.
    solve_knl = LU_solver(ctx)
    queue = cl.CommandQueue(ctx, properties=profiling)
    cknl = lp.CompiledKernel(ctx, solve_knl)
    evt, solve_out = cknl(queue, LU=lu, bcopy=rhs, n=r, r=n)

    return solve_out[0].get().transpose().astype(np.float64).copy()
def test_rob_stroud_bernstein(ctx_factory):
    """Print code for a nested-loop kernel with explicit dependencies.

    The kernel accumulates ``w * coeffs[aind]`` into ``tmp`` while updating
    ``w`` and ``aind`` inside the ``alpha2`` loop.  ``nqp1d`` and ``deg``
    are fixed to 7 and 4, ``el`` is split twice, and the remaining inames
    are tagged before code generation.
    """
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    domain = "{[el, i2, alpha1,alpha2]: \
            0 <= el < nels and \
            0 <= i2 < nqp1d and \
            0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }"
    instructions = """
        for el,i2
            <> xi = qpts[1, i2]
            <> s = 1-xi
            <> r = xi/s
            <> aind = 0 {id=aind_init}

            for alpha1
                <> w = s**(deg-alpha1) {id=init_w}

                for alpha2
                    tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp}
                    w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                    aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                end
            end
        end
        """
    kernel_data = [
        # Must declare coeffs to have "no" shape, to keep loopy
        # from trying to figure it out the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "..."
    ]

    kernel = lp.make_kernel(
        domain, instructions, kernel_data,
        assumptions="deg>=0 and nels>=1")

    kernel = lp.fix_parameters(kernel, nqp1d=7, deg=4)
    kernel = lp.split_iname(kernel, "el", 16, inner_tag="l.0")
    kernel = lp.split_iname(
        kernel, "el_outer", 2,
        outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))
    kernel = lp.tag_inames(
        kernel, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})

    arg_dtypes = {
        "qpts": np.float32,
        "coeffs": np.float32,
        "tmp": np.float32,
    }
    print(lp.CompiledKernel(ctx, kernel).get_highlighted_code(arg_dtypes))
def test_modulo_indexing(ctx_factory):
    """Print the kernel and its code for an access indexed by ``(i+j)%n``."""
    ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("a", None, shape="n"), "..."]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        b[i] = sum(j, a[(i+j)%n])
        """,
        kernel_data)

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code({"a": np.float32}))
def test_arg_guessing(ctx_factory):
    """Print the kernel and its code when no kernel data is declared.

    ``make_kernel`` is called without an argument list, so ``a``, ``b`` and
    ``c`` are not described explicitly.
    """
    ctx = ctx_factory()

    instructions = """
        a = 1.5 + sum((i,j), i*j)
        b[i, j] = i*j
        c[i+j, j] = b[j,i]
        """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, assumptions="n>=1")

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def test_triangle_domain(ctx):
    """Print and return a kernel over the domain restricted to ``i <= j``.

    :arg ctx: context whose first device is passed to ``lp.make_kernel``
        and which is used to compile the kernel.
    :returns: the kernel created by ``lp.make_kernel``.
    """
    knl = lp.make_kernel(
        ctx.devices[0],
        [
            "{[i,j]: 0<=i,j<n and i <= j}",
        ],
        "a[i,j] = 17",
        assumptions="n>=1")

    # print() calls instead of Python 2 print statements: with a single
    # argument these behave the same on Python 2 and are valid on Python 3.
    print(knl)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())

    return knl
def test_owed_barriers(ctx_factory):
    """Print code for every schedule of a copy into a float32 temporary.

    The ``i`` iname is tagged ``l.0`` and the kernel is preprocessed before
    schedules are generated.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i]"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))])
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def test_nonlinear_index(ctx_factory):
    """Print the kernel and its code for a write indexed by ``i*i``."""
    ctx = ctx_factory()

    kernel_data = [
        lp.GlobalArg("a", shape="n"),
        lp.ValueArg("n"),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        """
        a[i*i] = 17
        """,
        kernel_data,
        assumptions="n>=1")

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def test_arg_guessing_with_reduction(ctx_factory):
    """Print the kernel and its code for ``simul_reduce`` without kernel data.

    ``make_kernel`` is called without an argument list; two of the
    instructions use ``simul_reduce(sum, (i,j), ...)``.
    """
    # For debugging output, enable:
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    instructions = """
        a = 1.5 + simul_reduce(sum, (i,j), i*j)
        d = 1.5 + simul_reduce(sum, (i,j), b[i,j])
        b[i, j] = i*j
        c[i+j, j] = b[j,i]
        """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, assumptions="n>=1")

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def test_simple_side_effect(ctx_factory):
    """Print every schedule and its code for an in-place ``a[i] + 1`` update.

    The domain declares both ``i`` and ``j`` although the instruction only
    mentions ``i``.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}",
        """
        a[i] = a[i] + 1
        """,
        [lp.GlobalArg("a", np.float32, shape=(100, ))])
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def test_fuzz_code_generator(ctx_factory):
    """Compare loopy-evaluated random expressions against pymbolic.

    For each of 50 random examples the expression is evaluated with
    ``pymbolic.evaluate`` and with a single-assignment loopy kernel; a
    relative error above 1e-10 prints diagnostics and fails the test.
    Examples whose reference evaluation raises ``ZeroDivisionError`` are
    skipped.

    :raises AssertionError: if a loopy result deviates from the reference.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    # Loop-invariant: import and helper are set up once, not per example.
    from pymbolic import evaluate

    def get_dtype(x):
        """Return complex128 for complex values, float64 otherwise."""
        if isinstance(x, (complex, np.complexfloating)):
            return np.complex128
        return np.float64

    #from expr_fuzz import get_fuzz_examples
    #for expr, var_values in get_fuzz_examples():
    for expr, var_values in generate_random_fuzz_examples(50):
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            continue

        knl = lp.make_kernel(
            "{ : }",
            [lp.Assignment("value", expr)],
            [lp.GlobalArg("value", np.complex128, shape=())]
            + [
                lp.ValueArg(name, get_dtype(val))
                for name, val in six.iteritems(var_values)
            ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value, ) = ck(queue, out_host=True, **var_values)
        err = abs(true_value - lp_value) / abs(true_value)
        if abs(err) > 1e-10:
            print(80 * "-")
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            print(ck.get_code())
            print(80 * "-")
            print(var_values)
            print(80 * "-")
            print(repr(expr))
            print(80 * "-")
            print(expr)
            print(80 * "-")
            # Fail explicitly rather than via `1 / 0`, which reported an
            # unrelated ZeroDivisionError.
            raise AssertionError(
                "loopy result deviates from reference: rel error=%g" % err)
def test_wg_too_small(ctx_factory):
    """Expect ``RuntimeError`` from code generation for every schedule.

    The kernel iterates ``i`` over 100 values tagged ``l.0`` while
    ``local_sizes`` fixes axis 0 to 16.
    """
    import pytest

    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] {id=copy}"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))],
        local_sizes={0: 16})
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, scheduled).get_code()
def test_multi_cse(ctx_factory):
    """Print code for every schedule of ``a[i] + a[i]**2`` with a prefetch.

    ``i`` is split by 16 with the inner part tagged ``l.0``, and a prefetch
    of ``a`` over no inames is added before preprocessing.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] + a[i]**2"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))],
        local_sizes={0: 16})
    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
    kernel = lp.add_prefetch(kernel, "a", [])
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(lp.CompiledKernel(ctx, scheduled).get_code())
def get_2d_knl(context, dtype):
    """Build and compile a kernel that fills 1D arrays ``x`` and ``y``.

    For each ``i`` the angle ``phi = 2*M_PI/n * i`` is formed and
    ``x[i]``/``y[i]`` are set to fixed combinations of sines and cosines of
    multiples of ``phi``.

    :arg context: handed to :class:`lp.CompiledKernel` unchanged.
    :arg dtype: element type of the global arrays ``x`` and ``y``.
    :returns: the :class:`lp.CompiledKernel` built from the split kernel.
    """
    instructions = """
        <> phi = 2*M_PI/n * i
        x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
        y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
        """
    kernel_data = [
        lp.GlobalArg("x,y", dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel("{[i]: 0<=i<n}", instructions, kernel_data)

    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    return lp.CompiledKernel(context, kernel)
def test_assume(ctx_factory):
    """Check that the generated code contains no ``if`` given assumptions.

    ``i`` is split by 16 and the kernel is told that ``n mod 16 = 0`` and
    ``n > 10``; each schedule and its code are printed before the check.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])
    kernel = lp.split_iname(kernel, "i", 16)
    kernel = lp.set_loop_priority(kernel, "i_outer,i_inner")

    # Apply the two assumptions in the original order.
    for assumption in ("n mod 16 = 0", "n > 10"):
        kernel = lp.assume(kernel, assumption)

    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        compiled = lp.CompiledKernel(ctx, scheduled)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_arg_shape_guessing(ctx_factory):
    """Print the kernel and its code with all array shapes set to ``lp.auto``."""
    ctx = ctx_factory()

    instructions = """
        a = 1.5 + sum((i,j), i*j)
        b[i, j] = i*j
        c[i+j, j] = b[j,i]
        """
    kernel_data = [
        lp.GlobalArg(name, shape=lp.auto) for name in ("a", "b", "c")
    ]
    kernel_data.append(lp.ValueArg("n"))

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        instructions,
        kernel_data,
        assumptions="n>=1")

    print(kernel)

    compiled = lp.CompiledKernel(ctx, kernel)
    print(compiled.get_highlighted_code())
def test_write_parameter(ctx_factory):
    """Expect ``RuntimeError`` when an instruction assigns to ``n``.

    ``n`` is both the domain parameter (declared as a ``ValueArg``) and the
    left-hand side of the instruction ``n = 15``.
    """
    import pytest

    float_dtype = np.float32
    ctx = ctx_factory()

    instructions = """
        a = sum((i,j), i*j)
        b = sum(i, sum(j, i*j))
        n = 15
        """
    kernel_data = [
        lp.GlobalArg("a", float_dtype, shape=()),
        lp.GlobalArg("b", float_dtype, shape=()),
        lp.ValueArg("n", np.int32, approximately=1000),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        instructions,
        kernel_data,
        assumptions="n>=1")

    with pytest.raises(RuntimeError):
        lp.CompiledKernel(ctx, kernel).get_code()
def get_3d_knl(context, dtype):
    """Build and compile a kernel that fills ``x``, ``y`` and ``z`` in 3D.

    Each index is scaled to ``idx/(n-1)``.  The point is rotated in the
    first two coordinates by the fixed angle ``phi = 0.3``, then in the
    first and third by ``theta = 0.7``, and finally scaled by 4 and shifted
    by -2.

    :arg context: handed to :class:`lp.CompiledKernel` unchanged.
    :arg dtype: element type of the global arrays ``x``, ``y`` and ``z``.
    :returns: the :class:`lp.CompiledKernel` built from the split kernel.
    """
    instructions = """
        <> xx = i/(n-1)
        <> yy = j/(n-1)
        <> zz = k/(n-1)

        <float64> phi = 0.3
        <> s1 = sin(phi)
        <> c1 = cos(phi)

        <> xxx = c1*xx + s1*yy
        <> yyy = -s1*xx + c1*yy
        <> zzz = zz

        <float64> theta = 0.7
        <> s2 = sin(theta)
        <> c2 = cos(theta)

        x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
        y[i,j,k] = 4 * yyy - 2
        z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
        """
    kernel_data = [
        lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        instructions,
        kernel_data,
        assumptions="n>0")

    # Only "j" and "k" are split (chunks of 16); "i" is left untouched.
    for iname, axis in (("j", 1), ("k", 0)):
        kernel = lp.split_iname(
            kernel, iname, 16,
            outer_tag="g.%d" % axis, inner_tag="l.%d" % axis)

    return lp.CompiledKernel(context, kernel)
def test_dependent_loop_bounds_3(ctx_factory):
    """Check a domain dependency mediated only by the ``row_len`` temporary.

    The ``jj`` domain must have the ``i`` domain as its parent.  Code is
    printed for the kernel with ``i`` split by 128; additionally splitting
    ``jj`` onto ``g.1``/``l.1`` is expected to fail with ``RuntimeError``.
    """
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    knl = lp.make_kernel(
        [
            "{[i]: 0<=i<n}",
            "{[jj]: 0<=jj<row_len}",
        ],
        [
            "<> row_len = a_row_lengths[i]",
            "a[i,jj] = 1",
        ],
        [
            lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
            lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
            lp.ValueArg("n", np.int32),
        ])

    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    print("---------------------------------------------------")
    print(cknl.get_highlighted_code())
    print("---------------------------------------------------")

    knl_bad = lp.split_iname(
        knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    # Previously the good kernel was preprocessed (result unused) while the
    # un-preprocessed knl_bad was scheduled.  Preprocess the kernel that is
    # actually scheduled, inside the block so that a failure at either
    # stage counts.
    with pytest.raises(RuntimeError):
        knl_bad = lp.preprocess_kernel(knl_bad, ctx.devices[0])
        list(lp.generate_loop_schedules(knl_bad))