def test_to_batched_temp(ctx_factory):
    """Check that lp.to_batched leaves a scalar private temporary un-batched.

    The batched kernel must still produce the same result as a batched
    reference kernel that inlines the constant.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' cnst = 2.0
        out[i] = sum(j, cnst*a[i,j]*x[j])''',
        [lp.TemporaryVariable(
            "cnst", dtype=np.float32, shape=(),
            scope=lp.temp_var_scope.PRIVATE), '...'])
    knl = lp.add_and_infer_dtypes(knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    ref_knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(
            bref_knl, ctx, bknl,
            parameters=dict(a=a, x=x, n=5, nbatches=7))
def test_to_batched(ctx_factory):
    """Compare lp.to_batched output against a hand-written batched kernel."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    # Reference: batching written out explicitly with an extra iname k.
    ref_knl = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
        '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Running both the kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # checking that the outputs are same
    # NOTE(review): 1e-15 is far below float32 resolution, so this in effect
    # requires bitwise-identical results -- confirm that is intended.
    assert np.linalg.norm(out1-out2) < 1e-15
def test_unschedulable_kernel_detection():
    """Verify detection of unschedulable kernels and that every offered
    iname-duplication option actually makes the kernel schedulable."""
    knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                         """
                         mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
                         mat2[j] = mat2[j] + 1 {inames=j, id=i2}
                         mat3[i] = mat3[i] + 1 {inames=i, id=i3}
                         """)
    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert lp.needs_iname_duplication(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 4

    # Every proposed duplication must resolve the problem.
    for inames, insns in lp.get_iname_duplication_options(knl):
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert not lp.needs_iname_duplication(fixed_knl)

    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
                         mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
                         mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
                         mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
                         """)

    assert lp.needs_iname_duplication(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10
def test_arg_shape_uses_assumptions(ctx_factory):
    """Shape inference for 'out' must succeed given the n>=1 assumption."""
    # If arg shape determination does not use assumptions, then it won't find a
    # static shape for out, which is at least 1 x 1 in size, but otherwise of
    # size n x n.
    lp.make_kernel(
        "{ [i,j]: 0<=i,j<n }",
        """
        out[i,j] = 2*a[i,j]
        out[0,0] = 13.0
        """, assumptions="n>=1")
def test_fusion():
    """Smoke-test lp.fuse_kernels on two kernels sharing parameter n."""
    exp_kernel = lp.make_kernel(
        ''' { [i]: 0<=i<n } ''',
        ''' exp[i] = pow(E, z[i])''',
        assumptions="n>0")

    sum_kernel = lp.make_kernel(
        '{ [j]: 0<=j<n }',
        'out2 = sum(j, exp[j])',
        assumptions='n>0')

    knl = lp.fuse_kernels([exp_kernel, sum_kernel])

    print(knl)
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel, turn the flux
    assignment into a substitution, and precompute it for the GPU."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
        ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
def test_op_counter_logic():
    """Count arithmetic ops in a kernel containing logical/ternary expressions."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
def test_tim2d(ctx_factory):
    """2D spectral-element Laplacian (semlap2D): transformed kernel vs.
    sequential reference, with a GFlops estimate."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",
                "lap[e,i,j] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32  # NOTE(review): currently unused in this function

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None)  # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def pick_knl():
    """Build a resampling kernel that gathers nodes via pick_list and
    element index maps; split/tag inames for GPU execution."""
    import loopy as lp
    knl = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes}""",
        "result[to_element_indices[k], i] \
            = vec[from_element_indices[k], pick_list[i]]",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("vec", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            lp.ValueArg("n_from_nodes", np.int32),
            "...",
        ],
        name="resample_by_picking",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(knl, dict(k="g.0"))
def test_gmem_access_counter_logic():
    """Count global-memory loads/stores in a kernel with a ternary expression."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f32 = poly[
            (np.dtype(np.float32), 'uniform', 'load')
            ].eval_with_dict(params)
    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'load')
            ].eval_with_dict(params)
    # g is loaded in both branches; h only in the else branch
    assert f32 == 2*n*m
    assert f64 == n*m

    f64 = poly[
            (np.dtype(np.float64), 'uniform', 'store')
            ].eval_with_dict(params)
    assert f64 == n*m
def test_op_counter_triangular_domain():
    """Op count over a triangular (i<j) domain: exact when isl provides
    BasicSet.card, otherwise a documented over-count fallback."""
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    # NOTE(review): the initial assignment is redundant -- both the except
    # and else branches below overwrite it.
    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
    value_dict = dict(m=13, n=200)
    flops = poly.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
def test_op_counter_basic():
    """Basic op counting (old get_op_poly API) over mixed f32/f64 instructions."""
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*l
    assert f64mul == n*m
    # i32 adds come from the k+1 index arithmetic (two occurrences)
    assert i32add == n*m*2
def get_tensor(ctx):
    """Build a rank-r CP tensor reconstruction kernel:
    res[i,j,k] = sum_alpha u[alpha,i]*v[alpha,j]*w[alpha,k]."""
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
        ], [
        "res[i,j,k]=sum((alpha), u[alpha,i]*v[alpha,j]*w[alpha,k])",
        ], [
        lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int32),
        lp.ValueArg("r", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")
    return knl
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    """Matrix multiply with a pre-MAGMA Fermi-style register-blocked
    (ILP) transformation; verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 6*16*2

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 2
    j_reg = 2
    i_chunks = 16
    j_chunks = 16
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def LU_solver(ctx):
    """Build a kernel solving LU x = b for r right-hand sides (columns of
    bcopy) via forward elimination then back substitution, in place."""
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[l,k,i,j,m]: 0<=l<r and 0<=k<n-1 and k+1<=i<n and 0<=j<n-1 and 0<=m<n-1-j}",
        ], [
        # forward elimination, then back substitution (dep-chained via ids)
        "bcopy[i,l] = bcopy[i,l]-bcopy[k,l]*LU[i,k] {id=lab1}",
        "bcopy[n-1-j,l]=bcopy[n-j-1,l]/LU[n-j-1,n-1-j] {id=l2, dep=lab1}",
        "bcopy[m,l]= bcopy[m,l]-bcopy[n-j-1,l]*LU[m,n-1-j] {id=l3, dep =l2}",
        "bcopy[0,l]=bcopy[0,l]/LU[0,0]{id=l4, dep=l2}",
        ], [
        lp.GlobalArg("LU", dtype, shape="n, n", order=order),
        lp.GlobalArg("bcopy", dtype, shape="n, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "k", 1)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32, outer_tag="g.0", inner_tag="l.0")
    # print knl
    # print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
def test_small_batched_matvec(ctx_factory):
    """Batched small matvec with prefetching and array padding to an
    alignment boundary; verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    K = 9997  # noqa
    Np = 36  # noqa

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            [
                "result[k, i] = sum(j, d[i, j]*f[k, j])"
                ],
            [
                lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
                lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
                lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="batched_matvec", assumptions="K>=1")

    seq_knl = knl

    align_bytes = 64
    knl = lp.add_prefetch(knl, 'd[:,:]')
    pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
    knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
    knl = lp.add_padding(knl, "f", 0, align_bytes)

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
            parameters=dict(K=K))
def test_op_counter_bitwise():
    """Count bitwise, shift, and integer arithmetic ops (old get_op_poly API)."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    assert i32add == n*m+n*m*l
    assert i32bw == 2*n*m*l
    assert i64bw == 2*n*m
    assert i64add == i64mul == n*m
    assert i64shift == 2*n*m
def LU_decomposition(ctx):
    """Build an in-place LU decomposition kernel on syst (float32 variant).

    NOTE(review): `n` in lp.split_iname(knl, "k", n) is not defined in this
    function, so calling it raises NameError -- the intended chunk size
    needs to be confirmed (likely a fixed block size or a module global).
    """
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
        "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
        ], [
        "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
        "syst[l,j]= syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
        ], [
        lp.GlobalArg("syst", dtype, shape="n, n", order=order),
        lp.ValueArg("n", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "k", n)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32)
    # print knl
    # print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
def test_fancy_matrix_mul(ctx_factory):
    """Tiled matrix multiply with prefetching of a and b; verified against
    the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ], name="fancy_matmul", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters=dict(n=n))
def test_variable_size_matrix_mul(ctx_factory):
    """Matrix multiply with runtime-variable size n, tiled with slabs and
    prefetching; verified against the untransformed kernel."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
def test_transpose(ctx_factory):
    """Tiled matrix transpose with prefetching; verified against the
    untransformed kernel, labeled in GByte moved."""
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
            name="transpose")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"],
            parameters={})
def test_plain_matrix_mul(ctx_factory):
    """Tiled matrix multiply for both float4-vectorized and scalar float32
    dtypes; verified against the untransformed kernel."""
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    for dtype, check, vec_size in [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]:
        knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                [
                    "c[i, j] = sum(k, a[i, k]*b[k, j])"
                    ],
                [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                    ],
                name="matmul")

        ref_knl = knl

        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

        lp.auto_test_vs_ref(ref_knl, ctx, knl,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
def Prav_V(ctx):
    """Build the right-hand-side kernel for the V factor of a CP/ALS step:
    f[alpha,j] = sum_{k,i} a[i,j,k]*w[alpha,k]*u[alpha,i]."""
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
        ], [
        "f[alpha,j]=sum((k,i), a[i,j,k]*w[alpha, k]*u[alpha, i])",
        ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def kproj():
    """Build an element-projection kernel: accumulate a weighted basis dot
    product of each source element into the target element's coefficient."""
    import loopy as lp
    knl = lp.make_kernel([
        "{[k]: 0 <= k < nelements}",
        "{[j]: 0 <= j < n_from_nodes}"
        ],
        """
        for k
            <> element_dot = \
                sum(j, vec[from_element_indices[k], j] * \
                basis[j] * weights[j])

            result[to_element_indices[k], ibasis] = \
                result[to_element_indices[k], ibasis] + element_dot
        end
        """,
        [
            lp.GlobalArg("vec", None,
                shape=("n_from_elements", "n_from_nodes")),
            lp.GlobalArg("result", None,
                shape=("n_to_elements", "n_to_nodes")),
            lp.GlobalArg("basis", None,
                shape="n_from_nodes"),
            lp.GlobalArg("weights", None,
                shape="n_from_nodes"),
            lp.ValueArg("n_from_elements", np.int32),
            lp.ValueArg("n_to_elements", np.int32),
            lp.ValueArg("n_to_nodes", np.int32),
            lp.ValueArg("ibasis", np.int32),
            '...'
            ],
        name="conn_projection_knl",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    return knl
def test_sum_factorization():
    """Attempt sum factorization via extract_subst on a tensor-product
    quadrature kernel; currently expected to fail (xfail)."""
    knl = lp.make_kernel(
        "{[i,j,ip,jp,k,l]: "
        "0<=i<I and 0<=j<J and 0<=ip<IP and 0<=jp<JP and 0<=k,l<Q}",
        """
        phi1(i, x) := x**i
        phi2(i, x) := x**i
        psi1(i, x) := x**i
        psi2(i, x) := x**i
        a(x, y) := 1

        A[i,j,ip,jp] = sum(k,sum(l,
            phi1(i,x[0,k]) * phi2(j,x[1,l])
            * psi1(ip, x[0,k]) * psi2(jp, x[1, l])
            * w[0,k] * w[1,l]
            * a(x[0,k], x[1,l])
        ))
        """)

    pytest.xfail("extract_subst is currently too stupid for sum factorization")

    knl = lp.extract_subst(knl, "temp_array",
            "phi1(i,x[0,k]) *psi1(ip, x[0,k]) * w[0,k]")
    knl = lp.extract_subst(knl, "temp_array",
            "sum(k, phi1(i,x[0,k]) *psi1(ip, x[0,k]) * w[0,k])")

    print(knl)
def test_op_counter_specialops():
    """Count special ops (modulo, power) alongside ordinary arithmetic."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    # float modulo counts as a division, hence 2 divs per c-instruction
    assert f32div == 2*n*m*l
    assert f32mul == f32add == n*m*l
    assert f64add == 2*n*m
    assert f64pow == i32add == n*m
def test_gmem_access_counter_bitwise():
    """Count global-memory loads/stores for a kernel of bitwise operations."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    i32 = poly[
            (np.dtype(np.int32), 'uniform', 'load')
            ].eval_with_dict(params)
    assert i32 == 4*n*m+2*n*m*l

    i32 = poly[
            (np.dtype(np.int32), 'uniform', 'store')
            ].eval_with_dict(params)
    assert i32 == n*m+n*m*l
def left_W(ctx):
    """Build the Gram-matrix kernel for the W factor of a CP/ALS step:
    l[alpha,alpha1] = (u_alpha . u_alpha1) * (v_alpha . v_alpha1)."""
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}",
        ], [
        "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])*sum((j),v[alpha,j]*v[alpha1,j])",
        ], [
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("l", dtype, shape="r, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "i", 16)
    return knl
def test_forced_iname_deps_and_reduction():
    """Regression test: forced_iname_deps must keep the reduction update
    instruction out of the 'i' loop."""
    # See https://github.com/inducer/loopy/issues/24

    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.
    i1 = lp.CInstruction("i",
            "doSomethingToGetPhi();",
            assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    i2 = lp.Assignment("a",
            lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
            forced_iname_deps=frozenset(),
            forced_iname_deps_is_final=True)

    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
            [i1, i2],
            [
                lp.GlobalArg("a", dtype=np.float32, shape=()),
                lp.ValueArg("n", dtype=np.int32),
                lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
                ],
            target=lp.CTarget(),
            )

    k = lp.preprocess_kernel(k)

    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
def test_atomic(ctx_factory, dtype):
    """Atomic accumulation into out[i%20], compared against the
    untransformed kernel; skips devices lacking the required atomics."""
    ctx = ctx_factory()

    if (
            np.dtype(dtype).itemsize == 8
            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
        pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if (
            cl.version.VERSION < (2015, 2)
            and dtype == np.int64):
        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
            [
                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
                lp.GlobalArg("a", dtype, shape=lp.auto),
                "..."
                ],
            assumptions="n>0")

    ref_knl = knl
    knl = lp.split_iname(knl, "i", 512)
    knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
def test_generate_c_snippet():
    """Generate a C body (CTarget) for a pair of reduction instructions
    built programmatically from pymbolic expressions."""
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel("{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I] * u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
            ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
            ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def test_funny_shape_matrix_mul(ctx_factory):
    """Non-square matrix multiply (n x m times m x ell), tiled and with
    precomputed access substitutions; verified against the reference."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,ell >= 1")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
    })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
                        op_count=[2 * n**3 / 1e9],
                        op_label=["GFlops"],
                        parameters={
                            "n": n,
                            "m": m,
                            "ell": ell
                        })
def test_simplify_indices(ctx_factory):
    """After inlining a callee, simplify_indices must remove all floor-div
    index expressions without changing results."""
    ctx = ctx_factory()
    twice = lp.make_function(
            "{[i, j]: 0<=i<10 and 0<=j<4}",
            """
            y[i,j] = 2*x[i,j]
            """, name="zerozerozeroonezeroify")

    knl = lp.make_kernel(
            "{:}",
            """
            Y[:,:] = zerozerozeroonezeroify(X[:,:])
            """, [lp.GlobalArg("X,Y",
                               shape=(10, 4),
                               dtype=np.float64)])

    # Detects whether an expression contains any floor-division node.
    class ContainsFloorDiv(lp.symbolic.CombineMapper):
        def combine(self, values):
            return any(values)

        def map_floor_div(self, expr):
            return True

        def map_variable(self, expr):
            return False

        def map_constant(self, expr):
            return False

    knl = lp.merge([knl, twice])
    knl = lp.inline_callable_kernel(knl, "zerozerozeroonezeroify")
    simplified_knl = lp.simplify_indices(knl)

    contains_floordiv = ContainsFloorDiv()

    # The inlined (unsimplified) kernel must contain a floor-div ...
    assert any(
            contains_floordiv(insn.expression)
            for insn in knl.default_entrypoint.instructions
            if isinstance(insn, lp.MultiAssignmentBase))

    # ... and simplification must have removed all of them.
    assert all(
            not contains_floordiv(insn.expression)
            for insn in simplified_knl.default_entrypoint.instructions
            if isinstance(insn, lp.MultiAssignmentBase))

    lp.auto_test_vs_ref(knl, ctx, simplified_knl)
def test_op_counter_basic():
    """Basic op counting (get_op_map API) with subgroup-granularity counts."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl,
            dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                           count_within_subscripts=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n * m * ell * n_subgroups
    assert f64mul == n * m * n_subgroups
    assert i32add == n * m * 2 * n_subgroups
def test_image_matrix_mul(ctx_factory):
    """Matrix multiply reading a and b from OpenCL images; skipped when the
    device lacks image support or the required image format."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
    if image_format not in cl.get_supported_image_formats(
            ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
        pytest.skip("image format not supported")

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    # conflict-free
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={}, print_ref_code=True)
def test_mem_access_counter_reduction():
    """Count global loads/stores (and bytes) for a matmul reduction."""
    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    mem_map = lp.get_mem_access_map(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32l = mem_map[lp.MemAccess('global', np.float32,
                                stride=0, direction='load',
                                variable='a')].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                                 stride=0, direction='load',
                                 variable='b')].eval_with_dict(params)
    assert f32l == 2 * n * m * l

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                stride=0, direction='store',
                                variable='c')].eval_with_dict(params)
    assert f32s == n * l

    # Cross-check byte counts: 4 bytes per float32 access.
    ld_bytes = mem_map.filter_by(mtype=['global'],
                                 direction=['load']).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'],
                                 direction=['store']).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4 * f32l
    assert st_bytes == 4 * f32s
def get_tensor(ctx):
    """Build a rank-r CP tensor reconstruction kernel (float64 variant,
    factors stored transposed, shape (n, r)):
    res[i,j,k] = sum_alpha u[i,alpha]*v[j,alpha]*w[k,alpha]."""
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
        ], [
        "res[i,j,k]=sum((alpha), u[i,alpha]*v[j,alpha]*w[k,alpha])",
        ], [
        lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="n, r", order=order),
        lp.GlobalArg("u", dtype, shape="n, r", order=order),
        lp.GlobalArg("w", dtype, shape="n, r", order=order),
        lp.ValueArg("n", np.int32),
        lp.ValueArg("r", np.int32),
        ],
        assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")
    return knl
def test_fancy_matrix_mul(ctx_factory):
    """Tiled matrix multiply using explicit fetch_outer_inames on the
    prefetches; verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel("[n] -> {[i,j,k]: 0<=i,j,k<n }",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         [
                             lp.GlobalArg("a", dtype, shape="(n, n)",
                                          order=order),
                             lp.GlobalArg("b", dtype, shape="(n, n)",
                                          order=order),
                             lp.GlobalArg("c", dtype, shape="(n, n)",
                                          order=order),
                             lp.ValueArg("n", np.int32, approximately=1000),
                         ],
                         name="fancy_matmul",
                         assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["i_inner", "k_inner"],
                          fetch_outer_inames="i_outer, j_outer, k_outer",
                          default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["k_inner", "j_inner"],
                          fetch_outer_inames="i_outer, j_outer, k_outer",
                          default_tag="l.auto")

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
                        op_count=[2 * n**3 / 1e9],
                        op_label=["GFlops"],
                        parameters=dict(n=n))
def LU_decomposition(ctx):
    # In-place LU factorization without pivoting: instruction lab1 scales
    # the subdiagonal column by the pivot, the dependent instruction
    # updates the trailing submatrix.
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
        "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
        ], [
        "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
        "syst[l,j]= syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
        ], [
        lp.GlobalArg("syst", dtype, shape="n, n", order=order),
        lp.ValueArg("n", np.int32),
        ], assumptions="n>=1")
    # NOTE(review): `n` is not defined in this function's scope, so this
    # call raises NameError when executed. A concrete split size (or fixing
    # the kernel parameter first) was presumably intended -- confirm.
    knl = lp.split_iname(knl, "k", n)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32)
    # print knl
    # print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
def test_callee_with_auto_offset(ctx_factory):
    """A callee declared with offset=lp.auto must pick up the caller
    argument's offset of 3."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    doubler = lp.make_function(
            "{[i]: 0<=i<7}",
            """
            y[i] = 2*y[i]
            """,
            [lp.GlobalArg("y", offset=lp.auto)],
            name="dosify")

    caller = lp.make_kernel(
            "{[i]: 0<=i<7}",
            """
            [i]: y[i] = dosify([i]: y[i])
            """,
            [lp.GlobalArg("y", offset=3, shape=10)])

    prog = lp.merge([caller, doubler])

    data = np.arange(10)
    prog(queue, y=data)

    # Entries before the offset are untouched; the rest are doubled.
    np.testing.assert_allclose(data[:3], np.arange(3))
    np.testing.assert_allclose(data[3:], 2 * np.arange(3, 10))
def get_kernel(self):
    """Assemble the full interaction-evaluation loopy kernel.

    The kernel source is built from: scaling assignments, a per-dimension
    distance computation, an optional self-interaction flag, the
    per-kernel instruction lists, and one scaled result write per kernel.
    Returns the transformed (idim unrolled) loopy kernel.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    # One result array per value dtype, indexed (target, source).
    arguments = (self.get_default_src_tgt_arguments() + [
        lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
        for i, dtype in enumerate(self.value_dtypes)
    ])
    loopy_knl = lp.make_kernel(
        [
            """
            {[itgt, isrc, idim]: \
            0 <= itgt < ntargets and \
            0 <= isrc < nsources and \
            0 <= idim < dim}
            """
        ],
        # Concatenate the instruction groups; is_self is only emitted when
        # self-interactions are excluded.
        self.get_kernel_scaling_assignments() + ["for itgt, isrc"] +
        ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + [
            "<> is_self = (isrc == target_to_source[itgt])"
            if self.exclude_self else ""
        ] + loopy_insns + kernel_exprs + [
            """
            result_{i}[itgt, isrc] = \
            knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}}
            """.format(i=iknl) for iknl in range(len(self.kernels))
        ] + ["end"],
        arguments,
        assumptions="nsources>=1 and ntargets>=1",
        name=self.name,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)
    # Unroll every idim-derived iname; dim is a small fixed parameter.
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    # Let each kernel apply its own kernel-specific transformations.
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)
    return loopy_knl
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    """Run a segmented inclusive sum-scan and compare against a reference.

    segflag[j] == 1 marks the start of a new segment: the scan restarts
    its accumulator at those positions.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices, )] = 1

    # The second assignee of the segmented reduce (the carried flag) is
    # discarded via the dummy temporary <>_.
    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        [
            lp.GlobalArg("arr", np.float32, shape=("n", )),
            lp.GlobalArg("segflag", np.int32, shape=("n", )),
            "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))
    # force_scan=True: require realize_reduction to emit a scan
    # implementation instead of a plain reduction.
    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out, )) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
def test_kc_with_floor_div_in_expr(ctx_factory, inline):
    """Floor-division/modulo in a callee argument index must round-trip
    correctly. See https://github.com/inducer/loopy/issues/366."""
    import loopy as lp
    ctx = ctx_factory()

    update_fn = lp.make_function(
            "{[i]: 0<=i<10}",
            """
            x[i] = 2*x[i]
            """,
            name="callee_with_update")

    prog = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i])
            """)

    prog = lp.merge([prog, update_fn])

    if inline:
        prog = lp.inline_callable_kernel(prog, "callee_with_update")

    # Self-comparison: verifies the program compiles and runs consistently.
    lp.auto_test_vs_ref(prog, ctx, prog)
def test_fd_demo(): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") #assumptions="n mod 16=0") knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"], fetch_bounding_box=True) #n = 1000 #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) knl = lp.set_options(knl, write_cl=True) knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) code, inf = lp.generate_code(knl) print(code) assert "double" not in code
def test_ispc_target(occa_mode=False):
    """Code-generation smoke test for the ISPC target (optionally in OCCA
    mode): split/ILP the loop, add a prefetch, print device and host code."""
    from loopy.target.ispc import ISPCTarget

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            [
                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                "..."
                ],
            target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    # Preprocess and schedule explicitly before code generation.
    codegen_result = lp.generate_code_v2(
                lp.get_one_scheduled_kernel(
                    lp.preprocess_kernel(knl)))
    print(codegen_result.device_code())
    print(codegen_result.host_code())
def test_nested_dependent_reduction(ctx_factory):
    """A reduction whose loop bound depends on data (ell[i]) computed by a
    preceding instruction in the same kernel."""
    result_dtype = np.dtype(np.int32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            ["{[i]: 0<=i<n}",
             "{[j]: 0<=j<i+sumlen}"],
            [
                "<> sumlen = ell[i]",
                "a[i] = sum(j, j)",
                ],
            [
                lp.ValueArg("n", np.int32),
                lp.GlobalArg("a", result_dtype, ("n", )),
                lp.GlobalArg("ell", np.int32, ("n", )),
                ])

    compiled = lp.CompiledKernel(ctx, knl)

    n = 330
    lengths = np.arange(n, dtype=np.int32)
    evt, (result, ) = compiled(queue, ell=lengths, n=n, out_host=True)

    # With sumlen == ell[i] == i, entry i is sum_{j=0}^{2i-1} j,
    # i.e. the closed form i*(2i - 1).
    expected = lengths * (2 * lengths - 1)
    assert (result == expected).all()
def test_ispc_streaming_stores():
    """Check that instructions tagged '!streaming_store' produce a
    streaming_store(...) call in the generated ISPC code."""
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = b[i] + scalar * c[i]",
            target=lp.ISPCTarget(), index_dtype=index_dtype,
            name="stream_triad")

    # Renamed from "vars": don't shadow the builtin of that name.
    arg_names = ["a", "b", "c", "scalar"]

    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(knl,
            {var: stream_dtype for var in arg_names})

    knl = lp.set_argument_order(knl, arg_names + ["n"])

    # Generate once and reuse: the original generated the code twice and
    # discarded the first result.
    code = lp.generate_code_v2(knl).all_code()
    assert "streaming_store(" in code
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    """A scan whose sweep iname i sits inside a parallel (l.0) outer
    iname k; each lane computes the same cumulative sum offset by k."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[k]: 0<=k<=1}",
            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
        ],
        "out[k,i] = k + sum(j, j**2)"
    )
    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))

    n = 10
    knl = lp.fix_parameters(knl, n=n)
    # force_scan=True: emit a scan implementation for the triangular sum.
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue)

    # Row k of out is k + cumulative sum of squares.
    inner = np.cumsum(np.arange(n)**2)
    assert (out.get() == np.array([inner, 1 + inner])).all()
def knl():
    """Build the resampling-matrix scatter kernel: writes resample_mat
    blocks into `result` at per-element target/source offsets."""
    import loopy as lp
    knl = lp.make_kernel(
        """{[k,i,j]:
            0<=k<nelements and
            0<=i<n_to_nodes and
            0<=j<n_from_nodes}""",
        "result[itgt_base + to_element_indices[k]*n_to_nodes + i, \
                isrc_base + from_element_indices[k]*n_from_nodes + j] \
                = resample_mat[i, j]",
        [
            lp.GlobalArg("result", None,
                shape="nnodes_tgt, nnodes_src",
                offset=lp.auto),
            lp.ValueArg("itgt_base,isrc_base", np.int32),
            lp.ValueArg("nnodes_tgt,nnodes_src", np.int32),
            "...",
            ],
        name="oversample_mat")

    # i (node within element) goes onto the local axis, k (element) onto
    # the group axis; j remains sequential.
    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(knl, dict(k="g.0"))
def test_c_instruction(ctx_factory):
    """Embed a raw C instruction (CInstruction) that assigns the
    temporary x, then consume x from a regular loopy assignment."""
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n }",
            [
                lp.CInstruction("i,j", """
                    x = sin((float) i*j);
                    """, assignees="x"),
                "a[i,j] = x",
                ],
            [
                lp.GlobalArg("a", shape=lp.auto, dtype=np.float32),
                lp.TemporaryVariable("x", np.float32),
                "...",
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    print(knl)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
def Prav_V(ctx):
    # Right-hand side for the V factor in an ALS-style tensor
    # decomposition:
    #   f[alpha,j] = sum_{k,i} a[i,j,k] * w[k,alpha] * u[i,alpha]
    # (uses the old-style make_kernel signature that takes a device)
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
        ], [
        "f[alpha,j]=sum((k,i), a[i,j,k]*w[k,alpha]*u[i,alpha])",
        ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("u", dtype, shape="n, r", order=order),
        lp.GlobalArg("w", dtype, shape="n, r", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
        ], assumptions="n>=1")
    # Output axes j/alpha get group+local tags; reduction inames i/k are
    # only split (no parallel tag).
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def test_double_sum_made_unique(ctx_factory):
    """make_reduction_inames_unique must disambiguate the i/j reduction
    inames shared by the two statements; both must still produce the same
    double sum."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n }",
            [
                "a = sum((i,j), i*j)",
                "b = sum(i, sum(j, i*j))",
                ],
            assumptions="n>=1")

    knl = lp.make_reduction_inames_unique(knl)
    print(knl)

    evt, (a, b) = knl(queue, n=n)

    # Host-side reference for sum_{i,j} i*j over [0, n) x [0, n).
    expected = sum(row * col
            for row in range(n)
            for col in range(n))

    assert a.get() == expected
    assert b.get() == expected
def test_nested_scan(ctx_factory, i_tag, j_tag):
    """A scan (over j) whose input tmp[] is itself produced by another
    scan-realized reduction (over k) with the same triangular bound."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    # force_scan=True: both triangular reductions become scans.
    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    # No assertion here: this is a smoke test that the nested scans
    # compile and execute.
    evt, (out, ) = knl(queue)

    print(out)
def Prav_U(ctx):
    # Right-hand side for the U factor in an ALS-style tensor
    # decomposition:
    #   f[alpha,i] = sum_{j,k} a[i,j,k] * v[alpha,j] * w[alpha,k]
    # (uses the old-style make_kernel signature that takes a device)
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
        ], [
        "f[alpha,i]=sum((j,k), a[i,j,k]*v[alpha,j]*w[alpha,k])",
        ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
        ], assumptions="n>=1")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 1, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)
    # Fixed: was a Python 2 print statement ("print lp.CompiledKernel..."),
    # which is a SyntaxError under Python 3 (the rest of this file uses
    # parenthesized print calls).
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
    return knl
def test_scan_with_different_lower_bound_from_sweep(ctx_factory,
        sweep_lbound, scan_lbound):
    """Scan where the sweep iname and the scan iname start at different
    (parametric) lower bounds; the scan advances by 2 per sweep step."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """)

    n = 10

    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound,
            scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (out, ) = knl(queue, n=n)

    # Reference: every other entry of the cumulative sum of squares
    # starting at scan_lbound (the scan bound grows by 2 per i).
    assert (out.get() == np.cumsum(
            np.arange(scan_lbound, 2 * n + scan_lbound)**2)[::2]).all()
def test_argmax(ctx_factory):
    """argmax reduction returning both value and index, checked against
    numpy on random data."""
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    knl = lp.make_kernel(
            "{[i]: 0<=i<%d}" % n,
            """
            max_val, max_idx = argmax(i, abs(a[i]), i)
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    print(lp.preprocess_kernel(knl))
    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

    # Use n (and dtype) instead of repeating the literal 10000, keeping
    # the data size tied to the kernel's domain.
    a = np.random.randn(n).astype(dtype)
    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
    assert max_val == np.max(np.abs(a))
    assert max_idx == np.where(np.abs(a) == max_val)[-1]
def test_non1_step_slices(ctx_factory, start, inline):
    """Call a callee kernel through strided slices with step 3 and -3.

    See https://github.com/inducer/loopy/pull/222#discussion_r645905188
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    callee = lp.make_function(
            "{[i]: 0<=i<n}",
            """
            y[i] = i**2
            """,
            [lp.ValueArg("n"), ...],
            name="squared_arange")

    # The slice extents are computed host-side (len(range(...))) and baked
    # into the kernel source as literals.
    t_unit = lp.make_kernel(
            "{[i_init, j_init]: 0<=i_init, j_init<40}",
            f"""
            X[i_init] = 42
            X[{start}:40:3] = squared_arange({len(range(start, 40, 3))})
            Y[j_init] = 1729
            Y[39:{start}:-3] = squared_arange({len(range(39, start, -3))})
            """,
            [lp.GlobalArg("X,Y", shape=40)],
            seq_dependencies=True)

    # numpy reference: fill values, then overwrite the strided slices.
    expected_out1 = 42 * np.ones(40, dtype=np.int64)
    expected_out1[start:40:3] = np.arange(len(range(start, 40, 3)))**2

    expected_out2 = 1729 * np.ones(40, dtype=np.int64)
    expected_out2[39:start:-3] = np.arange(len(range(39, start, -3)))**2

    t_unit = lp.merge([t_unit, callee])

    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")

    evt, out_dict = t_unit(cq)

    np.testing.assert_allclose(out_dict["X"].get(), expected_out1)
    np.testing.assert_allclose(out_dict["Y"].get(), expected_out2)
def test_rename_argument_with_auto_stride(ctx_factory):
    """rename_argument must carry an auto-stride dim tag over to the new
    argument name in the generated device signature."""
    from loopy.kernel.array import FixedStrideArrayDimTag

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0<=i<10}",
            """
            y[i] = x[i]
            """,
            [lp.GlobalArg("x",
                          dtype=float,
                          shape=lp.auto,
                          dim_tags=[FixedStrideArrayDimTag(lp.auto)]),
             ...])

    knl = lp.rename_argument(knl, "x", "x_new")

    device_code = lp.generate_code_v2(knl).device_code()

    # The renamed argument appears in the signature; the old name is gone.
    assert "double const *__restrict__ x_new," in device_code
    assert "double const *__restrict__ x," not in device_code

    evt, (out, ) = knl(queue, x_new=np.random.rand(10))
def make_G_S_knl(instructions):
    # Build a gather/scatter kernel over a 3D (Nx, Ny, Nz) index domain
    # around the caller-supplied instructions.
    # NOTE(review): relies on `pencil_shape_str` and `params_to_fix` from
    # the enclosing scope (not visible in this chunk) -- confirm they are
    # defined wherever this helper lives.
    knl = lp.make_kernel(
        "[Nx, Ny, Nz] -> { [i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz }",
        instructions,
        [
            lp.GlobalArg(
                "subarr", shape=pencil_shape_str, offset=lp.auto),
            lp.GlobalArg("arr", shape="(Nx, Ny, Nz)", offset=lp.auto),
            ...,
        ],
        default_offset=lp.auto,
        lang_version=(2018, 2),
    )
    knl = lp.fix_parameters(knl, **params_to_fix)
    # Fastest-varying axis k gets the 32-wide local axis; j and i are
    # split onto group axes with unrolled inner portions.
    knl = lp.split_iname(knl, "k", 32, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 2, outer_tag="g.1", inner_tag="unr")
    knl = lp.split_iname(knl, "i", 1, outer_tag="g.2", inner_tag="unr")
    return knl
def test_ilp_loop_bound(ctx_factory):
    # The salient bit of this test is that a joint bound on (outer, inner)
    # from a split occurs in a setting where the inner loop has been ilp'ed.
    # In 'normal' parallel loops, the inner index is available for conditionals
    # throughout. In ILP'd loops, not so much.
    ctx = ctx_factory()
    knl = lp.make_kernel(
            "{ [i,j,k]: 0<=i,j,k<n }",
            """
            out[i,k] = sum(j, a[i,j]*b[j,k])
            """,
            [
                lp.GlobalArg("a,b", np.float32, shape=lp.auto),
                "...",
                ],
            assumptions="n>=1")

    ref_knl = knl

    knl = lp.set_loop_priority(knl, "j,i,k")
    # inner_tag="ilp": k_inner is unrolled as ILP lanes, so bounds checks
    # must use the combined (k_outer, k_inner) bound.
    knl = lp.split_iname(knl, "k", 4, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(n=200))