def copy_targets_kernel(self):
    """Build the loopy kernel named ``copy_targets``.

    The kernel executes ``targets[dim, i] = points[dim, i]`` over
    ``0 <= dim < ndims`` and ``0 <= i < npoints``; ``ndims`` is fixed to
    ``self.ambient_dim`` through *defines*.

    :returns: the kernel after ``i`` has been split by 128 (outer ``g.0``,
        inner ``l.0``), the axes of ``points`` and ``targets`` have been
        tagged, and ``dim`` has been tagged ``ilp``.
    """
    domain = """{[dim,i]: 0<=dim<ndims and 0<=i<npoints}"""
    instructions = """ targets[dim, i] = points[dim, i] """

    kernel = lp.make_kernel(
        domain,
        instructions,
        default_offset=lp.auto,
        name="copy_targets",
        defines={"ndims": self.ambient_dim})

    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Axis tags for the input array first, then for the output array.
    for array_name, axis_tags in (
            ("points", "sep, C"),
            ("targets", "stride:auto, stride:1"),
            ):
        kernel = lp.tag_data_axes(kernel, array_name, axis_tags)

    return lp.tag_inames(kernel, {"dim": "ilp"})
def test_offsets_and_slicing(ctx_factory):
    """Run ``b[i,j] = 2*a[i,j]`` on slices of larger device arrays.

    Two random ``(20, 20)`` float64 device arrays are created, a window of
    each is passed to the compiled kernel, and the full output array is
    compared against the same update applied to host copies.

    :arg ctx_factory: callable returning the context used for the queue and
        for compiling the kernel.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    size = 20

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m }",
        """ b[i,j] = 2*a[i,j] """,
        assumptions="n>=1 and m>=1",
        default_offset=lp.auto)
    kernel = lp.tag_data_axes(kernel, "a,b", "stride:auto,stride:1")
    compiled = lp.CompiledKernel(ctx, kernel)

    full_shape = (size, size)

    a_dev_full = cl.clrandom.rand(queue, full_shape, np.float64)
    a_host_full = a_dev_full.get()
    b_dev_full = cl.clrandom.rand(queue, full_shape, np.float64)
    b_host_full = b_dev_full.get()

    # Input window, and an output window of the same extent at a different
    # position inside its parent array.
    src_window = (slice(3, 10), slice(5, 10))
    dst_window = (slice(3 + 3, 10 + 3), slice(5 + 4, 10 + 4))

    a_view = a_dev_full[src_window]
    b_view = b_dev_full[dst_window]

    # Expected result, computed on the host copies.
    b_host_full[dst_window] = 2 * a_host_full[src_window]

    print(compiled.get_highlighted_code({"a": a_view.dtype}))
    compiled(queue, a=a_view, b=b_view)

    import numpy.linalg as la
    mismatch = la.norm(b_dev_full.get() - b_host_full)
    assert mismatch < 1e-13
def test_vectorize(ctx_factory):
    """Check a ``vec``-tagged kernel against an ``unr``-tagged reference.

    Both kernels come from the same base kernel, in which dimension 0 of
    ``a`` and ``b`` is split by 4 and the arrays carry the axis tags
    ``c,vec``.  They differ only in the tag given to ``i_inner``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    base = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    base = lp.add_and_infer_dtypes(base, {"b": np.float32})
    base = lp.set_array_dim_names(base, "a,b", "i")
    base = lp.split_array_dim(
        base, [("a", 0), ("b", 0)], 4,
        split_kwargs={"slabs": (0, 1)})
    base = lp.tag_data_axes(base, "a,b", "c,vec")

    reference = lp.tag_inames(base, {"i_inner": "unr"})
    vectorized = lp.tag_inames(base, {"i_inner": "vec"})

    vectorized = lp.preprocess_kernel(vectorized)
    vectorized = lp.get_one_scheduled_kernel(vectorized)

    # The generated code is not inspected; the call has to succeed and
    # return a pair.
    generated_code, codegen_info = lp.generate_code(vectorized)

    lp.auto_test_vs_ref(reference, ctx, vectorized, parameters={"n": 30})
def test_vectorize(ctx_factory):
    """Compare vectorized code generation with an unrolled reference.

    A two-instruction kernel (``temp = 2*b[i]``, ``a[i] = temp``) has
    dimension 0 of ``a`` and ``b`` split by 4.  The reference tags
    ``i_inner`` as ``unr``; the kernel under test tags it as ``vec``, is
    preprocessed, scheduled and run through ``lp.generate_code`` before the
    comparison.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    split_targets = [("a", 0), ("b", 0)]

    shared = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    shared = lp.add_and_infer_dtypes(shared, {"b": np.float32})
    shared = lp.set_array_dim_names(shared, "a,b", "i")
    shared = lp.split_array_dim(
        shared, split_targets, 4, split_kwargs={"slabs": (0, 1)})
    shared = lp.tag_data_axes(shared, "a,b", "c,vec")

    unrolled_ref = lp.tag_inames(shared, {"i_inner": "unr"})

    vec_knl = lp.get_one_scheduled_kernel(
        lp.preprocess_kernel(
            lp.tag_inames(shared, {"i_inner": "vec"})))

    # Only the call and the shape of its result matter here.
    cl_code, cl_info = lp.generate_code(vec_knl)

    lp.auto_test_vs_ref(
        unrolled_ref, ctx, vec_knl,
        parameters={"n": 30})
def test_vector_types(ctx_factory, vec_len):
    """Test ``out[i,j] = 2*a[i,j]`` with ``out`` tagged ``c,vec``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg vec_len: value the kernel parameter ``vec_len`` is fixed to; it
        bounds the ``j`` iname.
    """
    ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=lp.auto),
        lp.GlobalArg("out", np.float32, shape=lp.auto),
        "...",
    ]

    reference = lp.make_kernel(
        "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
        "out[i,j] = 2*a[i,j]",
        kernel_args)
    reference = lp.fix_parameters(reference, vec_len=vec_len)

    # The kernel under test differs from the reference only in these tags.
    tagged = lp.tag_data_axes(reference, "out", "c,vec")
    tagged = lp.tag_inames(tagged, {"j": "unr"})
    tagged = lp.split_iname(
        tagged, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(reference, ctx, tagged, parameters={"n": 20000})
def set_q_storage_format(kernel, name):
    """Apply the storage layout used for Q-like arrays to array *name*.

    Three steps are applied in order: the array's dimensions are named
    ``i,j,k,field,e``; dimension 3 is split by 4 (order ``"F"``, without
    splitting inames); and the resulting six axes are tagged
    ``N0,N1,N2,vec,N4,N3``.

    :arg kernel: the loopy kernel to transform.
    :arg name: name of the array inside *kernel*.
    :returns: the transformed kernel.
    """
    with_dim_names = lp.set_array_dim_names(kernel, name, "i,j,k,field,e")
    with_split_field = lp.split_array_dim(
        with_dim_names, (name, 3, "F"), 4, auto_split_inames=False)
    return lp.tag_data_axes(with_split_field, name, "N0,N1,N2,vec,N4,N3")
def test_tag_data_axes(ctx_factory):
    """Exercise ``lp.tag_data_axes`` with rejected and accepted tag strings.

    ``"N1,N0,N5"`` and ``"N1,N0,c"`` must each raise ``lp.LoopyError``;
    ``"N1,N0,N2"`` is then applied and the result compared with the untagged
    kernel.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    reference = lp.make_kernel("{ [i,j,k]: 0<=i,j,k<n }", "out[i,j,k] = 15")

    for rejected_tags in ("N1,N0,N5", "N1,N0,c"):
        with pytest.raises(lp.LoopyError):
            lp.tag_data_axes(reference, "out", rejected_tags)

    tagged = lp.tag_data_axes(reference, "out", "N1,N0,N2")
    tagged = lp.tag_inames(tagged, {"j": "g.0", "i": "g.1"})

    lp.auto_test_vs_ref(reference, ctx, tagged, parameters={"n": 20})
def test_tag_data_axes(ctx_factory):
    """Check which axis-tag strings ``lp.tag_data_axes`` accepts for ``out``.

    The first two tag strings are expected to raise ``lp.LoopyError``.  The
    third is applied, ``j`` and ``i`` are tagged ``g.0`` and ``g.1``, and the
    kernel is compared with the original one.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    domain = "{ [i,j,k]: 0<=i,j,k<n }"
    untagged = lp.make_kernel(domain, "out[i,j,k] = 15")

    bad_tag_strings = ["N1,N0,N5", "N1,N0,c"]
    for bad_tags in bad_tag_strings:
        with pytest.raises(lp.LoopyError):
            lp.tag_data_axes(untagged, "out", bad_tags)

    permuted = lp.tag_inames(
        lp.tag_data_axes(untagged, "out", "N1,N0,N2"),
        {"j": "g.0", "i": "g.1"})

    lp.auto_test_vs_ref(untagged, ctx, permuted, parameters={"n": 20})
def test_indexof_vec(ctx_factory):
    """Run a kernel that stores ``indexof_vec(out[i,j,k])`` into ``out``.

    The test is skipped when the first device's platform name starts with
    ``"Portable"``.

    :arg ctx_factory: callable returning the context used for the queue.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    platform_name = ctx.devices[0].platform.name
    if platform_name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    kernel = lp.make_kernel(
        ''' { [i,j,k]: 0<=i,j,k<4 } ''',
        ''' out[i,j,k] = indexof_vec(out[i,j,k])''')
    kernel = lp.tag_inames(kernel, {"i": "vec"})
    kernel = lp.tag_data_axes(kernel, "out", "vec,c,c")
    kernel = lp.set_options(kernel, write_cl=True)

    # The call must return an event and exactly one output.
    event, (result,) = kernel(queue)
def test_indexof_vec(ctx_factory):
    """Execute a 4x4x4 kernel using ``indexof_vec`` with ``i`` tagged ``vec``.

    Skipped on platforms whose name starts with ``"Portable"``.

    :arg ctx_factory: callable returning the context used for the queue.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    first_device = ctx.devices[0]
    if first_device.platform.name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    domain = ''' { [i,j,k]: 0<=i,j,k<4 } '''
    instruction = ''' out[i,j,k] = indexof_vec(out[i,j,k])'''

    vec_knl = lp.make_kernel(domain, instruction)
    vec_knl = lp.tag_inames(vec_knl, {"i": "vec"})
    vec_knl = lp.tag_data_axes(vec_knl, "out", "vec,c,c")
    vec_knl = lp.set_options(vec_knl, write_cl=True)

    # Unpacking requires an event plus a single output array.
    run_event, (out_array,) = vec_knl(queue)
def test_indexof_vec(ctx_factory):
    """Run the ``indexof_vec`` kernel with ``out`` declared as a non-input.

    ``out`` is declared as a ``lp.GlobalArg`` with ``shape=lp.auto`` and
    ``is_input=False``.  The test is skipped when the first device's
    platform name starts with ``"Intel"``.

    :arg ctx_factory: callable returning the context used for the queue.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    # Accurate as of 2019-11-04
    on_intel = ctx.devices[0].platform.name.startswith("Intel")
    if on_intel:
        pytest.skip("target ICD miscompiles vector code")

    out_arg = lp.GlobalArg("out", shape=lp.auto, is_input=False)

    kernel = lp.make_kernel(
        """ { [i,j,k]: 0<=i,j,k<4 } """,
        """ out[i,j,k] = indexof_vec(out[i,j,k])""",
        [out_arg])
    kernel = lp.tag_inames(kernel, {"i": "vec"})
    kernel = lp.tag_data_axes(kernel, "out", "vec,c,c")
    kernel = lp.set_options(kernel, write_cl=True)

    # The call must return an event and exactly one output.
    event, (result,) = kernel(queue)
def test_vector_types(ctx_factory, vec_len):
    """Compare a kernel with a ``c,vec``-tagged output to its plain version.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg vec_len: value substituted for the kernel parameter ``vec_len``,
        the upper bound of iname ``j``.
    """
    ctx = ctx_factory()

    in_arg = lp.GlobalArg("a", np.float32, shape=lp.auto)
    out_arg = lp.GlobalArg("out", np.float32, shape=lp.auto)

    plain = lp.fix_parameters(
        lp.make_kernel(
            "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
            "out[i,j] = 2*a[i,j]",
            [in_arg, out_arg, "..."]),
        vec_len=vec_len)

    # Build the kernel under test on top of the reference.
    vec_out = lp.tag_data_axes(plain, "out", "c,vec")
    vec_out = lp.tag_inames(vec_out, {"j": "unr"})
    vec_out = lp.split_iname(
        vec_out, "i", 128, inner_tag="l.0", outer_tag="g.0")

    lp.auto_test_vs_ref(plain, ctx, vec_out, parameters={"n": 20000})
def knl():
    """Build the loopy kernel named ``nodes``.

    The kernel computes
    ``result[d, k, i] = sum(j, resampling_mat[i, j] * nodes[d, k, j])``.

    :returns: the kernel after ``i`` has been split by 16 (inner ``l.0``),
        ``k`` has been tagged ``g.0`` and all three axes of ``result`` have
        been tagged ``stride:auto``.
    """
    domain = (
        "{[d,k,i,j]: 0<=d<dims and 0<=k<nelements and "
        "0<=i<ndiscr_nodes and 0<=j<nmesh_nodes}")

    kernel = lp.make_kernel(
        domain,
        """
        result[d, k, i] = \
            sum(j, resampling_mat[i, j] * nodes[d, k, j])
        """,
        name="nodes",
        default_offset=lp.auto)

    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
    kernel = lp.tag_inames(kernel, {"k": "g.0"})

    return lp.tag_data_axes(
        kernel, "result", "stride:auto,stride:auto,stride:auto")
def pick_expansion_centers(self):
    """Build the loopy kernel named ``center_pick``.

    For every ``i`` the kernel copies the entries of ``all_centers`` and
    ``all_radii`` at index ``kept_center_indices[i]`` into ``centers`` and
    ``radii``.  ``ndims`` is fixed to ``self.ambient_dim`` through *defines*.

    :returns: the kernel after the axes of ``centers`` and ``all_centers``
        have been tagged ``sep, C, C``, ``i`` has been split by 16 (inner
        ``l.0``), ``k`` has been tagged ``g.0`` and ``dim`` tagged ``ilp``.
    """
    domain = (
        "{[dim,k,i]: 0<=dim<ndims and 0<=k<nelements and "
        "0<=i<nout_nodes}")

    kernel_args = [
        lp.GlobalArg(
            "all_centers", None, shape="ndims,nelements,nunit_nodes"),
        lp.GlobalArg("all_radii", None, shape="nelements,nunit_nodes"),
        lp.ValueArg("nunit_nodes", np.int32),
        "...",
    ]

    kernel = lp.make_kernel(
        domain,
        """
        centers[dim, k, i] = all_centers[dim, k, kept_center_indices[i]]
        radii[k, i] = all_radii[k, kept_center_indices[i]]
        """,
        kernel_args,
        default_offset=lp.auto,
        name="center_pick",
        defines={"ndims": self.ambient_dim})

    kernel = lp.tag_data_axes(kernel, "centers,all_centers", "sep, C, C")
    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")

    return lp.tag_inames(kernel, {"k": "g.0", "dim": "ilp"})
def test_offsets_and_slicing(ctx_factory):
    """Check that the kernel honours offsets and strides of sliced arrays.

    The kernel ``b[i,j] = 2*a[i,j]`` is compiled with
    ``default_offset=lp.auto`` and ``stride:auto,stride:1`` axis tags, then
    called on windows of two ``(20, 20)`` float64 device arrays.  The whole
    output array must match the host-side computation to within ``1e-13``.

    :arg ctx_factory: callable returning the context used for the queue and
        for compiling the kernel.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    extent = 20

    doubling = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m }",
        """ b[i,j] = 2*a[i,j] """,
        assumptions="n>=1 and m>=1",
        default_offset=lp.auto)
    doubling = lp.tag_data_axes(doubling, "a,b", "stride:auto,stride:1")
    runner = lp.CompiledKernel(ctx, doubling)

    dev_in = cl.clrandom.rand(queue, (extent, extent), np.float64)
    host_in = dev_in.get()
    dev_out = cl.clrandom.rand(queue, (extent, extent), np.float64)
    host_out = dev_out.get()

    in_rows, in_cols = slice(3, 10), slice(5, 10)
    out_rows, out_cols = slice(3+3, 10+3), slice(5+4, 10+4)

    in_window = (in_rows, in_cols)
    out_window = (out_rows, out_cols)

    in_slice = dev_in[in_window]
    out_slice = dev_out[out_window]

    # Reference result on the host copies.
    host_out[out_window] = 2*host_in[in_window]

    print(runner.get_highlighted_code({"a": in_slice.dtype}))
    runner(queue, a=in_slice, b=out_slice)

    import numpy.linalg as la
    deviation = la.norm(dev_out.get() - host_out)
    assert deviation < 1e-13
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Transform the fused R/S strong-volume kernel and test one stage of it.

    The kernels whose names contain ``KernelR`` or ``KernelS`` are parsed
    from ``strongVolumeKernels.f90``, fused, and put through a fixed sequence
    of loopy transformations.  *opt_level* selects the point in that sequence
    whose kernel is finally compared against the untransformed fused kernel
    with ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, ``k`` is split by 2 with an
        ``ilp``-tagged inner iname, and the extra inames are passed on to the
        precompute, prefetch and buffering steps.
    :arg Nq: value substituted for the kernel parameter ``Nq``.
    :arg opt_level: stage selector; see the ``tap_hsv`` assignments.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed listing -- confirm against the original file.
    ctx = ctx_factory()

    # Bare file name: resolved relative to the current working directory.
    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # Replace every "datafloat" in the Fortran source before parsing.
    source = source.replace("datafloat", "real*4")

    # The unpacking requires exactly two matching kernels.
    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    # Tag every instruction with its kernel of origin so the match
    # expressions further down can tell them apart after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")
    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Reference for the final comparison: fused kernel with storage formats
    # set, before any of the transformations below.
    ref_hsv = hsv

    # tap_hsv captures the kernel as it stands when the stage matching
    # opt_level is reached.  It is bound only if one of the opt_level checks
    # below is true.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the R and S kernels.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Handle the R and S flux instructions pairwise; each gets its own
    # precomputed temporary "flux_store_<idx>".
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel may read the flux.
            reader, = lp.find_instructions(hsv,
                  "tag:{knl_tag} and reads:{flux_var}"
                  .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            # The first two axes are ordered differently for "_s" fluxes
            # than for the others.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Rename iname "n" within the reader to a name derived from the
            # flux variable: "_r"/"_s" removed, a trailing "_0" dropped.
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining local_prep rules, skipping names that start
    # with "Jinv" or contain "_s".
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        # The tapped kernel gets "unr" tags that hsv itself does not get here.
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    # From here on the *_field_inner inames carry "vec" instead of "unr".
    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec",
        Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Continue with the tapped stage only; transformations applied after the
    # tap are dropped.
    hsv = tap_hsv

    # Always-true guard around the statistics output.
    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def set_D_storage_format(kernel):
    """Tag both axes of the array ``D`` in *kernel* with ``f``.

    :arg kernel: the loopy kernel to transform.
    :returns: the kernel returned by ``lp.tag_data_axes``.
    """
    array_name = "D"
    axis_tags = "f,f"
    tagged_kernel = lp.tag_data_axes(kernel, array_name, axis_tags)
    return tagged_kernel
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Transform the fused R/S strong-volume kernel and test one stage of it.

    The kernels whose names contain ``KernelR`` or ``KernelS`` are parsed
    from ``strongVolumeKernels.f90``, fused, and put through a fixed sequence
    of loopy transformations.  *opt_level* selects the point in that sequence
    whose kernel is finally compared against the untransformed fused kernel
    with ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, ``k`` is split by 2 with an
        ``ilp``-tagged inner iname, and the extra inames are passed on to the
        precompute, prefetch and buffering steps.
    :arg Nq: value substituted for the kernel parameter ``Nq``.
    :arg opt_level: stage selector; see the ``tap_hsv`` assignments.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-collapsed listing -- confirm against the original file.
    ctx = ctx_factory()

    # Bare file name: resolved relative to the current working directory.
    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # Replace every "datafloat" in the Fortran source before parsing.
    source = source.replace("datafloat", "real*4")

    # The unpacking requires exactly two matching kernels.
    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    # Tag every instruction with its kernel of origin so the match
    # expressions further down can tell them apart after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")
    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Reference for the final comparison: fused kernel with storage formats
    # set, before any of the transformations below.
    ref_hsv = hsv

    # tap_hsv captures the kernel as it stands when the stage matching
    # opt_level is reached.  It is bound only if one of the opt_level checks
    # below is true.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the R and S kernels.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Handle the R and S flux instructions pairwise; each gets its own
    # precomputed temporary "flux_store_<idx>".
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel may read the flux.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # The first two axes are ordered differently for "_s" fluxes
            # than for the others.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Rename iname "n" within the reader to a name derived from the
            # flux variable: "_r"/"_s" removed, a trailing "_0" dropped.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining local_prep rules, skipping names that start
    # with "Jinv" or contain "_s".
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        # The tapped kernel gets "unr" tags that hsv itself does not get here.
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # From here on the *_field_inner inames carry "vec" instead of "unr".
    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Continue with the tapped stage only; transformations applied after the
    # tap are dropped.
    hsv = tap_hsv

    # Always-true guard around the statistics output.
    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)