def test_global_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed sum reduction with the untransformed kernel.

    The kernel computes ``z[0] = sum(i, i/13)`` over ``0 <= i < n``.  ``i`` is
    split twice, the reduction is split along the resulting inames, and the
    argument of the ``i_outer`` reduction is precomputed into a temporary in
    the global address space before the reduction is realized.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    :arg size: value supplied for the kernel parameter ``n``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, i/13)
        """,
    )

    # Keep the untransformed kernel as the reference for the comparison.
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    # NOTE(review): spelled with temporary_address_space / lp.AddressSpace to
    # match the other precompute/add_prefetch calls in this file; presumably
    # the successor of temporary_scope / lp.temp_var_scope -- confirm against
    # the loopy version in use.
    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto",
    )
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True)
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed reduction over Philox random values
    with the untransformed kernel.

    For every ``i`` the kernel builds a key and a counter, calls
    ``philox4x32_f32`` and sums the four components of the result over
    ``0 <= i < n``.  The test is skipped when the PyOpenCL version is older
    than 2016.2.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    :arg size: value supplied for the kernel parameter ``n``.
    """
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    # Keep the untransformed kernel as the reference for the comparison.
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    # NOTE(review): spelled with temporary_address_space / lp.AddressSpace to
    # match the other precompute/add_prefetch calls in this file; presumably
    # the successor of temporary_scope / lp.temp_var_scope -- confirm against
    # the loopy version in use.
    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
def test_prefetch_with_within(ctx_factory):
    """Check that ``add_prefetch`` restricted with ``within`` only covers
    the ``set_y`` instruction.

    ``f`` is written over ``j`` (256 entries) and read over ``k`` (128
    entries) in ``set_y``.  After prefetching ``f`` within ``id:set_y`` the
    temporary ``foo`` must have shape ``(128,)``, and the transformed
    program must agree with the untransformed one.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    """
    domains = [
        "{[j]: 0<=j<256}",
        "{[i, k]: 0<=i<100 and 0<=k<128}",
    ]
    instructions = """
        f[j] = 3.14 * j {id=set_f}
        f[j] = 2 * f[j] {id=update_f, nosync=set_f}
        ... gbarrier {id=insn_gbar}
        y[i, k] = f[k] * x[i, k] {id=set_y}
        """
    kernel_data = [lp.GlobalArg("x", shape=lp.auto, dtype=float), ...]

    program = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        seq_dependencies=True,
        name="myknl",
    )
    reference = program

    # Both j and i are split the same way: chunks of 32, inner part on l.0,
    # outer part on g.0.
    for iname in ("j", "i"):
        program = lp.split_iname(
            program, iname, 32, inner_tag="l.0", outer_tag="g.0")

    program = lp.add_prefetch(
        program,
        "f",
        prefetch_insn_id="f_prftch",
        within="id:set_y",
        sweep_inames="k",
        dim_arg_names="iprftch",
        default_tag=None,
        temporary_address_space=lp.AddressSpace.LOCAL,
        temporary_name="foo",
        fetch_outer_inames=frozenset({"i_outer"}),
    )
    program = lp.add_dependency(program, "id:f_prftch", "id:insn_gbar")
    program = lp.split_iname(program, "iprftch", 32, inner_tag="l.0")

    # test that 'f' is only prefetched in set_y
    prefetch_buffer = program["myknl"].temporary_variables["foo"]
    assert prefetch_buffer.shape == (128, )

    lp.auto_test_vs_ref(reference, ctx_factory(), program)
def add_instruction_deps(knl):
    """Chain together instructions that share an assignee name.

    Walks ``knl.instructions`` in order.  Whenever an instruction's
    ``assignee_name`` was already seen on an earlier instruction, a
    dependency of the current instruction on the most recent such
    instruction is added via ``lp.add_dependency``.

    :arg knl: object exposing ``instructions``, each with ``assignee_name``
        and ``id`` attributes.
    :returns: *knl* itself if no assignee name repeats, otherwise the result
        of the successive ``lp.add_dependency`` calls.
    """
    # Maps each assignee name to the id of the latest instruction seen for it.
    last_writer = {}

    # The loop iterates over the instructions of the kernel passed in, even
    # though ``knl`` is rebound inside the loop body.
    for insn in knl.instructions:
        target = insn.assignee_name
        if target in last_writer:
            knl = lp.add_dependency(
                knl, f"id:{insn.id}", f"id:{last_writer[target]}")
        last_writer[target] = insn.id

    return knl
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed sum over ``a[i]`` with the untransformed
    kernel.

    The kernel computes ``z[0] = sum(i, a[i])`` over ``0 <= i < n`` with
    ``a`` declared as ``float32``.  ``i`` is split twice, the reduction is
    split outward along ``i_outer`` and inward along ``i_inner_outer``, and
    the argument of the ``i_outer`` reduction is precomputed into a
    temporary in the global address space.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    :arg size: value supplied for the kernel parameter ``n``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    ctx = ctx_factory()

    domain = "{[i]: 0 <= i < n }"
    instructions = """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, a[i])
        """
    kernel = lp.add_and_infer_dtypes(
        lp.make_kernel(domain, instructions),
        {"a": np.float32},
    )
    reference = kernel

    group_size = 128
    kernel = lp.split_iname(kernel, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel,
        "red_i_outer_arg",
        "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto",
    )
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference,
        ctx,
        kernel,
        parameters={"n": size},
        print_ref_code=True,
    )
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed reduction over Philox random values
    with the untransformed kernel.

    For every ``i`` the kernel builds a key and a counter, calls
    ``philox4x32_f32`` and sums the four components of the result over
    ``0 <= i < n``.  The argument of the ``i_outer`` reduction is precomputed
    into a temporary in the global address space.  The test is skipped when
    the PyOpenCL version is older than 2016.2.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    :arg size: value supplied for the kernel parameter ``n``.
    """
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    # Keep the untransformed kernel as the reference for the comparison.
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    # NOTE(review): spelled with temporary_address_space / lp.AddressSpace to
    # match the other precompute/add_prefetch calls in this file; presumably
    # the successor of temporary_scope / lp.temp_var_scope -- confirm against
    # the loopy version in use.
    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed sum over ``a[i]`` with the untransformed
    kernel.

    The kernel computes ``z[0] = sum(i, a[i])`` over ``0 <= i < n`` with
    ``a`` declared as ``float32``.  ``i`` is split twice, the reduction is
    split outward along ``i_outer`` and inward along ``i_inner_outer``, and
    the argument of the ``i_outer`` reduction is precomputed into a
    temporary in the global address space.

    :arg ctx_factory: called with no arguments; its result is passed to
        ``lp.auto_test_vs_ref`` as the context.
    :arg size: value supplied for the kernel parameter ``n``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, a[i])
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    # Keep the untransformed kernel as the reference for the comparison.
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    # NOTE(review): spelled with temporary_address_space / lp.AddressSpace to
    # match the other precompute/add_prefetch calls in this file; presumably
    # the successor of temporary_scope / lp.temp_var_scope -- confirm against
    # the loopy version in use.
    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True)