예제 #1
0
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, i/13)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)
예제 #2
0
def test_global_mc_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl,
                        "red_i_outer_arg",
                        "i_outer",
                        temporary_scope=lp.temp_var_scope.GLOBAL)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(knl, "writes:acc_i_outer",
                            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
예제 #3
0
def test_prefetch_with_within(ctx_factory):
    t_unit = lp.make_kernel(
        ["{[j]: 0<=j<256}", "{[i, k]: 0<=i<100 and 0<=k<128}"],
        """
            f[j] = 3.14 * j {id=set_f}
            f[j] = 2 * f[j] {id=update_f, nosync=set_f}
            ... gbarrier {id=insn_gbar}
            y[i, k] = f[k] * x[i, k] {id=set_y}
            """, [lp.GlobalArg("x", shape=lp.auto, dtype=float), ...],
        seq_dependencies=True,
        name="myknl")

    ref_t_unit = t_unit

    t_unit = lp.split_iname(t_unit, "j", 32, inner_tag="l.0", outer_tag="g.0")
    t_unit = lp.split_iname(t_unit, "i", 32, inner_tag="l.0", outer_tag="g.0")

    t_unit = lp.add_prefetch(t_unit,
                             "f",
                             prefetch_insn_id="f_prftch",
                             within="id:set_y",
                             sweep_inames="k",
                             dim_arg_names="iprftch",
                             default_tag=None,
                             temporary_address_space=lp.AddressSpace.LOCAL,
                             temporary_name="foo",
                             fetch_outer_inames=frozenset({"i_outer"}))
    t_unit = lp.add_dependency(t_unit, "id:f_prftch", "id:insn_gbar")
    t_unit = lp.split_iname(t_unit, "iprftch", 32, inner_tag="l.0")

    # test that 'f' is only prefetched in set_y
    assert t_unit["myknl"].temporary_variables["foo"].shape == (128, )

    lp.auto_test_vs_ref(ref_t_unit, ctx_factory(), t_unit)
예제 #4
0
def add_instruction_deps(knl):
    assignees = {}

    for i in knl.instructions:
        assignee = i.assignee_name
        if assignee not in assignees.keys():
            assignees[assignee] = i.id
        else:
            knl = lp.add_dependency(knl, f"id:{i.id}", f"id:{assignees[assignee]}")
            assignees[assignee] = i.id
    return knl
예제 #5
0
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(knl,
                        "red_i_outer_arg",
                        "i_outer",
                        temporary_address_space=lp.AddressSpace.GLOBAL,
                        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the  correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(knl, "writes:acc_i_outer",
                            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        parameters={"n": size},
                        print_ref_code=True)
예제 #6
0
def test_global_mc_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size})
예제 #7
0
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the  correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)