예제 #1
0
def test_alias_temporaries(ctx_factory):
    """Alias three precomputed temporaries and compare against the reference.

    The substitution rules ``times2``, ``times3`` and ``times4`` are each
    precomputed over ``i_inner`` after ``i`` has been split, the temporaries
    ``times2_0``, ``times3_0`` and ``times4_0`` are passed to
    ``lp.alias_temporaries``, and the outcome is checked with
    ``lp.auto_test_vs_ref`` for ``n=30``.
    """
    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]

        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)
    reference = lp.add_and_infer_dtypes(reference, {"a": np.float32})

    transformed = lp.split_iname(
        reference, "i", 16, outer_tag="g.0", inner_tag="l.0")

    rule_names = ("times2", "times3", "times4")
    for rule_name in rule_names:
        transformed = lp.precompute(transformed, rule_name, "i_inner")

    # The temporaries handed to alias_temporaries are the rule names
    # suffixed with "_0".
    transformed = lp.alias_temporaries(
        transformed, [rule_name + "_0" for rule_name in rule_names])

    lp.auto_test_vs_ref(reference, cl_ctx, transformed, parameters={"n": 30})
예제 #2
0
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two substitution rules onto the same named inames.

    Both ``D1_subst`` and ``D2_subst`` are precomputed with
    ``precompute_inames="ii,jj"``, so the second call names inames that the
    first call already used.  The transformed kernel is compared against the
    untransformed one via ``lp.auto_test_vs_ref`` with ``E=200``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    # Reference is taken before any substitution/precompute transformation.
    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    # prioritize_loops replaces the deprecated set_loop_priority.  No loop
    # priorities have been set on this kernel before, so the resulting
    # priority is the same.
    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
예제 #3
0
def test_alias_temporaries(ctx_factory):
    """Precompute three rules, alias their temporaries, and auto-test.

    After splitting ``i`` by 16, ``times2``/``times3``/``times4`` are
    precomputed over ``i_inner``; ``times2_0``, ``times3_0`` and ``times4_0``
    are then aliased and the kernel is compared to the untransformed
    reference for ``n=30``.
    """
    context = ctx_factory()

    original = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]

        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """),
        {"a": np.float32})

    optimized = lp.split_iname(
        original, "i", 16, outer_tag="g.0", inner_tag="l.0")

    aliased = []
    for factor in (2, 3, 4):
        rule = "times%d" % factor
        optimized = lp.precompute(optimized, rule, "i_inner")
        aliased.append(rule + "_0")

    optimized = lp.alias_temporaries(optimized, aliased)

    lp.auto_test_vs_ref(original, context, optimized, parameters={"n": 30})
예제 #4
0
def test_precompute_with_preexisting_inames_fail():
    """Expect ``lp.LoopyError`` from the second precompute onto ``ii,jj``.

    In the domain, ``j`` is bounded by ``n`` while ``k`` is bounded by
    ``2*n``.  ``D1_subst`` is precomputed first with
    ``precompute_inames="ii,jj"``; reusing the same inames for ``D2_subst``
    (swept over ``i,k``) must raise.
    """
    kernel = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    kernel = lp.add_and_infer_dtypes(
        kernel, dict.fromkeys(("u", "D1", "D2"), np.float32))
    kernel = lp.fix_parameters(kernel, n=13)

    for matrix in ("D1", "D2"):
        kernel = lp.extract_subst(
            kernel, matrix + "_subst", matrix + "[ii,jj]",
            parameters="ii,jj")

    # Options common to both precompute calls.
    shared = {"default_tag": "for", "precompute_inames": "ii,jj"}

    kernel = lp.precompute(kernel, "D1_subst", "i,j", **shared)
    with pytest.raises(lp.LoopyError):
        lp.precompute(kernel, "D2_subst", "i,k", **shared)
예제 #5
0
def test_tim2d(ctx_factory):
    """Auto-test a transformed "semlap2D" kernel against its sequential form.

    The kernel is built for ``n = 8`` with a run-time symbolic ``K``.  The
    transformed variant prefetches ``D``, ``u`` and ``G``, precomputes the
    ``ur``/``us`` substitution rules, splits ``e`` onto ``g.0``, maps
    ``i``/``j`` to ``l.0``/``l.1`` and unrolls ``o`` and ``m``.  Generated
    schedules are checked with ``K=1000`` and compared to the untransformed
    kernel.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

                "lap[e,i,j]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
            ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # Alternative declarations for "D", left disabled:
                #   lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order)
                #   lp.ImageArg("D", dtype, shape=(n, n))
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap2D", assumptions="K>=1")

    # Keep the untransformed kernel as the reference for auto_test_vs_ref.
    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # [2, 3] are axis/argument indices on G.
    # Disabled alternative: same call with default_tag=None.
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
예제 #6
0
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute ``D1_subst`` and ``D2_subst`` onto the same inames.

    Both precompute calls request ``precompute_inames="ii,jj"``.  Loop
    priorities are then set to ``ii,jj,e,j,k`` and the result is compared to
    the untransformed kernel for ``E=200``.
    """
    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)
    reference = lp.add_and_infer_dtypes(
        reference, dict.fromkeys(("u", "D1", "D2"), np.float32))
    reference = lp.fix_parameters(reference, n=13)

    transformed = reference
    for matrix in ("D1", "D2"):
        transformed = lp.extract_subst(
            transformed, matrix + "_subst", matrix + "[ii,jj]",
            parameters="ii,jj")

    # (rule, sweep inames) pairs; both share the precompute inames ii,jj.
    for rule, sweep in (("D1_subst", "i,j"), ("D2_subst", "i,k")):
        transformed = lp.precompute(
            transformed, rule, sweep,
            default_tag="for", precompute_inames="ii,jj")

    transformed = lp.prioritize_loops(transformed, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
        reference, cl_ctx, transformed, parameters={"E": 200})
예제 #7
0
    def variant_orig(knl):
        """Apply the "orig" set of transformations to *knl* and return it.

        Tags ``i``/``j``/``e``, prefetches ``D``, ``u`` and both ``G``
        variants, precomputes ``ur``, ``us``, ``Gux`` and ``Guy``, unrolls
        ``o`` and ``m``, raises the priority of ``D_fetch`` and prints the
        resulting kernel.
        """
        auto_local = "l.auto"

        knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

        knl = lp.add_prefetch(
            knl, "D[:,:]", fetch_outer_inames="e", default_tag=auto_local)
        knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag=auto_local)

        # TODO the ur/us precomputes add `a` and `b` to domains, which leads
        # to unused inames
        precomputes = (
            ("ur(m,j)", ["m", "j"]),
            ("us(i,m)", ["i", "m"]),
            ("Gux(m,j)", ["m", "j"]),
            ("Guy(i,m)", ["i", "m"]),
        )
        for rule, sweep in precomputes:
            knl = lp.precompute(knl, rule, sweep, default_tag=auto_local)

        for footprint in ("G$x[:,e,:,:]", "G$y[:,e,:,:]"):
            knl = lp.add_prefetch(knl, footprint, default_tag=auto_local)

        for iname in ("o", "m"):
            knl = lp.tag_inames(knl, {iname: "unr"})

        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl
예제 #8
0
파일: test_tim.py 프로젝트: inducer/loopy
def test_tim2d(ctx_factory):
    """Auto-test a transformed "semlap2D" kernel against its sequential form.

    The kernel is built for ``n = 8`` with a run-time symbolic ``K``.  The
    transformed variant prefetches ``D``, ``u`` and ``G``, precomputes the
    ``ur``/``us`` substitution rules, splits ``e`` onto ``g.0``, maps
    ``i``/``j`` to ``l.0``/``l.1`` and unrolls ``o`` and ``m``.  Generated
    schedules are checked with ``K=1000`` and compared to the untransformed
    kernel.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

                "lap[e,i,j]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
            ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # Alternative declarations for "D", left disabled:
                #   lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order)
                #   lp.ImageArg("D", dtype, shape=(n, n))
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap2D", assumptions="K>=1")

    # Keep the untransformed kernel as the reference for auto_test_vs_ref.
    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # [2, 3] are axis/argument indices on G.
    # Disabled alternative: same call with default_tag=None.
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
예제 #9
0
def test_matmul(ctx_factory, buffer_inames):
    """Transform a Fortran ``dgemm`` kernel and compare it to the original.

    The parsed kernel is tiled via ``split_iname``, the accesses to ``a`` and
    ``b`` are extracted and precomputed, and ``c`` is buffered over
    *buffer_inames*.  The test is skipped when *buffer_inames* is truthy and
    the first device's platform is "Portable Computing Language".
    """
    cl_ctx = ctx_factory()

    if buffer_inames:
        if cl_ctx.devices[0].platform.name == "Portable Computing Language":
            pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    (reference,) = lp.parse_fortran(fortran_src)

    assert len(reference.domains) == 1

    tiled = lp.split_iname(
        reference, "i", 16, outer_tag="g.0", inner_tag="l.1")
    tiled = lp.split_iname(tiled, "j", 8, outer_tag="g.1", inner_tag="l.0")
    tiled = lp.split_iname(tiled, "k", 32)

    for assumption in ("n mod 32 = 0", "m mod 32 = 0", "ell mod 16 = 0"):
        tiled = lp.assume(tiled, assumption)

    for array in ("a", "b"):
        tiled = lp.extract_subst(
            tiled, array + "_acc", array + "[i1,i2]", parameters="i1, i2")

    outer_inames = "i_outer, j_outer, k_outer"
    for rule, sweep in (
            ("a_acc", "k_inner,i_inner"),
            ("b_acc", "j_inner,k_inner")):
        tiled = lp.precompute(
            tiled, rule, sweep,
            precompute_outer_inames=outer_inames,
            default_tag="l.auto")

    tiled = lp.buffer_array(
        tiled, "c",
        buffer_inames=buffer_inames,
        init_expression="0",
        store_expression="base+buffer")

    lp.auto_test_vs_ref(
        reference, cl_ctx, tiled,
        parameters={"n": 128, "m": 128, "ell": 128})
예제 #10
0
def test_precompute_some_exist(ctx_factory):
    """Precompute ``a_acc`` and ``b_acc`` with a shared precompute iname.

    The first precompute names its inames ``ktemp,itemp`` and the second
    ``itemp,k2temp``, so ``itemp`` already exists when the second call is
    made.  The transformed kernel is then run through
    ``lp.auto_test_vs_ref``.
    """
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    prog = lp.parse_fortran(fortran_src)

    assert len(prog["dgemm"].domains) == 1

    prog = lp.split_iname(prog, "i", 8, outer_tag="g.0", inner_tag="l.1")
    prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0")
    prog = lp.split_iname(prog, "k", 8)

    for size_param in ("n", "m", "ell"):
        prog = lp.assume(prog, size_param + " mod 8 = 0")

    for array in ("a", "b"):
        prog = lp.extract_subst(
            prog, array + "_acc", array + "[i1,i2]", parameters="i1, i2")

    common = {
        "precompute_outer_inames": "i_outer, j_outer, k_outer",
        "default_tag": "l.auto",
    }
    prog = lp.precompute(
        prog, "a_acc", "k_inner,i_inner",
        precompute_inames="ktemp,itemp", **common)
    prog = lp.precompute(
        prog, "b_acc", "j_inner,k_inner",
        precompute_inames="itemp,k2temp", **common)

    # NOTE(review): the reference is captured *after* all transformations,
    # so the kernel is compared against itself - confirm this is intended.
    reference = prog

    cl_ctx = ctx_factory()
    lp.auto_test_vs_ref(
        reference, cl_ctx, prog,
        parameters={"n": 128, "m": 128, "ell": 128})
예제 #11
0
def test_funny_shape_matrix_mul(ctx_factory):
    """Auto-test a tiled matrix product with three different extents.

    ``a`` is ``n x m`` and ``b`` is ``m x ell`` with ``m = n + 12`` and
    ``ell = m + 12``.  The kernel is tiled, the accesses to ``a`` and ``b``
    are precomputed, and the result is compared against the untransformed
    kernel.
    """
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,ell >= 1")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
    })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl,
                        "a_acc",
                        "k_inner,i_inner",
                        precompute_outer_inames="i_outer, j_outer, k_outer",
                        default_tag="l.auto")
    knl = lp.precompute(knl,
                        "b_acc",
                        "j_inner,k_inner",
                        precompute_outer_inames="i_outer, j_outer, k_outer",
                        default_tag="l.auto")

    # The product has n*ell outputs, each a sum over m terms (one multiply
    # and one add per term), so the operation count is 2*n*m*ell rather
    # than 2*n**3: the three extents differ in this test.
    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        op_count=[2 * n * m * ell / 1e9],
                        op_label=["GFlops"],
                        parameters={
                            "n": n,
                            "m": m,
                            "ell": ell
                        })
예제 #12
0
def test_matmul(ctx_factory, buffer_inames):
    """Transform a Fortran ``dgemm`` kernel and compare it to the original.

    The parsed kernel is tiled, the accesses to ``a`` and ``b`` are extracted
    into substitution rules and precomputed, ``c`` is buffered over
    *buffer_inames*, and the result is auto-tested with all sizes at 128.
    """
    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    (reference,) = lp.parse_fortran(fortran_src)

    assert len(reference.domains) == 1

    tiled = lp.split_iname(
        reference, "i", 16, outer_tag="g.0", inner_tag="l.1")
    tiled = lp.split_iname(tiled, "j", 8, outer_tag="g.1", inner_tag="l.0")
    tiled = lp.split_iname(tiled, "k", 32)

    for assumption in ("n mod 32 = 0", "m mod 32 = 0", "l mod 16 = 0"):
        tiled = lp.assume(tiled, assumption)

    for array in ("a", "b"):
        tiled = lp.extract_subst(
            tiled, array + "_acc", array + "[i1,i2]", parameters="i1, i2")

    for rule, sweep in (
            ("a_acc", "k_inner,i_inner"),
            ("b_acc", "j_inner,k_inner")):
        tiled = lp.precompute(tiled, rule, sweep)

    tiled = lp.buffer_array(
        tiled, "c",
        buffer_inames=buffer_inames,
        init_expression="0",
        store_expression="base+buffer")

    cl_ctx = ctx_factory()
    lp.auto_test_vs_ref(
        reference, cl_ctx, tiled,
        parameters={"n": 128, "m": 128, "l": 128})
예제 #13
0
def test_global_parallel_reduction(ctx_factory, size):
    """Split a reduction over ``i`` and stage partial results in global memory.

    ``i`` is split twice, the two inner parts are moved inward in the
    reduction, the ``i_outer`` reduction argument is turned into a
    substitution rule and precomputed with global temporary scope, and the
    result is compared to the plain kernel for ``n=size``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, i/13)
            """)

    group_size = 128
    reduced = lp.split_iname(reference, "i", group_size * 20)
    reduced = lp.split_iname(
        reduced, "i_inner", group_size, outer_tag="l.0")

    for inner_iname in ("i_inner_inner", "i_inner_outer"):
        reduced = lp.split_reduction_inward(reduced, inner_iname)

    reduced = reduction_arg_to_subst_rule(reduced, "i_outer")
    reduced = lp.precompute(
        reduced, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL,
        default_tag="l.auto")
    reduced = lp.realize_reduction(reduced)

    # Make writes to acc_i_outer depend on the red_i_outer_arg barrier.
    reduced = lp.add_dependency(
        reduced, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference, cl_ctx, reduced,
        parameters={"n": size},
        print_ref_code=True)
예제 #14
0
def test_precompute_confusing_subst_arguments(ctx_factory):
    """Precompute a rule whose formal argument shares a name with an iname.

    The rule ``D(i)`` uses ``i`` as its parameter name while ``i`` is also a
    kernel iname that gets split.  After the split, the rule's expression
    must not depend on ``i_inner``; ``D`` is then precomputed sweeping ``j``.
    """
    from loopy.symbolic import get_dependencies

    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """)
    reference = lp.add_and_infer_dtypes(reference, {"a": np.float32})

    transformed = lp.tag_inames(reference, {"j": "g.1"})
    transformed = lp.split_iname(
        transformed, "i", 128, outer_tag="g.0", inner_tag="l.0")

    rule_deps = get_dependencies(transformed.substitutions["D"].expression)
    assert "i_inner" not in rule_deps

    transformed = lp.precompute(
        transformed, "D",
        sweep_inames="j",
        precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(
        reference, cl_ctx, transformed, parameters={"n": 12345})
예제 #15
0
def test_precompute_nested_subst(ctx_factory):
    """Precompute a rule (``D``) that itself invokes another rule (``E``).

    After precomputing ``D`` over ``i_inner``, exactly one rule whose name
    starts with "E" must remain, and its expression must not depend on
    ``i_inner``.
    """
    from loopy.symbolic import get_dependencies

    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    reference = lp.add_and_infer_dtypes(reference, {"a": np.float32})

    transformed = lp.tag_inames(reference, {"j": "g.1"})
    transformed = lp.split_iname(
        transformed, "i", 128, outer_tag="g.0", inner_tag="l.0")

    d_deps = get_dependencies(transformed.substitutions["D"].expression)
    assert "i_inner" not in d_deps

    transformed = lp.precompute(transformed, "D", "i_inner")

    # There's only one surviving 'E' rule.
    surviving_e_rules = sum(
        1 for rule_name in transformed.substitutions
        if rule_name.startswith("E"))
    assert surviving_e_rules == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    e_deps = get_dependencies(transformed.substitutions["E"].expression)
    assert "i_inner" not in e_deps

    lp.auto_test_vs_ref(
        reference, cl_ctx, transformed, parameters={"n": 12345})
예제 #16
0
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Parallel reduction over values produced by ``philox4x32_f32``.

    Skipped when the installed PyOpenCL is older than 2016.2.  The reduction
    over ``i`` is split, its ``i_outer`` argument is precomputed into a
    global-scope temporary, and the result is compared against the plain
    kernel for ``n=size``.
    """
    cl_ctx = ctx_factory()

    import pyopencl.version  # noqa
    too_old = cl.version.VERSION < (2016, 2)
    if too_old:
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    from loopy.transform.data import reduction_arg_to_subst_rule

    group_size = 128
    reduced = lp.split_iname(reference, "i", group_size * 20)
    reduced = lp.split_iname(
        reduced, "i_inner", group_size, outer_tag="l.0")

    for inner_iname in ("i_inner_inner", "i_inner_outer"):
        reduced = lp.split_reduction_inward(reduced, inner_iname)

    reduced = reduction_arg_to_subst_rule(reduced, "i_outer")
    reduced = lp.precompute(
        reduced, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL)
    reduced = lp.realize_reduction(reduced)

    # Make writes to acc_i_outer depend on the red_i_outer_arg barrier.
    reduced = lp.add_dependency(
        reduced, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(reference, cl_ctx, reduced, parameters={"n": size})
예제 #17
0
def test_precompute_nested_subst(ctx_factory):
    """Check rule bookkeeping when precomputing a nested substitution rule.

    ``D`` is defined in terms of ``E``.  Once ``D`` has been precomputed over
    ``i_inner``, only one rule starting with "E" may be left, and that rule
    must be free of ``i_inner``.
    """
    from loopy.symbolic import get_dependencies

    def depends_on_i_inner(kernel, rule_name):
        # True if the named rule's expression references 'i_inner'.
        expression = kernel.substitutions[rule_name].expression
        return "i_inner" in get_dependencies(expression)

    cl_ctx = ctx_factory()

    original = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    original = lp.add_and_infer_dtypes(original, {"a": np.float32})

    kernel = lp.tag_inames(original, {"j": "g.1"})
    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    assert not depends_on_i_inner(kernel, "D")
    kernel = lp.precompute(kernel, "D", "i_inner")

    # There's only one surviving 'E' rule.
    e_rules = [name for name in kernel.substitutions if name.startswith("E")]
    assert len(e_rules) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert not depends_on_i_inner(kernel, "E")

    lp.auto_test_vs_ref(original, cl_ctx, kernel, parameters={"n": 12345})
예제 #18
0
def no_test_global_parallel_reduction(ctx_factory, size):
    """Work-in-progress reduction experiment; always stops at ``1/0``.

    Builds a kernel that sums the components of ``philox4x32_f32`` output
    over ``i``, splits the reduction, precomputes the ``i_outer`` reduction
    argument, prints the kernel and then raises ``ZeroDivisionError``.

    NOTE(review): the ``no_test_`` prefix presumably keeps the test runner
    from collecting this function - confirm against the project's pytest
    configuration.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944)  {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # No reference kernel is kept: nothing below compares against one.
    # ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    # Unconditionally raises ZeroDivisionError right after the kernel has
    # been printed (apparently a deliberate debugging stop); the statements
    # below are unreachable as written.
    1/0
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
예제 #19
0
def test_matmul(ctx_factory, buffer_inames):
    """Tile, precompute and buffer a Fortran ``dgemm``; compare to the original.

    ``i``, ``j`` and ``k`` are split, divisibility assumptions are added, the
    accesses to ``a`` and ``b`` are precomputed, and ``c`` is buffered over
    *buffer_inames* before auto-testing with all sizes set to 128.
    """
    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    parsed = lp.parse_fortran(fortran_src)
    (original,) = parsed

    assert len(original.domains) == 1

    kernel = lp.split_iname(
        original, "i", 16, outer_tag="g.0", inner_tag="l.1")
    kernel = lp.split_iname(
        kernel, "j", 8, outer_tag="g.1", inner_tag="l.0")
    kernel = lp.split_iname(kernel, "k", 32)

    # (size parameter, required divisor) pairs.
    for size_param, divisor in (("n", 32), ("m", 32), ("l", 16)):
        kernel = lp.assume(kernel, "%s mod %d = 0" % (size_param, divisor))

    kernel = lp.extract_subst(
        kernel, "a_acc", "a[i1,i2]", parameters="i1, i2")
    kernel = lp.extract_subst(
        kernel, "b_acc", "b[i1,i2]", parameters="i1, i2")
    kernel = lp.precompute(kernel, "a_acc", "k_inner,i_inner")
    kernel = lp.precompute(kernel, "b_acc", "j_inner,k_inner")

    buffer_options = {
        "buffer_inames": buffer_inames,
        "init_expression": "0",
        "store_expression": "base+buffer",
    }
    kernel = lp.buffer_array(kernel, "c", **buffer_options)

    cl_ctx = ctx_factory()
    lp.auto_test_vs_ref(
        original, cl_ctx, kernel, parameters={"n": 128, "m": 128, "l": 128})
예제 #20
0
    def variant_fig33(knl):
        """Transformation variant meant to (mostly) reproduce Figure 3.3.

        Splits ``K`` by 16 into ``Ko``/``Kloc``, precomputes ``dPsi$one``
        over ``dx_axis`` and tags ``j`` as ``ilp.seq``.  Returns the
        transformed kernel together with the list ``["Ko", "Kloc"]``.
        """
        split_size = 16

        split = lp.split_iname(
            knl, "K", split_size, outer_iname="Ko", inner_iname="Kloc")
        precomputed = lp.precompute(
            split, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        tagged = lp.tag_inames(precomputed, {"j": "ilp.seq"})

        return tagged, ["Ko", "Kloc"]
예제 #21
0
def test_precompute_some_exist(ctx_factory):
    """Precompute two rules whose requested precompute inames overlap.

    ``a_acc`` is precomputed with inames ``ktemp,itemp`` and ``b_acc`` with
    ``itemp,k2temp``; ``itemp`` therefore already exists for the second call.
    The resulting kernel is passed to ``lp.auto_test_vs_ref``.
    """
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    (kernel,) = lp.parse_fortran(fortran_src)

    assert len(kernel.domains) == 1

    kernel = lp.split_iname(
        kernel, "i", 8, outer_tag="g.0", inner_tag="l.1")
    kernel = lp.split_iname(
        kernel, "j", 8, outer_tag="g.1", inner_tag="l.0")
    kernel = lp.split_iname(kernel, "k", 8)

    for size_param in ("n", "m", "ell"):
        kernel = lp.assume(kernel, size_param + " mod 8 = 0")

    for array in ("a", "b"):
        kernel = lp.extract_subst(
            kernel, array + "_acc", array + "[i1,i2]", parameters="i1, i2")

    # (rule, sweep inames, precompute inames) for each precompute call.
    precomputes = (
        ("a_acc", "k_inner,i_inner", "ktemp,itemp"),
        ("b_acc", "j_inner,k_inner", "itemp,k2temp"),
    )
    for rule, sweep, new_inames in precomputes:
        kernel = lp.precompute(
            kernel, rule, sweep,
            precompute_inames=new_inames,
            default_tag="l.auto")

    # NOTE(review): the reference is captured *after* all transformations,
    # so the kernel is compared against itself - confirm this is intended.
    reference = kernel

    cl_ctx = ctx_factory()
    lp.auto_test_vs_ref(
        reference, cl_ctx, kernel,
        parameters={"n": 128, "m": 128, "ell": 128})
예제 #22
0
    def variant_fig33(knl):
        """Return *knl* transformed to (mostly) reproduce Figure 3.3.

        The second element of the returned tuple is the list
        ``["Ko", "Kloc"]``, the two inames created by splitting ``K``.
        """
        new_inames = ["Ko", "Kloc"]
        outer, inner = new_inames

        # Split K into blocks of 16.
        result = lp.split_iname(
            knl, "K", 16, outer_iname=outer, inner_iname=inner)
        result = lp.precompute(
            result, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        result = lp.tag_inames(result, {"j": "ilp.seq"})

        return result, new_inames
예제 #23
0
    def variant_fig32(knl):
        """Transformation variant that (mostly) reproduces Figure 3.2.

        Splits ``K`` by 16 into ``Ko``/``Kloc``, precomputes ``dPsi`` over
        ``i``, ``q`` and ``dx_axis``, and unrolls ``dx_axis`` and ``dxi``.
        Returns the transformed kernel and a list of names.
        """
        split_size = 16

        split = lp.split_iname(
            knl, "K", split_size, outer_iname="Ko", inner_iname="Kloc")
        precomputed = lp.precompute(
            split, "dPsi", np.float32, ["i", "q", "dx_axis"],
            default_tag=None)
        unrolled = lp.tag_inames(
            precomputed, {"dx_axis": "unr", "dxi": "unr"})

        names = ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
        return unrolled, names
예제 #24
0
    def variant_fig32(knl):
        """Return *knl* transformed to (mostly) reproduce Figure 3.2.

        The kernel is returned together with the list
        ``["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]``.
        """
        # Split K into blocks of 16.
        result = lp.split_iname(
            knl, "K", 16, outer_iname="Ko", inner_iname="Kloc")

        sweep = ["i", "q", "dx_axis"]
        result = lp.precompute(
            result, "dPsi", np.float32, sweep, default_tag=None)

        unroll_tags = dict.fromkeys(("dx_axis", "dxi"), "unr")
        result = lp.tag_inames(result, unroll_tags)

        return result, [
            "Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
예제 #25
0
def test_funny_shape_matrix_mul(ctx_factory):
    """Auto-test a tiled matrix product with three different extents.

    ``a`` is ``n x m`` and ``b`` is ``m x ell`` with ``m = n + 12`` and
    ``ell = m + 12``.  The kernel is tiled, the accesses to ``a`` and ``b``
    are precomputed, and the result is compared against the untransformed
    kernel.
    """
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n+12
    ell = m+12

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            default_tag="l.auto")

    # The product has n*ell outputs, each a sum over m terms (one multiply
    # and one add per term), so the operation count is 2*n*m*ell rather
    # than 2*n**3: the three extents differ in this test.
    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n*m*ell/1e9], op_label=["GFlops"],
            parameters={"n": n, "m": m, "ell": ell})
예제 #26
0
    def variant_orig(knl):
        """Apply the "orig" set of transformations to *knl* and return it.

        Tags ``i``/``j``/``e``, prefetches ``D``, ``u`` and both ``G``
        variants, precomputes ``ur``, ``us``, ``Gux`` and ``Guy``, unrolls
        ``o`` and ``m``, raises the priority of ``D_fetch`` and prints the
        resulting kernel.
        """
        knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

        for footprint in ("D[:,:]", "u[e, :, :]"):
            knl = lp.add_prefetch(knl, footprint)

        precomputes = (
            ("ur(m,j)", ["m", "j"]),
            ("us(i,m)", ["i", "m"]),
            ("Gux(m,j)", ["m", "j"]),
            ("Guy(i,m)", ["i", "m"]),
        )
        for rule, sweep in precomputes:
            knl = lp.precompute(knl, rule, sweep)

        for footprint in ("G$x[:,e,:,:]", "G$y[:,e,:,:]"):
            knl = lp.add_prefetch(knl, footprint)

        for iname in ("o", "m"):
            knl = lp.tag_inames(knl, {iname: "unr"})

        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl
예제 #27
0
    def variant_orig(knl):
        """Return *knl* after the "orig" transformation sequence.

        The sequence is: tag inames, prefetch ``D`` and ``u``, precompute
        four substitution rules, prefetch ``G$x`` and ``G$y``, unroll ``o``
        and ``m``, prioritise the ``D_fetch`` instruction, print the kernel.
        """
        tagged = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

        fetched = lp.add_prefetch(tagged, "D[:,:]")
        fetched = lp.add_prefetch(fetched, "u[e, :, :]")

        m_j = ["m", "j"]
        i_m = ["i", "m"]
        staged = lp.precompute(fetched, "ur(m,j)", m_j)
        staged = lp.precompute(staged, "us(i,m)", i_m)
        staged = lp.precompute(staged, "Gux(m,j)", m_j)
        staged = lp.precompute(staged, "Guy(i,m)", i_m)

        staged = lp.add_prefetch(staged, "G$x[:,e,:,:]")
        staged = lp.add_prefetch(staged, "G$y[:,e,:,:]")

        unrolled = lp.tag_inames(staged, {"o": "unr"})
        unrolled = lp.tag_inames(unrolled, {"m": "unr"})

        final = lp.set_instruction_priority(unrolled, "id:D_fetch", 5)
        print(final)

        return final
예제 #28
0
파일: test_apps.py 프로젝트: arghdos/loopy
def test_fd_1d(ctx_factory):
    """Auto-test a 1D forward difference with a precomputed access to ``u``.

    ``i`` is split by 16, the access ``u[j]`` is extracted into the rule
    ``u_acc`` and precomputed over ``i_inner``, and ``n`` is assumed to be a
    multiple of 16.  The result is compared to the plain kernel for
    ``n=2048``.
    """
    cl_ctx = ctx_factory()

    reference = lp.add_and_infer_dtypes(
        lp.make_kernel("{[i]: 0<=i<n}", "result[i] = u[i+1]-u[i]"),
        {"u": np.float32})

    transformed = lp.split_iname(reference, "i", 16)
    transformed = lp.extract_subst(
        transformed, "u_acc", "u[j]", parameters="j")
    transformed = lp.precompute(
        transformed, "u_acc", "i_inner", default_tag="for")
    transformed = lp.assume(transformed, "n mod 16 = 0")

    lp.auto_test_vs_ref(
        reference, cl_ctx, transformed, parameters={"n": 2048})
예제 #29
0
파일: test_loopy.py 프로젝트: dokempf/loopy
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel and run each stage.

    The fused kernel is executed once as-is, once after ``f`` has been turned
    into a substitution rule, and once after remapping/splitting ``i`` and
    precomputing ``f_subst`` with a bounding-box fetch.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_args = [
        lp.GlobalArg("f", shape="n+2"),
        lp.GlobalArg("u", shape="n+2"),
    ]
    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}", "f[j] = u[j]**2/2", flux_args)

    fused = lp.fuse_kernels(
        [fin_diff_knl, flux_knl], data_flow=[("f", 1, 0)])

    fused = lp.set_options(fused, write_cl=True)
    evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    fused = lp.assignment_to_subst(fused, "f")
    fused = lp.set_options(fused, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused(queue, u=u, h=np.float32(1e-1))

    remapped = lp.affine_map_inames(fused, "i", "inew", "inew+1=i")
    on_gpu = lp.split_iname(
        remapped, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomputed = lp.precompute(
        on_gpu, "f_subst", "inew_inner", fetch_bounding_box=True)
    precomputed = lp.tag_inames(precomputed, {"j_0_outer": "unr"})
    precomputed = lp.set_options(precomputed, return_dict=True)

    # The final run passes the grid spacing computed above as ``h``.
    evt, _ = precomputed(queue, u=u, h=h)
예제 #30
0
def test_precompute_with_preexisting_inames_fail():
    """Expect ``lp.precompute`` to reject a second use of the same precompute inames.

    ``D1_subst`` is precomputed over ``i,j`` with ``precompute_inames="ii,jj"``.
    The same inames are then requested for ``D2_subst`` swept over ``i,k``,
    where ``k`` has a different extent (``2*n``) in the domain; this second
    call must raise :class:`lp.LoopyError`.
    """
    domain = "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}"
    instructions = """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """
    knl = lp.make_kernel(domain, instructions)

    input_dtypes = dict.fromkeys(("u", "D1", "D2"), np.float32)
    knl = lp.add_and_infer_dtypes(knl, input_dtypes)
    knl = lp.fix_parameters(knl, n=13)

    for matrix_name in ("D1", "D2"):
        knl = lp.extract_subst(
            knl, matrix_name + "_subst", matrix_name + "[ii,jj]",
            parameters="ii,jj")

    # The first precompute with ii,jj is expected to succeed ...
    knl = lp.precompute(
        knl, "D1_subst", "i,j",
        default_tag="for", precompute_inames="ii,jj")

    # ... the second one, reusing ii,jj, is expected to fail.
    with pytest.raises(lp.LoopyError):
        lp.precompute(
            knl, "D2_subst", "i,k",
            default_tag="for", precompute_inames="ii,jj")
예제 #31
0
파일: test_loopy.py 프로젝트: dokempf/loopy
def test_finite_difference_expr_subst(ctx_factory):
    """Exercise fuse -> assignment_to_subst -> precompute on a 1D stencil.

    The fused kernel is executed three times: right after fusion, after the
    intermediate ``f`` became a substitution rule, and after that rule was
    precomputed. Only successful execution is checked.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2 * np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    def run(kernel, step):
        # Unpacking keeps the expectation that a call yields an
        # (event, outputs) pair.
        evt, outputs = kernel(queue, u=u, h=step)
        return evt, outputs

    stencil_args = [lp.GlobalArg("out", shape="n+2"), "..."]
    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}", "out[i] = -(f[i+1] - f[i-1])/h", stencil_args)

    flux_args = [
        lp.GlobalArg("f", shape="n+2"),
        lp.GlobalArg("u", shape="n+2"),
    ]
    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}", "f[j] = u[j]**2/2", flux_args)

    kernels_to_fuse = [fin_diff_knl, flux_knl]
    fused_knl = lp.fuse_kernels(kernels_to_fuse, data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    run(fused_knl, np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")
    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: the automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    run(fused_knl, np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    split_tags = {"outer_tag": "g.0", "inner_tag": "l.0"}
    gpu_knl = lp.split_iname(fused0_knl, "inew", 128, **split_tags)

    precomp_knl = lp.precompute(
        gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)

    run(precomp_knl, h)
예제 #32
0
def test_precompute_does_not_lead_to_dep_cycle(ctx_factory):
    """Check that precomputing ``tmp1_subst`` still gives a kernel matching the original.

    The temporary ``tmp1`` is read by two later instructions. It is turned
    into a substitution rule and precomputed again; the result is compared
    against the untransformed kernel.

    See https://github.com/inducer/loopy/issues/498
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}", """
        <> tmp0[i] = 2 * i
        <> tmp1[i] = 2 * tmp0[i]
        <> tmp2[i] = 3 * tmp1[i]
        out[i] = 2*tmp1[i] + 3*tmp2[i]
        """)
    # Snapshot taken before any transformation: this is the reference.
    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "tmp1")
    knl = lp.precompute(knl, "tmp1_subst")

    # Reference kernel goes first, the transformed kernel under test last,
    # matching every other auto_test_vs_ref call in this file.
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
예제 #33
0
def test_fd_1d(ctx_factory):
    """Compare a 1D forward difference with a split/precomputed variant.

    The access ``u[j]`` is extracted into the rule ``u_acc`` and precomputed
    over ``i_inner`` after splitting ``i`` by 16. ``n`` is assumed to be a
    multiple of 16, and the comparison runs with ``n=2048``.
    """
    ctx = ctx_factory()

    base_knl = lp.make_kernel("{[i]: 0<=i<n}", "result[i] = u[i+1]-u[i]")
    base_knl = lp.add_and_infer_dtypes(base_knl, {"u": np.float32})

    # Keep the untransformed kernel around as the reference.
    ref_knl = base_knl

    test_knl = lp.split_iname(base_knl, "i", 16)
    test_knl = lp.extract_subst(test_knl, "u_acc", "u[j]", parameters="j")
    test_knl = lp.precompute(
        test_knl, "u_acc", "i_inner", default_tag="for")
    test_knl = lp.assume(test_knl, "n mod 16 = 0")

    lp.auto_test_vs_ref(ref_knl, ctx, test_knl, parameters={"n": 2048})
예제 #34
0
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a sum reduction with a two-level split variant.

    The reduction over ``i`` is split twice, the argument of the ``i_outer``
    reduction is converted into a substitution rule and precomputed into a
    ``GLOBAL`` temporary, and the result is checked against the plain kernel
    for ``n=size``.
    """
    ctx = ctx_factory()

    reduction_src = """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """
    knl = lp.make_kernel("{[i]: 0 <= i < n }", reduction_src)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    chunk_size = gsize * 20

    knl = lp.split_iname(knl, "i", chunk_size)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the
    # barrier, otherwise useless save/reload code will be generated.
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters={"n": size},
        print_ref_code=True)
예제 #35
0
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Compare a reduction over Philox-generated values with a split variant.

    The test is skipped when the installed PyOpenCL is older than 2016.2.
    The transformed kernel precomputes the ``i_outer`` reduction argument
    into a ``GLOBAL`` temporary and is compared against the untransformed
    kernel for ``n=size``.
    """
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    too_old = cl.version.VERSION < (2016, 2)
    if too_old:
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    rng_reduction_src = """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """
    knl = lp.make_kernel("{[i]: 0 <= i < n }", rng_reduction_src)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")

    for reduction_iname in ("i_inner_inner", "i_inner_outer"):
        knl = lp.split_reduction_inward(knl, reduction_iname)

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL,
        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
예제 #36
0
def test_precompute_confusing_subst_arguments(ctx_factory):
    """Precompute a rule whose formal argument shares its name with an iname.

    The rule ``D(i)`` is invoked as ``D(j)`` while ``i`` is also a kernel
    iname that gets split. The rule body must not pick up ``i_inner`` from
    that split, and the precomputed kernel must match the original.
    """
    ctx = ctx_factory()

    rule_src = """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """
    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<5}", rule_src)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.tag_inames(knl, {"j": "g.1"})
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    rule_deps = get_dependencies(knl.substitutions["D"].expression)
    # Splitting the kernel iname i must leave the rule's own i untouched.
    assert "i_inner" not in rule_deps

    knl = lp.precompute(knl, "D")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 12345})
예제 #37
0
def test_global_parallel_reduction(ctx_factory, size):
    """Check a split sum reduction (``temporary_scope`` API) against the plain kernel.

    After two iname splits and an outward/inward reduction split, the
    ``i_outer`` reduction argument is precomputed into a ``GLOBAL`` temporary.
    The transformed kernel is compared to the original for ``n=size``.
    """
    ctx = ctx_factory()

    domain = "{[i]: 0 <= i < n }"
    instructions = """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """
    ref_knl = lp.add_and_infer_dtypes(
        lp.make_kernel(domain, instructions), {"a": np.float32})
    knl = ref_knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    precompute_options = {
        "temporary_scope": lp.temp_var_scope.GLOBAL,
        "default_tag": "l.auto",
    }
    knl = lp.precompute(
        knl, "red_i_outer_arg", "i_outer", **precompute_options)
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the
    # barrier, otherwise useless save/reload code will be generated.
    knl = lp.add_dependency(
        knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters={"n": size},
        print_ref_code=True)
예제 #38
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Transform the fused R/S kernels from ``strongVolumeKernels.f90`` and test them.

    The Fortran source is parsed, the two kernels whose names contain
    ``KernelR``/``KernelS`` are fused, and a fixed sequence of transformations
    is applied. ``opt_level`` selects which intermediate stage of that
    sequence is actually tested (see ``tap_hsv`` below). Operation and
    global-memory statistics are printed, and the selected kernel is compared
    against the reference via ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context handed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, iname ``k`` is split with an
        ``ilp``-tagged inner part.
        NOTE(review): the split factor is hard-coded to 2; this value only
        toggles the split -- confirm that is intended.
    :arg Nq: value fixed for the kernel parameter ``Nq``.
    :arg opt_level: index of the transformation stage to test; values of 8
        and above select the final stage.
        NOTE(review): a value below 0 never assigns ``tap_hsv`` and would
        fail with a NameError at ``hsv = tap_hsv``.
    """
    ctx = ctx_factory()

    # NOTE(review): relative path -- presumably resolved against the current
    # working directory of the test run.
    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # Replace the "datafloat" placeholder with a concrete Fortran type.
    source = source.replace("datafloat", "real*4")

    # Exactly two kernels must match, otherwise the unpacking fails.
    # NOTE(review): assumes the KernelR kernel precedes the KernelS kernel in
    # the parse result -- confirm.
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    # Tag the instructions of each kernel so they can still be told apart
    # after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Reference kernel: everything below is compared against this state.
    ref_hsv = hsv

    # tap_hsv snapshots the kernel at the stage selected by opt_level. The
    # transformations after the snapshot are still applied to hsv, but are
    # discarded when `hsv = tap_hsv` is executed further down.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        # Each tagged instruction must have exactly one assignee.
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the R and S kernels; each
    # list is aliased as a group after the loop.
    rtmps = []
    stmps = []

    # Running counter used to build unique "flux_store_<n>" names.
    flux_store_idx = 0

    # R and S flux instructions are processed pairwise (zip stops at the
    # shorter of the two lists).
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel is expected to read the
            # flux variable.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # The axis order of the flux temporary depends on which kernel
            # the flux variable belongs to.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Derive a per-flux name for iname n in the reading instruction:
            # drop the "_r"/"_s" markers and a trailing "_0".
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining local_prep rules, except those whose name
    # starts with "Jinv" or contains "_s".
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        # These tags are applied to the snapshot only, not to hsv.
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # On the main line the *_inner field inames are tagged "vec", whereas the
    # snapshots for opt_level 5 and 6 tag them "unr".
    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on only the stage selected by opt_level is used.
    hsv = tap_hsv

    # Always-on diagnostics block (the `if 1` is a manual on/off switch).
    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    # The result mapping is expected to contain the wall time under
    # "elapsed_wall".
    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
예제 #39
0
def test_laplacian_lmem(ctx_factory):
    """Compare the ``semlap`` Laplacian kernel with a prefetch/precompute variant.

    The kernel is built for ``n = 4`` points per direction with a run-time
    element count ``K``. The derivative rules ``ur``/``us``/``ut`` are
    precomputed, ``u`` and ``D`` are prefetched, and all generated schedules
    are compared against the untransformed kernel for ``K = 1000``.
    """
    ctx = ctx_factory()
    dtype = np.float32
    order = "C"
    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n)
    instructions = [
        "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
        "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
        "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

        "lap[e,i,j,k]  = "
        "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
    ]
    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        for rule_name in ("ur", "us", "ut"):
            knl = lp.precompute(knl, rule_name, np.float32, ["a", "b", "c"])
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])
    else:
        # experiment (never executed while the switch above is 1)
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        for rule_name in ("eu", "ur", "us", "ut"):
            knl = lp.precompute(knl, rule_name, np.float32, ["b", "c"])
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])

    # Disabled experiments, kept for reference:
    # knl = lp.add_prefetch(knl, "G", [2,3,4]) # axis/argument indices on G
    # knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"]) # axis/argument indices on G
    # print(knl)
    # 1/0

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    # print(seq_knl)
    # print(lp.preprocess_kernel(knl))
    # 1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})

    K = 1000
    flops_per_element = n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=K*flops_per_element/1e9,
        op_label="GFlops",
        parameters={"K": K})
예제 #40
0
def test_laplacian_lmem(ctx_factory):
    """Check the ``semlap`` kernel with ``l.auto``-tagged prefetches and precomputes.

    ``u`` and ``D`` are prefetched and the rules ``ur``/``us``/``ut`` are
    precomputed, all with ``default_tag="l.auto"``. Every generated schedule
    is compared with the untransformed kernel for ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)
    auto_local = {"default_tag": "l.auto"}

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n,
        [
            "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
            "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
            "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
        ],
        [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], **auto_local)
        for rule_name in ("ur", "us", "ut"):
            knl = lp.precompute(
                knl, rule_name, np.float32, ["a", "b", "c"], **auto_local)
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], **auto_local)
    else:
        # experiment (never executed while the switch above is 1)
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        for rule_name in ("eu", "ur", "us", "ut"):
            knl = lp.precompute(
                knl, rule_name, np.float32, ["b", "c"], **auto_local)
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], **auto_local)

    # Disabled experiments, kept for reference:
    # knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G
    # knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto") # axis/argument indices on G
    # print(knl)
    # 1/0

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    # print(seq_knl)
    # print(lp.preprocess_kernel(knl))
    # 1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})

    K = 1000
    ops_per_element = (
        n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3)
    giga_ops = K * ops_per_element / 1e9
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=giga_ops,
        op_label="GFlops",
        parameters={"K": K})
예제 #41
0
파일: test_apps.py 프로젝트: inducer/loopy
 def variant_1(knl):
     """Return *knl* with ``dpsi`` precomputed and the loop order ``c,i,j`` prioritized."""
     precomputed = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
     return lp.prioritize_loops(precomputed, "c,i,j")
예제 #42
0
파일: test_sem.py 프로젝트: cmsquared/loopy
def test_laplacian_lmem_ilp(ctx_factory):
    """Generate code for the ``semlap`` kernel with an ILP-split element loop.

    This does not lead to practical/runnable code (out of lmem), but it is an
    excellent stress test for the code generator. Nothing is executed: every
    checked schedule is only turned into code and printed.
    """
    ctx = ctx_factory()
    dtype = np.float32
    order = "C"
    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    domain = "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n
    instructions = [
        "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

        "lap[e,i,j,k]  = "
        "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    for rule_name in ("ur", "us", "ut"):
        knl = lp.precompute(
            knl, rule_name, np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    # print seq_knl
    # 1/0

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})

    for scheduled_knl in kernel_gen:
        print(lp.generate_code(scheduled_knl))
예제 #43
0
def test_laplacian_lmem_ilp(ctx_factory):
    """Stress the code generator with an ILP-tagged ``semlap`` kernel.

    This does not lead to practical/runnable code (out of lmem), but it is an
    excellent stress test for the code generator. The generated code of each
    checked schedule is printed; the kernel itself is never run.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)
    ilp_iname = "e_inner_inner"

    def field_arg(name, shape):
        # All array arguments share dtype and memory order.
        return lp.GlobalArg(name, dtype, shape=shape, order=order)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
        [
            "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
            "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
            "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
        ],
        [
            field_arg("u", field_shape),
            field_arg("lap", field_shape),
            field_arg("G", (6, ) + field_shape),
            field_arg("D", (n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, ilp_iname])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, ilp_iname])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, ilp_iname])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, ilp_iname])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", ilp_iname])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    # print seq_knl
    # 1/0

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    checked_schedules = lp.check_kernels(
        lp.generate_loop_schedules(knl), {"K": 1000})

    for schedule in checked_schedules:
        print(lp.generate_code(schedule))
예제 #44
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse, transform and verify the horizontal strong-volume kernels.

    Reads Fortran source from ``strongVolumeKernels.f90``, picks the two
    parsed kernels whose names contain ``KernelR`` / ``KernelS``, fuses them,
    and applies a fixed sequence of loopy transformations.  A snapshot of the
    kernel is captured at the stage selected by *opt_level*; that snapshot is
    then compared against the untransformed reference kernel with
    ``lp.auto_test_vs_ref`` and the measured wall time is printed.

    :arg ctx_factory: zero-argument callable; its result is passed as the
        context to ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, iname ``k`` is split (by a fixed
        factor of 2) with the inner part tagged ``ilp``.
    :arg Nq: value substituted for the kernel parameter ``Nq``.
    :arg opt_level: selects which pipeline stage is actually tested.  Values
        0 through 7 pick the kernel as it exists right after the matching
        stage; any value ``>= 8`` picks the fully transformed kernel.
    """
    ctx = ctx_factory()

    # The Fortran source is read from the current working directory.
    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # Replace the "datafloat" placeholder with a concrete Fortran type
    # before handing the text to the parser.
    source = source.replace("datafloat", "real*4")

    # Exactly two kernels must match the name filter, otherwise the tuple
    # unpacking raises.  NOTE(review): this relies on the "KernelR" kernel
    # being yielded before the "KernelS" one -- confirm parse order.
    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    # Tag every instruction with its kernel of origin so that the two halves
    # can still be told apart (via "tag:rknl" / "tag:sknl") after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    # Helper transforms that live in a separate module next to this test.
    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Everything below is an optimization; keep the kernel as it is now as
    # the reference for the final comparison.
    ref_hsv = hsv

    # From here on, ``tap_hsv`` records the kernel at the stage selected by
    # ``opt_level``.  The remaining transformations are always applied to
    # ``hsv``, but only the tapped snapshot is tested at the end.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        # Each such instruction must write exactly one variable.
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    # Flux-computing instructions, looked up separately for the two fused
    # source kernels.
    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # NOTE(review): the split factor is the literal 2, not
        # ``ilp_multiple`` -- confirm this is intended.
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created below, collected per kernel so
    # that each group can be aliased afterwards.
    rtmps = []
    stmps = []

    # Running counter used to give every flux temporary a unique name.
    flux_store_idx = 0

    # Process the r and s flux instructions pairwise; for each pair the same
    # steps run once per kernel with the kernel-specific inames.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                  ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                  ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                  ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # The single instruction of this kernel that reads the flux
            # variable; looked up before the assignment is turned into a
            # substitution rule, and used further down for the iname rename.
            reader, = lp.find_instructions(hsv,
                  "tag:{knl_tag} and reads:{flux_var}"
                  .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            # Precompute the rule created by assignment_to_subst into the
            # explicitly named temporary.
            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            # The first two axis tags are swapped between the "_s" fluxes
            # and the remaining ones.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Derive a per-flux replacement for iname "n": strip the "_r" /
            # "_s" markers and a trailing "_0" from the flux variable name.
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            # Rename only inside the reader instruction; the target iname
            # may already exist from an earlier iteration.
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining "local_prep" rules, skipping the ones whose
    # name starts with "Jinv" or contains "_s".
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        # Only the tapped copy gets these "unr" tags; ``hsv`` itself is
        # tagged differently further down.
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    # Some of these inames may not exist, hence ignore_nonexistent.
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    # For the main pipeline the inner field inames are tagged "vec" rather
    # than "unr" as in the tapped copies above.
    hsv = lp.tag_inames(hsv, dict(
          rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
          rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
          Q_dim_field_inner="vec",
          Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Continue with the snapshot selected by ``opt_level``.
    # NOTE(review): ``tap_hsv`` is unbound here (NameError) if ``opt_level``
    # matched none of the branches above, e.g. a negative value.
    hsv = tap_hsv

    # Always-true switch around the statistics printout.
    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    # Compare the selected kernel against the untransformed reference.
    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
예제 #45
0
 def variant_1(knl):
     """Return *knl* after precomputing ``dpsi`` and prioritizing loops.

     ``lp.precompute`` is applied to ``dpsi`` with the inames ``i,k,ell``
     and ``for`` as the default tag; the result is then given the loop
     priority ``c,i,j``.
     """
     with_dpsi = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
     return lp.set_loop_priority(with_dpsi, "c,i,j")
예제 #46
0
 def variant_2(knl):
     """Return *knl* after precomputing ``dpsi`` and prioritizing loops.

     ``lp.precompute`` is applied to ``dpsi`` with the inames ``i,ell``
     and ``for`` as the default tag; the result is then given the loop
     priority ``c,i,j`` via ``lp.set_loop_priority``.
     """
     with_dpsi = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
     return lp.set_loop_priority(with_dpsi, "c,i,j")
예제 #47
0
 def variant_2(knl):
     """Return *knl* after precomputing ``dpsi`` and prioritizing loops.

     ``lp.precompute`` is applied to ``dpsi`` with the inames ``i,ell``
     and ``for`` as the default tag; the result is then given the loop
     priority ``c,i,j`` via ``lp.prioritize_loops``.
     """
     with_dpsi = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
     return lp.prioritize_loops(with_dpsi, "c,i,j")