Пример #1
0
def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that a float32-only kernel never mentions ``double`` in its code.

    The kernel subtracts two float32 arrays via an intermediate ``bb``
    assignment. For every loop schedule, the generated code must not
    contain the substring "double".
    """
    cl_ctx = ctx_factory()

    instructions = """
                <> bb = a[i] - b[i]
                c[i] = bb
                """
    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=("n", )),
        lp.GlobalArg("b", np.float32, shape=("n", )),
        lp.GlobalArg("c", np.float32, shape=("n", )),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        instructions,
        kernel_args,
        assumptions="n>=1")

    kernel = lp.preprocess_kernel(kernel, cl_ctx.devices[0])
    for scheduled in lp.generate_loop_schedules(kernel):
        generated = lp.generate_code(scheduled)
        assert "double" not in generated
Пример #2
0
def test_sized_and_complex_literals(ctx_factory):
    """Run a kernel using the ``5jf``, ``5j`` and ``5f`` literals against itself.

    The same kernel is passed to ``lp.auto_test_vs_ref`` as both the
    reference and the tested kernel, with ``n=5``.
    """
    cl_ctx = ctx_factory()

    instructions = """
                <> aa = 5jf
                <> bb = 5j
                a[i] = imag(aa)
                b[i] = imag(bb)
                c[i] = 5f
                """
    float_vectors = [
        lp.GlobalArg(name, np.float32, shape=("n", ))
        for name in ("a", "b", "c")
    ]
    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        instructions,
        float_vectors + [lp.ValueArg("n", np.int32)],
        assumptions="n>=1")

    lp.auto_test_vs_ref(kernel, cl_ctx, kernel, parameters={"n": 5})
Пример #3
0
def test_vector_types(ctx_factory, vec_len):
    """Compare a vector-tagged, split kernel against its untransformed form.

    ``vec_len`` is substituted for the ``vec_len`` domain parameter before
    the reference kernel is taken.
    """
    cl_ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=lp.auto),
        lp.GlobalArg("out", np.float32, shape=lp.auto),
        "...",
    ]
    reference = lp.make_kernel(
        "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
        "out[i,j] = 2*a[i,j]",
        kernel_args)
    reference = lp.fix_parameters(reference, vec_len=vec_len)

    # Everything below transforms a copy; ``reference`` stays as it is now.
    transformed = lp.tag_data_axes(reference, "out", "c,vec")
    transformed = lp.tag_inames(transformed, {"j": "unr"})
    transformed = lp.split_iname(
        transformed, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(
        reference, cl_ctx, transformed, parameters={"n": 20000})
Пример #4
0
def test_conditional(ctx_factory):
    """Run a kernel with ``if=``-predicated instructions against itself."""
    # For debugging, enable: logging.basicConfig(level=logging.DEBUG)
    cl_ctx = ctx_factory()

    instructions = """
                <> my_a = a[i,j] {id=read_a}
                <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j}
                my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero}
                my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero}
                out[i,j] = 2*my_a {dep=aplus}
                """
    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=lp.auto),
        lp.GlobalArg("out", np.float32, shape=lp.auto),
        "...",
    ]
    kernel = lp.make_kernel("{ [i,j]: 0<=i,j<n }", instructions, kernel_args)

    # The reference and the tested kernel are the same object here.
    lp.auto_test_vs_ref(kernel, cl_ctx, kernel, parameters={"n": 200})
Пример #5
0
def test_generate_c_snippet():
    """Build a two-reduction kernel for ``CTarget`` and print its body.

    Both instructions reduce over ``k`` with ``allow_simultaneous=True``.
    The ``k`` loop is split by 4 before the kernel is preprocessed,
    scheduled and passed to ``lp.generate_body``.
    """
    from loopy.target.c import CTarget
    from pymbolic import var

    # eN is not referenced below; it is created to mirror the symbol set.
    I, f, df, q_v, eN, k, u = (  # noqa
        var(name) for name in ("I", "f", "df", "q_v", "eN", "k", "u"))

    def simultaneous_sum(inames, expr):
        # "sum" reduction whose iname may be shared with other reductions
        return lp.Reduction("sum", inames, expr, allow_simultaneous=True)

    instructions = [
        lp.Assignment(f[I], simultaneous_sum(k, q_v[k, I]*u)),
        lp.Assignment(df[I], simultaneous_sum(k, q_v[k, I])),
    ]
    kernel_args = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]
    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_args,
        target=CTarget(),
        assumptions="nQuad>=1")

    play_with_prefetching = False  # switch on to experiment with prefetching
    if play_with_prefetching:
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(scheduled))
Пример #6
0
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    """Compare a tiled matrix multiply that prefetches ``a`` to the plain one.

    ``i`` and ``j`` are each split twice (group axis, then local axis with
    an ``ilp`` inner part) and ``k`` is split by 16 before ``a`` is
    prefetched.
    """
    cl_ctx = ctx_factory()
    dtype = np.float32
    order = "C"
    n = 6 * 16 * 2

    def square_matrix(name):
        return lp.GlobalArg(name, dtype, shape=(n, n), order=order)

    seq_knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<%d}" % n,
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        [square_matrix("a"), square_matrix("b"), square_matrix("c")],
        name="matmul")

    i_reg = j_reg = 2
    i_chunks = j_chunks = 16

    # (iname, its inner iname, register tile, chunk count, group tag, local tag)
    tiling = (
        ("i", "i_inner", i_reg, i_chunks, "g.0", "l.0"),
        ("j", "j_inner", j_reg, j_chunks, "g.1", "l.1"),
    )
    knl = seq_knl
    for iname, inner_iname, reg, chunks, group_tag, local_tag in tiling:
        knl = lp.split_iname(knl, iname, reg * chunks, outer_tag=group_tag)
        knl = lp.split_iname(
            knl, inner_iname, reg, outer_tag=local_tag, inner_tag="ilp")

    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(
        knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(
        seq_knl,
        cl_ctx,
        knl,
        op_count=[2 * n**3 / 1e9],
        op_label=["GFlops"],
        parameters={})
Пример #7
0
 def get_source_args(self):
     """Return the direction-vector argument plus the inner kernel's source args.

     The first entry wraps a ``lp.GlobalArg`` named ``self.dir_vec_name`` of
     shape ``(self.dim, "nsources")``; the entries from
     ``self.inner_kernel.get_source_args()`` follow it.
     """
     dir_vec_arg = KernelArgument(
         loopy_arg=lp.GlobalArg(
             self.dir_vec_name,
             None,
             shape=(self.dim, "nsources"),
             dim_tags="sep,C"))
     inner_args = self.inner_kernel.get_source_args()
     return [dir_vec_arg] + inner_args
Пример #8
0
def test_reduction_with_conditional():
    """Check that a realized reduction inherits its instruction's predicate.

    Uses the C target because, per the original author's note, the PyOpenCL
    target would hoist the conditional into host code for an example this
    small.
    """
    instructions = """
                if n > 0
                    <>b = sum(i, a[i])
                end
                """
    kernel_args = [
        lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
        lp.GlobalArg("n", dtype=np.float32, shape=()),
    ]
    kernel = lp.make_kernel(
        "{ [i] : 0<=i<42 }",
        instructions,
        kernel_args,
        target=lp.CTarget())
    body = lp.generate_body(kernel)

    # The first "if" must come before the first "for", i.e. the conditional
    # wraps the loop that realizes the reduction.
    if_position = body.index("if")
    for_position = body.index("for")
    assert if_position < for_position
Пример #9
0
def test_write_parameter(ctx_factory):
    """Expect ``RuntimeError`` when compiling a kernel that assigns to ``n``.

    ``n`` is both the domain parameter and the target of ``n = 15``.
    """
    cl_ctx = ctx_factory()
    dtype = np.float32

    instructions = """
                a = sum((i,j), i*j)
                b = sum(i, sum(j, i*j))
                n = 15
                """
    kernel_args = [
        lp.GlobalArg("a", dtype, shape=()),
        lp.GlobalArg("b", dtype, shape=()),
        lp.ValueArg("n", np.int32, approximately=1000),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }",
        instructions,
        kernel_args,
        assumptions="n>=1")

    import pytest
    with pytest.raises(RuntimeError):
        lp.CompiledKernel(cl_ctx, kernel).get_code()
Пример #10
0
def test_memory_tools_defn():
    """Check the declaration strings returned by the memory manager's ``define``.

    For every option set, a memory manager is built from a dummy
    ``CallgenResult`` and asked to define several loopy arguments. The
    expected declarations depend on ``opts.lang`` ('opencl' or 'c'); any
    other language raises ``NotImplementedError``.
    """
    cases = __test_cases()
    for opts in cases:
        # create a dummy callgen
        callgen = CallgenResult(order=opts.order, lang=opts.lang,
                                dev_mem_type=cases.state['dev_mem_type'],
                                type_map=type_map(opts.lang))
        # create a memory manager
        manager = get_memory(
            callgen, host_namer=HostNamer(), device_namer=DeviceNamer())

        # NOTE: ``(arc.problem_size)`` is a plain value, not a 1-tuple.
        a1 = lp.GlobalArg('a1', shape=(arc.problem_size), dtype=np.int32)
        a2 = lp.GlobalArg('a2', shape=(arc.problem_size, 10), dtype=np.int64)
        d3 = lp.GlobalArg(
            'd3', shape=(arc.problem_size, 10, 10), dtype=np.float64)
        a4 = lp.ValueArg('a4', dtype=np.int64)
        a5 = lp.ValueArg('a5', dtype=np.int32)
        a6 = lp.TemporaryVariable('a6', initializer=np.array([0, 1, 2]),
                                  read_only=True)

        # (first ``define`` argument, loopy argument, expected declaration);
        # a True flag goes with the "d_" names, False with the "h_" names.
        if opts.lang == 'opencl':
            expected = [
                (True, a1, 'cl_mem d_a1;'),
                (False, a2, 'long int* h_a2;'),
                (True, d3, 'cl_mem d_d3;'),
                (False, a4, 'long int h_a4;'),
                (True, a5, 'cl_uint d_a5;'),
                # a5 is deliberately queried a second time, as in the
                # original sequence of checks
                (True, a5, 'cl_uint d_a5;'),
            ]
        elif opts.lang == 'c':
            expected = [
                (True, a1, 'int* d_a1;'),
                (False, a2, 'long int* h_a2;'),
                (True, d3, 'double* d_d3;'),
                (False, a4, 'long int h_a4;'),
                (True, a5, 'int d_a5;'),
            ]
        else:
            raise NotImplementedError

        for flag, arg, declaration in expected:
            assert manager.define(flag, arg) == declaration

        # Same checks for both languages: the host constant is rejected with
        # a True flag and defined with its initializer otherwise.
        with assert_raises(Exception):
            manager.define(True, a6, host_constant=True)
        assert manager.define(False, a6, host_constant=True) == \
            'const long int h_a6[3] = {0, 1, 2};'
Пример #11
0
def test_lpy_iname_presplit(opts):
    """
    Check that subscripts written with the pre-split ``j_outer`` iname, on
    arrays excluded from splitting, end up indexed by
    ``(j_outer * VECTOR_WIDTH + j_inner, i)``
    """
    from pymbolic.primitives import Subscript, Variable

    splitter = array_splitter(opts)

    # a1 is (20, 10); a2 is the evenly sized (16, 16) case
    arrays = [
        lp.GlobalArg('a1', shape=(20, 10), order=opts.order),
        lp.GlobalArg('a2', shape=(16, 16), order=opts.order),
    ]
    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(
            int(np.ceil(10 / VECTOR_WIDTH))),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    instructions = """
            a1[j_outer, i] = 1 {id=a1}
            a2[j_outer, i] = 1 {id=a2}
        """
    kernel = lp.make_kernel(
        domains,
        instructions,
        arrays,
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    kernel = splitter.split_loopy_arrays(kernel, dont_split=['a1', 'a2'])

    # code generation must succeed without loopy errors
    lp.generate_code_v2(kernel).device_code()

    expected_index = (
        Variable('j_outer') * VECTOR_WIDTH + Variable('j_inner'),
        Variable('i'))

    # both assignees must be subscripts carrying the combined index
    for insn_id in ('a1', 'a2'):
        assignee = next(
            insn.assignee for insn in kernel.instructions
            if insn.id == insn_id)
        assert isinstance(assignee, Subscript)
        assert assignee.index == expected_index
Пример #12
0
def test_math_function(target, tp):
    """Check which ``fmin``/``fmax`` spellings appear in the generated code.

    ``target`` is a target class (it is called to create the target) and
    ``tp`` is "f32" or "f64". The generated device code must contain
    ``fmin``/``fmax``; the ``f``-suffixed names are expected only for
    "f32" with ``CTarget`` and must be absent otherwise.
    """
    data_type = {"f32": np.float32, "f64": np.float64}[tp]

    import pymbolic.primitives as p

    names = ("x", "y", "z")
    i = p.Variable("i")
    xi, yi, zi = (p.Subscript(p.Variable(name), i) for name in names)

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [lp.GlobalArg(name, data_type, shape=(n, )) for name in names]

    # (loopy function, name that must appear, single-precision variant)
    cases = (
        ("min", "fmin", "fminf"),
        ("max", "fmax", "fmaxf"),
    )
    for func, generic_name, float_name in cases:
        inst = [lp.Assignment(xi, p.Variable(func)(yi, zi))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        code = lp.generate_code_v2(knl).device_code()

        assert generic_name in code

        if tp == "f32" and target == CTarget:
            assert float_name in code
        else:
            assert float_name not in code
Пример #13
0
 def __get_knl():
     """Build the ``cache_test`` kernel that copies ``b[i]`` into ``a[i]``.

     The kernel iterates ``0 <= i < 10`` and targets ``ExecutableCTarget``.
     """
     domain = '{[i]: 0 <= i < 10}'
     instructions = """
         a[i] = b[i]
     """
     kernel_args = [
         lp.GlobalArg('a', shape=(10, ), dtype=np.int32),
         # NOTE: ``(10)`` is the integer 10, not a 1-tuple.
         lp.ConstantArg('b', shape=(10)),
     ]
     return lp.make_kernel(
         domain,
         instructions,
         kernel_args,
         target=ExecutableCTarget(),
         name='cache_test')
Пример #14
0
def test_np_bool_handling(ctx_factory):
    """Assign ``LogicalNot(np.bool_(False))`` to ``y`` and expect ``True``."""
    import pymbolic.primitives as p
    from loopy.symbolic import parse

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    negation = lp.Assignment(parse("y"), p.LogicalNot(np.bool_(False)))
    out_arg = lp.GlobalArg("y", dtype=np.bool_, shape=lp.auto)
    kernel = lp.make_kernel("{:}", [negation], [out_arg])

    _evt, outputs = kernel(queue)
    (result, ) = outputs
    assert result.get().item() is True
Пример #15
0
 def kernel_data(self) -> List[str]:
     """Return arguments / data to kernel.

     The keys of ``self.kernel_dtypes()`` are flattened into one name per
     entry. If the instance has an ``extra_data_shape`` mapping, each name
     listed there is replaced in the result by a ``lp.GlobalArg`` with the
     parsed shape.
     """
     # Keys may bundle several names, like ['n,out', 'foo,bar']; joining and
     # re-splitting on ',' yields one name per entry.
     names = ','.join(self.kernel_dtypes().keys()).split(',')
     extra_shapes = getattr(self, 'extra_data_shape', {})
     for name, shape_spec in extra_shapes.items():
         dims = tuple(pm.parse(dim) for dim in shape_spec.split(','))
         names[names.index(name)] = lp.GlobalArg(name, shape=dims)
     return names
Пример #16
0
def test_divisibility_assumption(ctx_factory):
    """Check that assuming ``n`` is a multiple of 16 avoids any ``if``.

    After splitting ``i`` by 16, the code generated for every schedule must
    not contain "if". The split kernel is then compared to the unsplit one
    with ``n = 16**3``.
    """
    cl_ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=("n", )),
        lp.GlobalArg("b", np.float32, shape=("n", )),
        lp.ValueArg("n", np.int32),
    ]
    ref_knl = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        ["b[i] = 2*a[i]"],
        kernel_args,
        assumptions="n>=1 and (exists zz: n = 16*zz)")

    knl = lp.split_iname(ref_knl, "i", 16)
    knl = lp.preprocess_kernel(knl, cl_ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(knl):
        generated = lp.generate_code(scheduled)
        assert "if" not in generated

    lp.auto_test_vs_ref(ref_knl, cl_ctx, knl, parameters={"n": 16**3})
Пример #17
0
def test_join_inames(ctx_factory):
    """Join the two prefetch inames of ``a`` and compare with the plain kernel."""
    cl_ctx = ctx_factory()

    matrices = [
        lp.GlobalArg(name, np.float32, shape=(16, 16,))
        for name in ("a", "b")
    ]
    ref_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<16}",
        ["b[i,j] = 2*a[i,j]"],
        matrices)

    knl = lp.add_prefetch(
        ref_knl, "a", sweep_inames=["i", "j"], default_tag="l.auto")
    # "a_dim_0"/"a_dim_1" are the inames joined after the prefetch of ``a``.
    knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])

    lp.auto_test_vs_ref(ref_knl, cl_ctx, knl, print_ref_code=True)
Пример #18
0
def expression_argument(expr, parameters):
    """Register a loopy argument for ``expr`` and return a variable for it.

    A ``loopy.ValueArg`` is created when ``expr.shape`` is ``()``, otherwise
    a ``loopy.GlobalArg`` with that shape. The argument is stored in
    ``parameters.kernel_data`` at the position ``expr`` occupies in
    ``parameters.wrapper_arguments``.

    Returns ``pym.Variable(expr.name)``.
    """
    name, shape, dtype = expr.name, expr.shape, expr.dtype

    is_scalar = shape == ()
    arg = (loopy.ValueArg(name, dtype=dtype) if is_scalar
           else loopy.GlobalArg(name, dtype=dtype, shape=shape))

    position = parameters.wrapper_arguments.index(expr)
    parameters.kernel_data[position] = arg
    return pym.Variable(name)
Пример #19
0
def vanilla():
    """Return a kernel that increments ``a[i]`` for ``k <= i < n``.

    The assumption ``k >= 0 and n >= k`` is attached with ``lp.assume``.
    """
    instructions = """
                       a[i] = a[i] + 1
                       """
    kernel_args = [
        lp.ValueArg("k", dtype="int32"),
        lp.ValueArg("n", dtype="int32"),
        lp.GlobalArg("a", shape=(None, ), dtype="int32"),
    ]
    kernel = lp.make_kernel("{ [i] : k <= i < n}", instructions, kernel_args)
    return lp.assume(kernel, "k >= 0 and n >= k")
Пример #20
0
def test_nonsense_reduction(ctx_factory):
    """Expect ``RuntimeError`` when ``i`` is reduced over and also indexes ``a``."""
    import pytest

    cl_ctx = ctx_factory()

    instructions = """
                a[i] = sum(i, 2)
                """
    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        instructions,
        [lp.GlobalArg("a", np.float32, shape=(100, ))])

    with pytest.raises(RuntimeError):
        lp.preprocess_kernel(kernel, cl_ctx.devices[0])
Пример #21
0
def test_nested_dependent_reduction(ctx_factory):
    """Sum ``j`` over a range whose length depends on the outer ``i``.

    With ``ell[i] = i`` the inner domain is ``0 <= j < 2*i``, so the
    expected result is ``(2*i - 1) * 2*i / 2``.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    dtype = np.dtype(np.int32)

    domains = ["{[i]: 0<=i<n}", "{[j]: 0<=j<i+sumlen}"]
    instructions = [
        "<> sumlen = ell[i]",
        "a[i] = sum(j, j)",
    ]
    kernel_args = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", dtype, ("n", )),
        lp.GlobalArg("ell", np.int32, ("n", )),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_args)

    n = 330
    ell = np.arange(n, dtype=np.int32)
    _evt, outputs = kernel(queue, ell=ell, n=n, out_host=True)
    (a, ) = outputs

    tgt_result = (2 * ell - 1) * 2 * ell / 2
    assert (a == tgt_result).all()
Пример #22
0
 def __get_knl():
     """Build the ``cache_test`` kernel that copies ``b[i]`` into ``a[i]``.

     The kernel iterates ``0 <= i < 10`` and targets ``ExecutableCTarget``.
     """
     instructions = """
         a[i] = b[i]
     """
     destination = lp.GlobalArg("a", shape=(10, ), dtype=np.int32)
     # NOTE: ``(10)`` is the integer 10, not a 1-tuple.
     source = lp.ConstantArg("b", shape=(10))
     return lp.make_kernel(
         "{[i]: 0 <= i < 10}",
         instructions,
         [destination, source],
         target=ExecutableCTarget(),
         name="cache_test")
Пример #23
0
def test_split_reduction(ctx_factory):
    """Apply ``lp.split_reduction_outward`` for ``j,k`` to a triple-sum kernel.

    ``ctx_factory`` is not used; no code is generated or run.
    """
    instructions = """
                b = sum((i,j,k), a[i,j,k])
                """
    kernel_args = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild,a",
            None,
            shape=None),
        "...",
    ]
    kernel = lp.make_kernel("{[i,j,k]: 0<=i,j,k<n}", instructions, kernel_args)

    kernel = lp.split_reduction_outward(kernel, "j,k")
Пример #24
0
def test_modulo_indexing(ctx_factory):
    """Print a kernel that indexes ``a`` by ``(i+j)%n``, and its compiled code."""
    cl_ctx = ctx_factory()

    instructions = """
                b[i] = sum(j, a[(i+j)%n])
                """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        instructions,
        [lp.GlobalArg("a", None, shape="n"), "..."])

    print(kernel)

    compiled = lp.CompiledKernel(cl_ctx, kernel)
    print(compiled.get_highlighted_code({"a": np.float32}))
Пример #25
0
    def pick_used_centers(self):
        """Build the ``pick_used_centers`` loopy kernel.

        For each target ``i`` with ``target_to_center[i] >= 0`` the kernel
        writes ``1`` to ``center_is_used[target_to_center[i]]``. The ``i``
        loop is split by 128 with tags ``g.0`` (outer) and ``l.0`` (inner).
        """
        instructions = """
                <>target_has_center = (target_to_center[i] >= 0)
                center_is_used[target_to_center[i]] = 1 \
                    {id=center_is_used_write,if=target_has_center}
            """
        kernel_args = [
            lp.GlobalArg(
                "target_to_center", shape="ntargets", offset=lp.auto),
            lp.GlobalArg("center_is_used", shape="ncenters"),
            lp.ValueArg("ncenters", np.int32),
            lp.ValueArg("ntargets", np.int32),
        ]
        kernel = lp.make_kernel(
            "{[i]: 0<=i<ntargets}",
            instructions,
            kernel_args,
            name="pick_used_centers",
            # The write-race warning for this store is silenced; every such
            # store writes the constant 1.
            silenced_warnings="write_race(center_is_used_write)",
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        return lp.split_iname(
            kernel, "i", 128, inner_tag="l.0", outer_tag="g.0")
Пример #26
0
def test_plain_matrix_mul(ctx_factory):
    """Compare a tiled, prefetching matrix multiply with the plain kernel.

    Runs once with ``cl_array.vec.float4`` entries (results checked by
    ``check_float4``) and once with ``np.float32`` entries (default check).
    """
    cl_ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(cl_ctx)

    # (entry dtype, result checker, multiplier applied to the op count)
    variants = [
        (cl_array.vec.float4, check_float4, 4),
        (np.float32, None, 1),
    ]
    for dtype, check, vec_size in variants:
        domain = "{[i,j,k]: 0<=i,j,k<%d}" % n
        matrices = [
            lp.GlobalArg(name, dtype, shape=(n, n), order=order)
            for name in ("a", "b", "c")
        ]
        ref_knl = lp.make_kernel(
            domain,
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            matrices,
            name="matmul")

        knl = lp.split_iname(
            ref_knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)

        prefetches = (
            ("a", ["k_inner", "i_inner"]),
            ("b", ["j_inner", "k_inner"]),
        )
        for array_name, sweep_inames in prefetches:
            knl = lp.add_prefetch(
                knl, array_name, sweep_inames, default_tag="l.auto")

        lp.auto_test_vs_ref(
            ref_knl,
            cl_ctx,
            knl,
            op_count=[vec_size * 2 * n**3 / 1e9],
            op_label=["GFlops"],
            parameters={"n": n},
            check_result=check)
Пример #27
0
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a domain dependency carried only by the ``row_len`` temporary.

    Per the original author's notes, the test shows a dependency between
    domains that is mediated exclusively by ``row_len`` and makes sure
    ``row_len`` is read before any conditional uses it. Splitting ``jj``
    onto group/local axes is expected to fail scheduling with
    ``RuntimeError``.
    """
    cl_ctx = ctx_factory()
    dtype = np.dtype(np.float32)

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    knl = lp.make_kernel(domains, instructions, kernel_args)

    # the jj domain (index 1) must have the i domain (index 0) as parent
    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    compiled = lp.CompiledKernel(cl_ctx, knl)
    rule = "---------------------------------------------------"
    print(rule)
    print(compiled.get_highlighted_code())
    print(rule)

    # The bad variant is derived before ``knl`` is preprocessed.
    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    knl = lp.preprocess_kernel(knl, cl_ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))
Пример #28
0
def test_get_field_args(proc_shape):
    """Check that ``get_field_args`` returns the expected loopy arguments.

    The same set of ``lp.GlobalArg`` objects is expected whether the fields
    are passed as a dict, a single expression, or a list of expressions.
    The test is skipped unless ``proc_shape`` is ``(1, 1, 1)``.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test field only on one rank")

    from pystella import Field, DynamicField, get_field_args

    x = Field("x", offset=(1, 2, 3))
    y = Field("y", offset="h")
    z = DynamicField("z", shape=(2, "a"))

    import loopy as lp
    true_args = [
        lp.GlobalArg("x", shape="(Nx+2, Ny+4, Nz+6)", offset=lp.auto),
        lp.GlobalArg("y", shape="(Nx+2*h, Ny+2*h, Nz+2*h)", offset=lp.auto),
        lp.GlobalArg("z", shape="(2, a, Nx, Ny, Nz)", offset=lp.auto),
        lp.GlobalArg("dzdx", shape="(2, a, 3, Nx, Ny, Nz)", offset=lp.auto),
    ]

    def lists_equal(a, b):
        """Return True when every element of each list occurs in the other.

        Order and multiplicity are ignored; membership uses ``in``.
        """
        # Previously accumulated with ``equal *= ...``, which returned an int
        # and hid the intent; ``all`` states the check directly.
        return (all(item in b for item in a)
                and all(item in a for item in b))

    expressions = {x: y, y: x * z + z.pd[0]}
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    expressions = x * y + z + z.pd[2]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    expressions = [x, y, y * z**2, 3 + z.pd[0] + z.pd[1]]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)

    # NOTE(review): shift_fields is not imported in this function; it is
    # presumably provided at module level — confirm.
    expressions = [shift_fields(x, (1, 2, 3)), y + z.pd[0], y * z**2]
    args = get_field_args(expressions)
    assert lists_equal(args, true_args)
Пример #29
0
def left_W(ctx):
    """Build the loopy kernel filling ``l`` from products of ``u`` and ``v``.

    ``l[alpha,alpha1]`` is the product of the sum over ``i`` of
    ``u[i,alpha]*u[i,alpha1]`` and the sum over ``j`` of
    ``v[j,alpha]*v[j,alpha1]``. The kernel is created for
    ``ctx.devices[0]`` and returned after splitting ``alpha1``, ``alpha``,
    ``j`` and ``i``.
    """
    order = 'C'
    dtype = np.float64

    def matrix(name, shape):
        return lp.GlobalArg(name, dtype, shape=shape, order=order)

    domains = ["{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}"]
    instructions = [
        "l[alpha,alpha1]=sum((i), u[i,alpha]*u[i,alpha1])"
        "*sum((j),v[j,alpha]*v[j,alpha1])",
    ]
    kernel_args = [
        matrix("v", "n, r"),
        matrix("u", "n, r"),
        matrix("l", "r, r"),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ]
    knl = lp.make_kernel(
        ctx.devices[0],
        domains,
        instructions,
        kernel_args,
        assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    for reduction_iname in ("j", "i"):
        knl = lp.split_iname(knl, reduction_iname, 16)

    return knl
Пример #30
0
def left_V(ctx):
    """Build the loopy kernel filling ``l`` from products of ``u`` and ``w``.

    ``l[alpha,alpha1]`` is the product of the sum over ``i`` of
    ``u[alpha,i]*u[alpha1,i]`` and the sum over ``k`` of
    ``w[alpha,k]*w[alpha1,k]``. The kernel is created for
    ``ctx.devices[0]`` and returned after splitting ``alpha1``, ``alpha``,
    ``i`` and ``k``.
    """
    order = 'C'
    dtype = np.float32

    def matrix(name, shape):
        return lp.GlobalArg(name, dtype, shape=shape, order=order)

    domains = ["{[i,k,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=i,k<n}"]
    instructions = [
        "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])"
        "*sum((k),w[alpha,k]*w[alpha1,k])",
    ]
    kernel_args = [
        matrix("u", "r, n"),
        matrix("w", "r, n"),
        matrix("l", "r, r"),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ]
    knl = lp.make_kernel(
        ctx.devices[0],
        domains,
        instructions,
        kernel_args,
        assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    for reduction_iname in ("i", "k"):
        knl = lp.split_iname(knl, reduction_iname, 16)

    return knl