Exemplo n.º 1
0
def test_callees_with_gbarriers_are_inlined_with_nested_calls(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    ones_and_zeros = lp.make_function("{[i, j]: 0<=i<6 and 0<=j<3}",
                                      """
            x[i] = 0.0f
            ...gbarrier
            x[j] = 1.0f
            """,
                                      seq_dependencies=True,
                                      name="ones_and_zeros")

    dummy_ones_and_zeros = lp.make_function("{[i]: 0<=i<6}",
                                            """
            [i]: y[i] = ones_and_zeros()
            """,
                                            name="dummy_ones_and_zeros")

    t_unit = lp.make_kernel(
        "{ : }", """
            y[:] = dummy_ones_and_zeros()
            """, [lp.GlobalArg("y", shape=6, dtype=lp.auto)])

    t_unit = lp.merge([t_unit, dummy_ones_and_zeros, ones_and_zeros])
    evt, (out, ) = t_unit(queue)

    expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)

    assert (expected_out == out.get()).all()
Exemplo n.º 2
0
def test_stride_depending_on_args(ctx_factory):
    ctx = ctx_factory()

    twice = lp.make_function("{[i, j]: 0<=i, j < n}",
                             """
            b[i, j] = 2*a[i, j]
            """, [lp.ValueArg("n"),
                  lp.GlobalArg("a"),
                  lp.GlobalArg("b")],
                             name="twice")

    thrice = lp.make_function("{[i, j]: 0<=i, j < n}",
                              """
            b[i, j] = 3*a[i, j]
            """, [
                                  lp.ValueArg("n"),
                                  lp.GlobalArg("a", shape=lp.auto),
                                  lp.GlobalArg("b", shape=lp.auto)
                              ],
                              name="thrice")

    prog = lp.make_kernel(
        "{[i0,i1,i2,i3,i4,i5,i6,i7]: 0<=i0, i1, i2, i3, i4, i5, i6, i7< N}",
        """
            [i0, i1]: y[i0, i1] = twice(N, [i2, i3]: x[2*i2, i3])
            [i4, i5]: z[i4, i5] = thrice(N, [i6, i7]: x[2*i6+1, i7])
            """, [
            lp.ValueArg("N", dtype=np.int32),
            lp.GlobalArg("x", shape=lp.auto, dtype=np.float64), ...
        ])

    prog = lp.merge([prog, twice])
    prog = lp.merge([prog, thrice])

    lp.auto_test_vs_ref(prog, ctx, prog, parameters={"N": 4})
Exemplo n.º 3
0
def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)

    callee1 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 2*abs(a[i])
            """,
                               name="callee_fn1")

    callee2 = lp.make_function("{[i, j]: 0<=i<3 and 0 <= j < 2}",
                               """
            b[i, j] = 3*a[i, j]
            """,
                               name="callee_fn2")

    callee3 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 5*a[i]
            """,
                               name="callee_fn3")

    knl = lp.make_kernel(
        "{[i, j, k, l]:  0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
        """
            [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
            [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
            [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])
    knl = lp.merge([knl, callee3])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")
        knl = lp.inline_callable_kernel(knl, "callee_fn3")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()
    y3 = out_dict["y3"].get()

    assert (np.linalg.norm(y1 - 2 * x1.get())) < 1e-15
    assert (np.linalg.norm(y2 - 3 * x2.get())) < 1e-15
    assert (np.linalg.norm(np.diag(y3 - 5 * x3.get()))) < 1e-15
Exemplo n.º 4
0
def expression_kernel(expr, args):
    r"""Produce a :class:`pyop2.Kernel` from the processed UFL expression
    expr and the corresponding args."""

    # Empty slot indicating assignment to indexed LHS, so don't do anything
    if type(expr) is Zero:
        return

    fs = args[0].function.function_space()

    import islpy as isl
    inames = isl.make_zero_and_vars(["d"])
    domain = (inames[0].le_set(
        inames["d"])) & (inames["d"].lt_set(inames[0] + fs.dof_dset.cdim))

    context = Bag()
    context.within_inames = frozenset(["d"])
    context.indices = (p.Variable("d"), )

    insn = loopy_instructions(expr, context)
    data = [arg.arg for arg in args]
    knl = loopy.make_function([domain], [insn],
                              data,
                              name="expression",
                              silenced_warnings=["summing_if_branches_ops"])

    return op2.Kernel(knl, "expression")
Exemplo n.º 5
0
def test_double_hw_axes_used_in_knl_call(inline):
    from loopy.diagnostic import LoopyError

    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            y[i] = 2*x[i]
            """,
                             name="twice")

    knl = lp.make_kernel("{[i]: 0<=i<10}",
                         """
            y[:, i] = twice(x[:, i])
            """, [
                             lp.GlobalArg("x", shape=(10, 10), dtype=float),
                             lp.GlobalArg("y", shape=(10, 10))
                         ],
                         name="outer")

    twice = lp.tag_inames(twice, {"i": "l.0"})
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.merge([knl, twice])

    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(knl)
Exemplo n.º 6
0
def test_check_bounds_with_caller_assumptions(ctx_factory):
    import islpy as isl
    from loopy.diagnostic import LoopyIndexError

    arange = lp.make_function("{[i]: 0<=i<n}",
                              """
        y[i] = i
        """,
                              name="arange")

    knl = lp.make_kernel(
        "{[i]: 0<=i<20}",
        """
        [i]: Y[i] = arange(N)
        """,
        [lp.GlobalArg("Y", shape=(20, )),
         lp.ValueArg("N", dtype=np.int32)],
        name="epoint")

    knl = lp.merge([knl, arange])

    with pytest.raises(LoopyIndexError):
        lp.generate_code_v2(knl)

    knl = knl.with_kernel(
        lp.assume(knl.default_entrypoint, isl.BasicSet("[N] -> { : N <= 20}")))

    lp.auto_test_vs_ref(knl, ctx_factory(), parameters={"N": 15})
Exemplo n.º 7
0
def test_unused_hw_axes_in_callee(ctx_factory, inline):
    ctx = ctx_factory()

    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            y[i] = 2*x[i]
            """,
                             name="twice")

    knl = lp.make_kernel("{[i]: 0<=i<10}",
                         """
            y[:, i] = twice(x[:, i])
            """, [
                             lp.GlobalArg("x", shape=(10, 10), dtype=float),
                             lp.GlobalArg("y", shape=(10, 10))
                         ],
                         name="outer")

    twice = lp.tag_inames(twice, {"i": "l.1"})
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.merge([knl, twice])

    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    lp.auto_test_vs_ref(knl, ctx, knl)
Exemplo n.º 8
0
def test_slices_with_negative_step(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 4

    x = np.random.rand(n, n, n, n, n)
    y = np.random.rand(n, n, n, n, n)

    child_knl = lp.make_function("{[i, j]:0<=i, j < 4}",
                                 """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """,
                                 name="linear_combo")

    parent_knl = lp.make_kernel("{[i, k, m]: 0<=i, k, m<4}",
                                """
            z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m],
                                                   y[i, :, k, :, m])
            """,
                                kernel_data=[
                                    lp.GlobalArg(name="x, y, z",
                                                 dtype=np.float64,
                                                 shape=(n, n, n, n, n)), ...
                                ])

    knl = lp.merge([parent_knl, child_knl])
    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, (out, ) = knl(queue, x=x, y=y)

    assert (np.linalg.norm(2 * x + 3 * y - out[:, ::-1, :, :, :]) /
            (np.linalg.norm(2 * x + 3 * y))) < 1e-15
Exemplo n.º 9
0
def test_array_inputs_to_callee_kernels(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 2**3

    x = np.random.rand(n, n)
    y = np.random.rand(n, n)

    child_knl = lp.make_function("{[i, j]:0<=i, j < 8}",
                                 """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """,
                                 name="linear_combo")

    parent_knl = lp.make_kernel("{:}",
                                """
            z[:, :] = linear_combo(x, y)
            """,
                                kernel_data=[
                                    lp.GlobalArg(name="x, y, z",
                                                 dtype=np.float64,
                                                 shape=(n, n)), ...
                                ])

    knl = lp.merge([parent_knl, child_knl])
    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, (out, ) = knl(queue, x=x, y=y)

    assert (np.linalg.norm(2 * x + 3 * y - out) /
            (np.linalg.norm(2 * x + 3 * y))) < 1e-15
Exemplo n.º 10
0
def merge_loopy(slate_loopy, output_arg, builder, var2terminal):
    """ Merges tsfc loopy kernels and slate loopy kernel into a wrapper kernel."""
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # In the initialisation the loopy tensors for the terminals are generated
    # Those are the needed again for generating the TSFC calls
    inits, tensor2temp = builder.initialise_terminals(var2terminal, builder.bag.coefficients)
    terminal_tensors = list(filter(lambda x: isinstance(x, sl.Tensor), var2terminal.values()))
    tsfc_calls, tsfc_kernels = zip(*itertools.chain.from_iterable(
                                   (builder.generate_tsfc_calls(terminal, tensor2temp[terminal])
                                    for terminal in terminal_tensors)))

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp, tsfc_kernels)
    # Munge instructions
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains, insns, args, name="slate_wrapper",
                                     seq_dependencies=True, target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    prg = make_program(slate_wrapper)
    for tsfc_loopy in tsfc_kernels:
        prg = register_callable_kernel(prg, tsfc_loopy)
    prg = register_callable_kernel(prg, slate_loopy)
    return prg
Exemplo n.º 11
0
def test_inlining_with_indirections(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    ones_and_zeros = lp.make_function("{[i, j]: 0<=i<6 and 0<=j<3}",
                                      """
            x[i] = 0.0f
            ...gbarrier
            x[map[j]] = 1.0f
            """,
                                      seq_dependencies=True,
                                      name="ones_and_zeros")

    t_unit = lp.make_kernel(
        "{ : }", """
            y[:] = ones_and_zeros(mymap[:])
            """, [
            lp.GlobalArg("y", shape=6, dtype=lp.auto),
            lp.GlobalArg("mymap", dtype=np.int32, shape=3)
        ])

    t_unit = lp.merge([t_unit, ones_and_zeros])
    t_unit = lp.inline_callable_kernel(t_unit, "ones_and_zeros")

    map_in = np.arange(3).astype(np.int32)

    evt, (out, ) = t_unit(queue, mymap=map_in)

    expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
    assert (expected_out == out).all()
Exemplo n.º 12
0
def test_empty_sub_array_refs(ctx_factory, inline):
    # See: https://github.com/OP2/PyOP2/pull/559#discussion_r272208618
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x = np.random.randn(10)
    y = np.random.randn(10)

    callee = lp.make_function("{[d]:0<=d<1}",
                              """
            c[d] = a[d] - b[d]
            """,
                              name="wence_function")

    caller = lp.make_kernel(
        "{[i,k]: 0<=i<10 and 0<=k<1}", """
            [k]:z[i+k] = wence_function([k]:x[i+k], [k]:y[i+k])
            """, [lp.GlobalArg("x, y", dtype=np.float64, shape=(10, )), ...])

    caller = lp.merge([caller, callee])

    if inline:
        caller = lp.inline_callable_kernel(caller, "wence_function")

    evt, (out, ) = caller(queue, x=x, y=y)
    assert np.allclose(out, x - y)
Exemplo n.º 13
0
def merge_loopy(slate_loopy, output_arg, builder, var2terminal, name):
    """ Merges tsfc loopy kernels and slate loopy kernel into a wrapper kernel."""
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # In the initialisation the loopy tensors for the terminals are generated
    # Those are the needed again for generating the TSFC calls
    inits, tensor2temp = builder.initialise_terminals(var2terminal,
                                                      builder.bag.coefficients)
    terminal_tensors = list(
        filter(lambda x: (x.terminal and not x.assembled),
               var2terminal.values()))
    calls_and_kernels = tuple((c, k) for terminal in terminal_tensors
                              for c, k in builder.generate_tsfc_calls(
                                  terminal, tensor2temp[terminal]))
    if calls_and_kernels:  # tsfc may not give a kernel back
        tsfc_calls, tsfc_kernels = zip(*calls_and_kernels)
    else:
        tsfc_calls = ()
        tsfc_kernels = ()

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp)
    # Munge instructions
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains,
                                     insns,
                                     args,
                                     name=name,
                                     seq_dependencies=True,
                                     target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    from pyop2.codegen.loopycompat import _match_caller_callee_argument_dimension_
    from loopy.kernel.function_interface import CallableKernel

    for tsfc_loopy in tsfc_kernels:
        slate_wrapper = merge([slate_wrapper, tsfc_loopy])
        names = tsfc_loopy.callables_table
        for name in names:
            if isinstance(slate_wrapper.callables_table[name], CallableKernel):
                slate_wrapper = _match_caller_callee_argument_dimension_(
                    slate_wrapper, name)
    slate_wrapper = merge([slate_wrapper, slate_loopy])
    names = slate_loopy.callables_table
    for name in names:
        if isinstance(slate_wrapper.callables_table[name], CallableKernel):
            slate_wrapper = _match_caller_callee_argument_dimension_(
                slate_wrapper, name)

    return slate_wrapper
Exemplo n.º 14
0
def test_packing_unpacking(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)

    callee1 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 2*a[i]
            """,
                               name="callee_fn1")

    callee2 = lp.make_function("{[i, j]: 0<=i<2 and 0 <= j < 3}",
                               """
            b[i, j] = 3*a[i, j]
            """,
                               name="callee_fn2")

    knl = lp.make_kernel(
        "{[i, j, k]:  0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}", """
            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
            [k]: y2[k] = callee_fn2([k]: x2[k])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])

    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn1")
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn2")

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()

    assert np.linalg.norm(2 * x1.get() - y1) / np.linalg.norm(
        2 * x1.get()) < 1e-15
    assert np.linalg.norm(3 * x2.get() - y2) / np.linalg.norm(
        3 * x2.get()) < 1e-15
Exemplo n.º 15
0
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=[]):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            data.append(lp.TemporaryVariable(name, shape=temp.shape, dtype=temp.array.dtype, initializer=temp.array, address_space=lp.AddressSpace.LOCAL, read_only=True))
        else:
            shape = tuple([i.extent for i in ctx.indices[temp]]) + temp.shape
            data.append(lp.TemporaryVariable(name, shape=shape, dtype=numpy.float64, initializer=None, address_space=lp.AddressSpace.LOCAL, read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx])) & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name, target=lp.CTarget(),
                           seq_dependencies=True, silenced_warnings=["summing_if_branches_ops"])

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions
    insn_new = []
    for i, insn in enumerate(knl.instructions):
        insn_new.append(insn.copy(priority=len(knl.instructions) - i))
    knl = knl.copy(instructions=insn_new)

    return knl
Exemplo n.º 16
0
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):

    kargs = []

    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            if measure.integral_type() == 'interior_facet':
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"
    try:
        key = (kernel_domains, tuple(instructions), tuple(map(tuple, kwargs.items())))
        if kernel_cache is not None:
            return kernel_cache[key]
        else:
            raise KeyError("No cache")
    except KeyError:
        kargs.append(...)
        knl = loopy.make_function(kernel_domains, instructions, kargs, seq_dependencies=True,
                                  name="par_loop_kernel", silenced_warnings=["summing_if_branches_ops"], target=loopy.CTarget())
        knl = pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
        if kernel_cache is not None:
            return kernel_cache.setdefault(key, knl)
        else:
            return knl
Exemplo n.º 17
0
def test_register_knl(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    n = 4

    x = np.random.rand(n, n, n, n, n)
    y = np.random.rand(n, n, n, n, n)

    grandchild_knl = lp.make_function("{[i, j]:0<= i, j< 4}",
                                      """
            c[i, j] = 2*a[i, j] + 3*b[i, j]
            """,
                                      name="linear_combo1")

    child_knl = lp.make_function("{[i, j]:0<=i, j < 4}",
                                 """
            [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j])
            """,
                                 name="linear_combo2")

    parent_knl = lp.make_kernel("{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
                                """
            [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m],
                                                     [j, l]: y[i, j, k, l, m])
            """,
                                kernel_data=[
                                    lp.GlobalArg(name="x, y",
                                                 dtype=np.float64,
                                                 shape=(n, n, n, n, n)), ...
                                ])

    knl = lp.merge([grandchild_knl, child_knl, parent_knl])

    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo2")
        knl = lp.inline_callable_kernel(knl, "linear_combo1")

    evt, (out, ) = knl(queue, x=x, y=y)

    assert (np.linalg.norm(2 * x + 3 * y - out) /
            (np.linalg.norm(2 * x + 3 * y))) < 1e-15
Exemplo n.º 18
0
def test_register_knl_with_hw_axes(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 4

    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)

    callee_knl = lp.make_function("{[i, j]:0<=i, j < 4}",
                                  """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """,
                                  name="linear_combo")

    callee_knl = lp.split_iname(callee_knl,
                                "i",
                                1,
                                inner_tag="l.0",
                                outer_tag="g.0")

    caller_knl = lp.make_kernel("{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
                                """
            [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m],
                                                     [j, l]: y[i, j, k, l, m])
            """,
                                name="caller")
    caller_knl = lp.split_iname(caller_knl,
                                "i",
                                4,
                                inner_tag="l.1",
                                outer_tag="g.1")

    knl = lp.merge([caller_knl, callee_knl])

    knl = lp.set_options(knl, "return_dict")

    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, out = knl(queue, x=x_dev, y=y_dev)

    x_host = x_dev.get()
    y_host = y_dev.get()

    assert np.linalg.norm(2 * x_host + 3 * y_host - out["z"].get()
                          ) / np.linalg.norm(2 * x_host + 3 * y_host) < 1e-15
Exemplo n.º 19
0
def create_loop_kernel(component_name, domains, instructions, edges, signature):

    domains, assumption_string = create_domain_string(domains, edges)
    globals = create_globals(signature, edges)


    knl = lp.make_function(
        domains,
        instructions,
        globals + ["..."],
        name=component_name,
        assumptions=assumption_string,
        target=lp.CTarget()
    )
    knl = add_instruction_deps(knl)
    if component_name == 'main':
        print(lp.generate_code_v2(knl).device_code())
    return knl
Exemplo n.º 20
0
def test_multi_arg_array_call(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    import pymbolic.primitives as p
    n = 10
    acc_i = p.Variable("acc_i")
    i = p.Variable("i")
    index = p.Variable("index")
    a_i = p.Subscript(p.Variable("a"), p.Variable("i"))
    argmin_kernel = lp.make_function("{[i]: 0 <= i < n}", [
        lp.Assignment(id="init2", assignee=index, expression=0),
        lp.Assignment(id="init1", assignee=acc_i, expression="214748367"),
        lp.Assignment(id="insn",
                      assignee=index,
                      expression=p.If(p.Expression.eq(acc_i, a_i), i, index),
                      depends_on="update"),
        lp.Assignment(id="update",
                      assignee=acc_i,
                      expression=p.Variable("min")(acc_i, a_i),
                      depends_on="init1,init2")
    ], [
        lp.GlobalArg("a"),
        lp.GlobalArg(
            "acc_i, index", is_input=False, is_output=True, shape=lp.auto), ...
    ],
                                     name="custom_argmin")

    argmin_kernel = lp.fix_parameters(argmin_kernel, n=n)

    knl = lp.make_kernel(
        "{[i]:0<=i<n}", """
            []: min_val[()], []: min_index[()] = custom_argmin([i]:b[i])
            """)

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.set_options(knl, return_dict=True)

    knl = lp.merge([knl, argmin_kernel])
    b = np.random.randn(n)
    evt, out_dict = knl(queue, b=b)
    tol = 1e-15
    from numpy.linalg import norm
    assert (norm(out_dict["min_val"] - np.min(b)) < tol)
    assert (norm(out_dict["min_index"] - np.argmin(b)) < tol)
Exemplo n.º 21
0
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):

    kargs = []

    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            if measure.integral_type() == 'interior_facet':
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"
    kargs.append(...)
    knl = loopy.make_function(kernel_domains, instructions, kargs, seq_dependencies=True,
                              name="par_loop_kernel", silenced_warnings=["summing_if_branches_ops"])

    return pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
Exemplo n.º 22
0
def test_argument_matching_for_inplace_update(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            x[i] = 2*x[i]
            """,
                             name="twice")

    knl = lp.make_kernel("{:}", """
            x[:] = twice(x[:])
            """, [lp.GlobalArg("x", shape=(10, ), dtype=np.float64)])

    knl = lp.merge([knl, twice])

    x = np.random.randn(10)
    evt, (out, ) = knl(queue, x=np.copy(x))

    assert np.allclose(2 * x, out)
Exemplo n.º 23
0
def test_inlining_with_callee_domain_param(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    fill2 = lp.make_function("{[i]: 0<=i<n}",
                             """
            y[i] = 2.0
            """,
                             name="fill2")

    caller = lp.make_kernel(
        "{[i]: 0<=i<10}", """
            [i]: res[i] = fill2(10)
            """)

    caller = lp.merge([caller, fill2])
    caller = lp.inline_callable_kernel(caller, "fill2")
    evt, (out, ) = caller(queue)

    assert (out == 2).all()
Exemplo n.º 24
0
def test_non_zero_start_in_subarray_ref(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            b[i] = 2*a[i]
            """,
                             name="twice")

    knl = lp.make_kernel(
        "{[i, j]: -5<=i<5 and 0<=j<10}", """
            [i]:y[i+5] = twice([j]: x[j])
            """, [lp.GlobalArg("x, y", shape=(10, ), dtype=np.float64)])

    knl = lp.merge([knl, twice])

    x = np.random.randn(10)
    evt, (out, ) = knl(queue, x=np.copy(x))

    assert np.allclose(2 * x, out)
Exemplo n.º 25
0
def test_valueargs_being_mapped_in_inling(ctx_factory):
    doublify = lp.make_function(
        "{[i]: 0<=i<n}",
        """
            y[i] = n*x[i]
            """,
        [lp.ValueArg("n", dtype=np.int32), ...],
        name="doublify",
    )

    knl = lp.make_kernel(
        "{[i, j]: 0<=i, j<10}",
        """
            [i]: bar[i] = doublify(10, [j]: foo[j])
            """,
        [lp.GlobalArg("foo", dtype=float, shape=lp.auto), ...],
    )
    knl = lp.merge([knl, doublify])
    knl = lp.inline_callable_kernel(knl, "doublify")

    lp.auto_test_vs_ref(knl, ctx_factory(), knl)
Exemplo n.º 26
0
def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline):
    ctx = cl.create_some_context()
    cq = cl.CommandQueue(ctx)

    call_sin = lp.make_function("{:}",
                                """
        y = sin(x)
        """,
                                name="call_sin")

    knl = lp.make_kernel(
        "{:}", """
        []: real_y[()] = call_sin(real_x)
        """)

    knl = lp.merge([knl, call_sin])
    knl = lp.set_options(knl, "write_cl")
    if inline:
        knl = lp.inline_callable_kernel(knl, "call_sin")

    evt, (out, ) = knl(cq, real_x=np.asarray(3.0, dtype=float))
Exemplo n.º 27
0
def test_simplify_indices(ctx_factory):
    ctx = ctx_factory()
    twice = lp.make_function("{[i, j]: 0<=i<10 and 0<=j<4}",
                             """
        y[i,j] = 2*x[i,j]
        """,
                             name="zerozerozeroonezeroify")

    knl = lp.make_kernel(
        "{:}", """
        Y[:,:] = zerozerozeroonezeroify(X[:,:])
        """, [lp.GlobalArg("X,Y", shape=(10, 4), dtype=np.float64)])

    class ContainsFloorDiv(lp.symbolic.CombineMapper):
        def combine(self, values):
            return any(values)

        def map_floor_div(self, expr):
            return True

        def map_variable(self, expr):
            return False

        def map_constant(self, expr):
            return False

    knl = lp.merge([knl, twice])
    knl = lp.inline_callable_kernel(knl, "zerozerozeroonezeroify")
    simplified_knl = lp.simplify_indices(knl)
    contains_floordiv = ContainsFloorDiv()

    assert any(
        contains_floordiv(insn.expression)
        for insn in knl.default_entrypoint.instructions
        if isinstance(insn, lp.MultiAssignmentBase))
    assert all(not contains_floordiv(insn.expression)
               for insn in simplified_knl.default_entrypoint.instructions
               if isinstance(insn, lp.MultiAssignmentBase))

    lp.auto_test_vs_ref(knl, ctx, simplified_knl)
Exemplo n.º 28
0
def test_callee_with_auto_offset(ctx_factory):

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arange = lp.make_function("{[i]: 0<=i<7}",
                              """
        y[i] = 2*y[i]
        """, [lp.GlobalArg("y", offset=lp.auto)],
                              name="dosify")

    knl = lp.make_kernel(
        "{[i]: 0<=i<7}", """
        [i]: y[i] = dosify([i]: y[i])
        """, [lp.GlobalArg("y", offset=3, shape=10)])

    knl = lp.merge([knl, arange])

    y = np.arange(10)
    knl(queue, y=y)
    np.testing.assert_allclose(y[:3], np.arange(3))
    np.testing.assert_allclose(y[3:], 2 * np.arange(3, 10))
Exemplo n.º 29
0
def test_non1_step_slices(ctx_factory, start, inline):
    # See https://github.com/inducer/loopy/pull/222#discussion_r645905188

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    callee = lp.make_function("{[i]: 0<=i<n}",
                              """
            y[i] = i**2
            """, [lp.ValueArg("n"), ...],
                              name="squared_arange")

    t_unit = lp.make_kernel("{[i_init, j_init]: 0<=i_init, j_init<40}",
                            f"""
            X[i_init] = 42
            X[{start}:40:3] = squared_arange({len(range(start, 40, 3))})

            Y[j_init] = 1729
            Y[39:{start}:-3] = squared_arange({len(range(39, start, -3))})
            """, [lp.GlobalArg("X,Y", shape=40)],
                            seq_dependencies=True)

    expected_out1 = 42 * np.ones(40, dtype=np.int64)
    expected_out1[start:40:3] = np.arange(len(range(start, 40, 3)))**2

    expected_out2 = 1729 * np.ones(40, dtype=np.int64)
    expected_out2[39:start:-3] = np.arange(len(range(39, start, -3)))**2

    t_unit = lp.merge([t_unit, callee])

    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")

    evt, out_dict = t_unit(cq)

    np.testing.assert_allclose(out_dict["X"].get(), expected_out1)
    np.testing.assert_allclose(out_dict["Y"].get(), expected_out2)
Exemplo n.º 30
0
def test_kc_with_floor_div_in_expr(ctx_factory, inline):
    # See https://github.com/inducer/loopy/issues/366
    import loopy as lp

    ctx = ctx_factory()
    callee = lp.make_function("{[i]: 0<=i<10}",
                              """
            x[i] = 2*x[i]
            """,
                              name="callee_with_update")

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}", """
            [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i])
            """)

    knl = lp.merge([knl, callee])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_with_update")

    lp.auto_test_vs_ref(knl, ctx, knl)
Exemplo n.º 31
0
def test_callee_with_parameter_and_grid(ctx_factory):
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    callee = lp.make_function("{[i]: 0<=i<n}",
                              """
        y[i] = i
        """,
                              name="arange")

    knl = lp.make_kernel("{[i]: 0<=i<10}", """
        [i]: y[i] = arange(10)
        """)

    knl = lp.merge([callee, knl])
    knl = lp.split_iname(knl,
                         "i",
                         2,
                         outer_tag="g.0",
                         within="in_kernel:arange")

    evt, (out, ) = knl(cq)
    np.testing.assert_allclose(out.get(), np.arange(10))
def expression_kernel(expr, args):
    r"""Produce a :class:`pyop2.Kernel` from the processed UFL expression
    expr and the corresponding args."""

    # Empty slot indicating assignment to indexed LHS, so don't do anything
    if type(expr) is Zero:
        return

    fs = args[0].function.function_space()

    import islpy as isl
    inames = isl.make_zero_and_vars(["d"])
    domain = (inames[0].le_set(inames["d"])) & (inames["d"].lt_set(inames[0] + fs.dof_dset.cdim))

    context = Bag()
    context.within_inames = frozenset(["d"])
    context.indices = (p.Variable("d"),)

    insn = loopy_instructions(expr, context)
    data = [arg.arg for arg in args]
    knl = loopy.make_function([domain], [insn], data, name="expression", silenced_warnings=["summing_if_branches_ops"])

    return op2.Kernel(knl, "expression")