示例#1
0
def test_forced_iname_deps_and_reduction():
    # Regression test for https://github.com/inducer/loopy/issues/24
    #
    # Deliberately un-idiomatic on purpose: if the assignee were phi[i],
    # the iname-propagation heuristic would not assume that dependent
    # instructions must run inside 'i', and the forced_iname_* arguments
    # below would be unnecessary.

    from pymbolic.primitives import Subscript, Variable

    phi_setup = lp.CInstruction("i", "doSomethingToGetPhi();", assignees="phi")

    phi_sum = lp.Assignment(
        "a",
        lp.Reduction("sum", "j",
                     Subscript(Variable("phi"), Variable("j"))),
        forced_iname_deps=frozenset(),
        forced_iname_deps_is_final=True)

    knl = lp.make_kernel(
        "{[i,j] : 0<=i,j<n}",
        [phi_setup, phi_sum],
        [
            lp.GlobalArg("a", dtype=np.float32, shape=()),
            lp.ValueArg("n", dtype=np.int32),
            lp.TemporaryVariable("phi", dtype=np.float32, shape=("n", )),
        ],
        target=lp.CTarget(),
    )

    knl = lp.preprocess_kernel(knl)

    # The reduction-update instruction must not be nested inside 'i'.
    assert 'i' not in knl.insn_inames("insn_0_j_update")
    print(knl.stringify(with_dependencies=True))
示例#2
0
    def test_solve_callable(self, zero_vec, solve_mat, solve_vec):
        """Solve A x = b through a registered LAPACK-backed ``solve``
        callable and compare the result against numpy.linalg.solve."""
        loopy.set_caching_enabled(False)

        k = loopy.make_kernel(
            ["{[i,j] : 0 <= i,j < 2}"],
            """
            x[:] = solve(A[:,:], b[:])
            """, [
                loopy.GlobalArg('x', dtype=np.float64, shape=(2, )),
                loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2)),
                loopy.GlobalArg(
                    'b',
                    dtype=np.float64,
                    shape=(2, ),
                )
            ],
            target=loopy.CTarget(),
            name="callable_kernel2",
            lang_version=(2018, 2))

        k = loopy.register_function_id_to_in_knl_callable_mapper(
            k, solve_fn_lookup)
        code = loopy.generate_code_v2(k).device_code()
        # BUGFIX: str.replace returns a new string; the previous code
        # discarded the result, so 'static' was never actually applied.
        code = code.replace('void callable_kernel2',
                            'static void callable_kernel2')
        loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])
        args = [zero_vec(op2.READ), solve_mat(op2.READ), solve_vec(op2.WRITE)]

        op2.par_loop(loopykernel, solve_mat.dataset.set, *args)
        expected = np.linalg.solve(solve_mat.data, solve_vec.data)
        assert np.allclose(expected, zero_vec.data)
def build_loopy_kernel_A_text():
    """Generate C source/header text for the element-matrix kernel
    A[i,j] = c * sum_k B[k,i] * B[k,j], with n=3 and m=2 fixed.

    Returns a tuple (kernel name, example call, C source, C header).
    """
    knl_name = "kernel_tensor_A"

    kernel = lp.make_kernel("{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
                            """
            A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
                            name=knl_name,
                            assumptions="n >= 1 and m >= 1",
                            lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                            target=lp.CTarget())

    # All arguments are double precision.
    dtypes = {
        "A": np.dtype(np.double),
        "B": np.dtype(np.double),
        "c": np.dtype(np.double),
    }
    kernel = lp.add_and_infer_dtypes(kernel, dtypes)
    kernel = lp.fix_parameters(kernel, n=3, m=2)
    kernel = lp.prioritize_loops(kernel, "i,j")

    knl_c = lp.generate_code_v2(kernel).device_code()
    knl_h = str(lp.generate_header(kernel)[0])

    # Emit the C99 'restrict' keyword instead of the GCC spelling.
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
示例#4
0
def merge_loopy(slate_loopy, output_arg, builder, var2terminal):
    """Merge the TSFC loopy kernels and the slate loopy kernel into a
    single wrapper kernel, returned as a loopy program.

    :arg slate_loopy: the loopy kernel generated for the slate expression.
    :arg output_arg: loopy argument for the wrapper's output tensor.
    :arg builder: kernel builder providing coefficients, terminal
        initialisation, TSFC call generation and the index creator.
    :arg var2terminal: mapping from loopy variables to slate tensors.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # The initialisation generates the loopy temporaries for the terminals;
    # those are needed again when generating the TSFC calls.
    inits, tensor2temp = builder.initialise_terminals(var2terminal, builder.bag.coefficients)
    terminal_tensors = list(filter(lambda x: isinstance(x, sl.Tensor), var2terminal.values()))
    # NOTE(review): zip(*...) raises if no terminal yields a call; this
    # presumably assumes at least one sl.Tensor terminal exists — verify.
    tsfc_calls, tsfc_kernels = zip(*itertools.chain.from_iterable(
                                   (builder.generate_tsfc_calls(terminal, tensor2temp[terminal])
                                    for terminal in terminal_tensors)))

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp, tsfc_kernels)
    # Munge instructions: initialisations, then TSFC calls, then slate call.
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains, insns, args, name="slate_wrapper",
                                     seq_dependencies=True, target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    prg = make_program(slate_wrapper)
    for tsfc_loopy in tsfc_kernels:
        prg = register_callable_kernel(prg, tsfc_loopy)
    prg = register_callable_kernel(prg, slate_loopy)
    return prg
def build_loopy_kernel_b_text():
    """Generate C source/header text for the element-vector kernel
    b[i] = c, with n=3 fixed.

    Returns a tuple (kernel name, example call, C source, C header).
    """
    knl_name = "kernel_tensor_b"

    # CONSISTENCY FIX: pass the knl_name variable instead of duplicating
    # the literal (matches build_loopy_kernel_A_text).
    knl = lp.make_kernel("{ [i]: 0<=i<n }",
                         """
            b[i] = c
        """,
                         name=knl_name,
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                         target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {
        "b": np.dtype(np.double),
        "c": np.dtype(np.double)
    })
    knl = lp.fix_parameters(knl, n=3)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    # Emit the C99 'restrict' keyword instead of the GCC spelling.
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_b(b, Ae / 6.0);"

    return knl_name, knl_call, knl_c, knl_h
示例#6
0
    def __init__(self,
                 cd_proto,
                 target=None,
                 test_instructions=False,
                 print_domains=False,
                 print_instructions=False,
                 print_variables=False,
                 print_assumptions=False,
                 print_codegen=True):
        """Load a program description from *cd_proto* and generate kernels.

        :arg cd_proto: path to the protobuf file describing the program.
        :arg target: loopy code-generation target; defaults to a fresh
            ``lp.CTarget()``.
        The remaining flags are verbose-debugging switches.
        """
        # Verbose debugging options
        self.test_instructions = test_instructions
        self.print_domains = print_domains
        self.print_instructions = print_instructions
        self.print_variables = print_variables
        self.print_assumptions = print_assumptions
        self.print_codegen = print_codegen

        # Target output.  FIX: the previous default of `target=lp.CTarget()`
        # was evaluated once at definition time and shared across all
        # instances; use a None sentinel and build a fresh target instead.
        self.target = lp.CTarget() if target is None else target

        # Reading from proto
        self.input_proto = cd_proto
        self.output_dir, self.output_file = os.path.split(self.input_proto)
        # proto_name is the file name without its extension.
        self.proto_name = self.output_file.split('.')[0]
        program = load_store.load_program(self.input_proto)
        self.graph = program.graph
        self.templates = program.templates
        self.kernels = {}
        self.program = None

        self.function_gen()
示例#7
0
    def test_inverse_callable(self, zero_mat, inv_mat):
        """Invert a 2x2 matrix through a registered LAPACK-backed ``inv``
        callable and compare the result against numpy.linalg.inv."""
        loopy.set_caching_enabled(False)

        k = loopy.make_kernel(
            ["{[i,j] : 0 <= i,j < 2}"],
            """
            B[:,:] = inv(A[:,:])
            """, [
                loopy.GlobalArg('B', dtype=np.float64, shape=(2, 2)),
                loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2))
            ],
            target=loopy.CTarget(),
            name="callable_kernel",
            lang_version=(2018, 2))

        k = loopy.register_function_id_to_in_knl_callable_mapper(
            k, inv_fn_lookup)
        code = loopy.generate_code_v2(k).device_code()
        # BUGFIX: str.replace returns a new string; the previous code
        # discarded the result, so 'static' was never actually applied.
        code = code.replace('void callable_kernel',
                            'static void callable_kernel')

        loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])

        op2.par_loop(loopykernel, zero_mat.dataset.set, zero_mat(op2.WRITE),
                     inv_mat(op2.READ))
        expected = np.linalg.inv(inv_mat.data)
        assert np.allclose(expected, zero_mat.data)
示例#8
0
def merge_loopy(slate_loopy, output_arg, builder, var2terminal, name):
    """Merge the TSFC loopy kernels and the slate loopy kernel into a
    single wrapper kernel.

    :arg slate_loopy: the loopy kernel generated for the slate expression.
    :arg output_arg: loopy argument for the wrapper's output tensor.
    :arg builder: kernel builder providing coefficients, terminal
        initialisation, TSFC call generation and the index creator.
    :arg var2terminal: mapping from loopy variables to slate tensors.
    :arg name: name of the generated wrapper kernel.
    :returns: the merged loopy translation unit.
    """
    from firedrake.slate.slac.kernel_builder import SlateWrapperBag
    coeffs = builder.collect_coefficients()
    builder.bag = SlateWrapperBag(coeffs)

    # The initialisation generates the loopy tensors for the terminals;
    # those are needed again for generating the TSFC calls.
    inits, tensor2temp = builder.initialise_terminals(var2terminal,
                                                      builder.bag.coefficients)
    terminal_tensors = list(
        filter(lambda x: (x.terminal and not x.assembled),
               var2terminal.values()))
    calls_and_kernels = tuple((c, k) for terminal in terminal_tensors
                              for c, k in builder.generate_tsfc_calls(
                                  terminal, tensor2temp[terminal]))
    if calls_and_kernels:  # tsfc may not give a kernel back
        tsfc_calls, tsfc_kernels = zip(*calls_and_kernels)
    else:
        tsfc_calls = ()
        tsfc_kernels = ()

    # Construct args
    args = [output_arg] + builder.generate_wrapper_kernel_args(tensor2temp)
    # Munge instructions: initialisations, then TSFC calls, then slate call.
    insns = inits
    insns.extend(tsfc_calls)
    insns.append(builder.slate_call(slate_loopy, tensor2temp.values()))

    # Inames come from initialisations + loopyfying kernel args and lhs
    domains = builder.bag.index_creator.domains

    # Generates the loopy wrapper kernel
    slate_wrapper = lp.make_function(domains,
                                     insns,
                                     args,
                                     name=name,
                                     seq_dependencies=True,
                                     target=lp.CTarget())

    # Generate program from kernel, so that one can register kernels
    from pyop2.codegen.loopycompat import _match_caller_callee_argument_dimension_
    from loopy.kernel.function_interface import CallableKernel

    # FIX: the loop variables below previously reused (shadowed) the 'name'
    # parameter; renamed to 'callee_name' to avoid confusion.
    for tsfc_loopy in tsfc_kernels:
        slate_wrapper = merge([slate_wrapper, tsfc_loopy])
        for callee_name in tsfc_loopy.callables_table:
            if isinstance(slate_wrapper.callables_table[callee_name],
                          CallableKernel):
                slate_wrapper = _match_caller_callee_argument_dimension_(
                    slate_wrapper, callee_name)
    slate_wrapper = merge([slate_wrapper, slate_loopy])
    for callee_name in slate_loopy.callables_table:
        if isinstance(slate_wrapper.callables_table[callee_name],
                      CallableKernel):
            slate_wrapper = _match_caller_callee_argument_dimension_(
                slate_wrapper, callee_name)

    return slate_wrapper
示例#9
0
def loopy_example():
    """Build a tiny loopy kernel (out[i] = 2*a[i]) and print its C code."""
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    # Fix the input dtype so code generation can proceed.
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.dtype(np.float32)})
    print(lp.generate_code_v2(kernel).device_code())
示例#10
0
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=()):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names (iterable of pairs)
    :returns: loopy kernel
    """
    # FIX: index_names previously defaulted to a mutable list ([]), which
    # is evaluated once and shared across calls; an empty tuple is safe.
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments: the caller's args plus one temporary per Impero
    # temporary (t0, t1, ...).
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constants become read-only, pre-initialized temporaries.
            data.append(lp.TemporaryVariable(name, shape=temp.shape, dtype=temp.array.dtype, initializer=temp.array, address_space=lp.AddressSpace.LOCAL, read_only=True))
        else:
            # Shape is the extents of the temporary's free indices plus its
            # intrinsic shape.  (Renamed comprehension variable so it no
            # longer shadows the enumerate counter 'i'.)
            shape = tuple([idx.extent for idx in ctx.indices[temp]]) + temp.shape
            data.append(lp.TemporaryVariable(name, shape=shape, dtype=numpy.float64, initializer=None, address_space=lp.AddressSpace.LOCAL, read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains: 0 <= idx < extent for every index in use
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx])) & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        # Degenerate kernel with no loops at all.
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name, target=lp.CTarget(),
                           seq_dependencies=True, silenced_warnings=["summing_if_branches_ops"])

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions
    insn_new = []
    for i, insn in enumerate(knl.instructions):
        insn_new.append(insn.copy(priority=len(knl.instructions) - i))
    knl = knl.copy(instructions=insn_new)

    return knl
示例#11
0
def generate_kernel():
    '''Generates and returns source and header for a kernel using loopy'''

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.dtype(np.double)})
    # Possible transformations, kept here for reference:
    #   kernel = lp.split_iname(kernel, "i", 4)
    #   kernel = lp.tag_inames(kernel, dict(i_inner="unr"))

    source = lp.generate_code_v2(kernel).all_code()
    header = str(lp.generate_header(kernel)[0])
    return source, header
示例#12
0
def test_prefetch_through_indirect_access():
    """add_prefetch on a doubly-indirect access must raise LoopyError."""
    kernel = lp.make_kernel(
        "{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
        """
        for i, j, k
            a[map1[indirect[i], j], k] = 2
        end
        """,
        [
            lp.GlobalArg("a", strides=(2, 1), dtype=int),
            lp.GlobalArg("map1", shape=(10, 10), dtype=int),
            "...",
        ],
        target=lp.CTarget())

    kernel = lp.prioritize_loops(kernel, "i,j,k")

    # Prefetching through the 'indirect[i]' index is unsupported.
    with pytest.raises(LoopyError):
        lp.add_prefetch(kernel, "map1[:, j]")
示例#13
0
def create_loop_kernel(component_name, domains, instructions, edges, signature):
    """Build a loopy function for one component.

    :arg component_name: name of the generated function.
    :arg domains: loop domains (passed through create_domain_string).
    :arg instructions: loopy instructions for the component body.
    :arg edges: graph edges used to derive domains and globals.
    :arg signature: component signature used to derive global arguments.
    :returns: the loopy function, with instruction dependencies added.
    """
    domains, assumption_string = create_domain_string(domains, edges)
    # FIX: renamed from 'globals', which shadowed the builtin of the
    # same name.
    global_args = create_globals(signature, edges)

    knl = lp.make_function(
        domains,
        instructions,
        global_args + ["..."],
        name=component_name,
        assumptions=assumption_string,
        target=lp.CTarget()
    )
    knl = add_instruction_deps(knl)
    if component_name == 'main':
        # The 'main' component's generated code is printed for inspection.
        print(lp.generate_code_v2(knl).device_code())
    return knl
示例#14
0
def test_reduction_with_conditional():
    # A realized reduction must inherit the predicates of its original
    # instruction.  The CTarget is used because the PyOpenCL target would
    # hoist the conditional into host code in this minimal example.
    kernel = lp.make_kernel(
                "{ [i] : 0<=i<42 }",
                """
                if n > 0
                    <>b = sum(i, a[i])
                end
                """,
                [lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
                 lp.GlobalArg("n", dtype=np.float32, shape=())],
                target=lp.CTarget())
    body = lp.generate_body(kernel)

    # The guard must be emitted before the loop realizing the reduction.
    assert body.index("if") < body.index("for")
示例#15
0
def make_extruded_coords(extruded_topology,
                         base_coords,
                         ext_coords,
                         layer_height,
                         extrusion_type='uniform',
                         kernel=None):
    """
    Given either a kernel or a (fixed) layer_height, compute an
    extruded coordinate field for an extruded mesh.

    :arg extruded_topology: an :class:`~.ExtrudedMeshTopology` to extrude
         a coordinate field for.
    :arg base_coords: a :class:`~.Function` to read the base
         coordinates from.
    :arg ext_coords: a :class:`~.Function` to write the extruded
         coordinates into.
    :arg layer_height: the height for each layer.  Either a scalar,
         where layers will be equi-spaced at the specified height, or a
         1D array of variable layer heights to use through the extrusion.
    :arg extrusion_type: the type of extrusion to use.  Predefined
         options are either "uniform" (creating equi-spaced layers by
         extruding in the (n+1)dth direction), "radial" (creating
         equi-spaced layers by extruding in the outward direction from
         the origin) or "radial_hedgehog" (creating equi-spaced layers
         by extruding coordinates in the outward cell-normal
         direction, needs a P1dgxP1 coordinate field).
    :arg kernel: an optional kernel to carry out coordinate extrusion.

    The kernel signature (if provided) is::

        void kernel(double **base_coords, double **ext_coords,
                    double *layer_height, int layer)

    The kernel iterates over the cells of the mesh and receives as
    arguments the coordinates of the base cell (to read), the
    coordinates on the extruded cell (to write to), the fixed layer
    height, and the current cell layer.
    """
    # The extruded element is a tensor product; the second factor is the
    # vertical (interval) element, used to validate the default extrusion.
    _, vert_space = ext_coords.function_space().ufl_element().sub_elements(
    )[0].sub_elements()
    if kernel is None and not (vert_space.degree() == 1
                               and vert_space.family()
                               in ['Lagrange', 'Discontinuous Lagrange']):
        raise RuntimeError(
            'Extrusion of coordinates is only possible for a P1 or P1dg interval unless a custom kernel is provided'
        )

    layer_height = numpy.atleast_1d(numpy.array(layer_height, dtype=RealType))

    if layer_height.ndim > 1:
        raise RuntimeError('Extrusion layer height should be 1d or scalar')

    # Variable heights: turn per-layer heights into cumulative offsets
    # (with a leading 0) so layer l sits at layer_height[layer + l].
    if layer_height.size > 1:
        layer_height = numpy.cumsum(numpy.concatenate(([0], layer_height)))

    layer_heights = layer_height.size
    layer_height = op2.Global(layer_heights, layer_height, dtype=RealType)

    # A user-supplied kernel bypasses the code generation below entirely.
    if kernel is not None:
        op2.ParLoop(kernel,
                    ext_coords.cell_set,
                    ext_coords.dat(op2.WRITE, ext_coords.cell_node_map()),
                    base_coords.dat(op2.READ, base_coords.cell_node_map()),
                    layer_height(op2.READ),
                    pass_layer_arg=True,
                    is_loopy_kernel=True).compute()
        return
    ext_fe = create_element(ext_coords.ufl_element())
    ext_shape = ext_fe.index_shape
    base_fe = create_element(base_coords.ufl_element())
    base_shape = base_fe.index_shape
    # Arguments of the generated loopy kernel.
    data = []
    data.append(lp.GlobalArg("ext_coords", dtype=ScalarType, shape=ext_shape))
    data.append(lp.GlobalArg("base_coords", dtype=ScalarType,
                             shape=base_shape))
    data.append(
        lp.GlobalArg("layer_height", dtype=RealType, shape=(layer_heights, )))
    data.append(lp.ValueArg('layer'))
    base_coord_dim = base_coords.function_space().value_size
    # Deal with tensor product cells
    adim = len(ext_shape) - 2

    # handle single or variable layer heights
    if layer_heights == 1:
        height_var = "layer_height[0] * (layer + l)"
    else:
        height_var = "layer_height[layer + l]"

    def _get_arity_axis_inames(_base):
        # One iname per arity axis, e.g. ('d0', 'd1') for adim == 2.
        return tuple(_base + str(i) for i in range(adim))

    def _get_lp_domains(_inames, _extents):
        # Build ISL domains 0 <= iname < extent for each (iname, extent).
        domains = []
        for idx, extent in zip(_inames, _extents):
            inames = isl.make_zero_and_vars([idx])
            domains.append(((inames[0].le_set(inames[idx])) &
                            (inames[idx].lt_set(inames[0] + extent))))
        return domains

    if extrusion_type == 'uniform':
        # Copy base coordinates and append the extrusion height as the
        # extra ((n+1)th) coordinate component.
        domains = []
        dd = _get_arity_axis_inames('d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(('c', ), (base_coord_dim, )))
        if layer_heights == 1:
            domains.extend(_get_lp_domains(('l', ), (2, )))
        else:
            domains.append(
                "[layer] -> { [l] : 0 <= l <= 1 & 0 <= l + layer < %d}" %
                layer_heights)
        instructions = """
        ext_coords[{dd}, l, c] = base_coords[{dd}, c]
        ext_coords[{dd}, l, {base_coord_dim}] = ({hv})
        """.format(dd=', '.join(dd),
                   base_coord_dim=base_coord_dim,
                   hv=height_var)
        name = "pyop2_kernel_uniform_extrusion"
    elif extrusion_type == 'radial':
        # Scale each base coordinate outward from the origin by
        # height / |x|, where tt accumulates |x|^2 before the sqrt.
        domains = []
        dd = _get_arity_axis_inames('d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(('c', 'k'), (base_coord_dim, ) * 2))
        if layer_heights == 1:
            domains.extend(_get_lp_domains(('l', ), (2, )))
        else:
            domains.append(
                "[layer] -> { [l] : 0 <= l <= 1 & 0 <= l + layer < %d}" %
                layer_heights)
        instructions = """
        <{RealType}> tt[{dd}] = 0
        <{RealType}> bc[{dd}] = 0
        for k
            bc[{dd}] = real(base_coords[{dd}, k])
            tt[{dd}] = tt[{dd}] + bc[{dd}] * bc[{dd}]
        end
        tt[{dd}] = sqrt(tt[{dd}])
        ext_coords[{dd}, l, c] = base_coords[{dd}, c] + base_coords[{dd}, c] * ({hv}) / tt[{dd}]
        """.format(RealType=RealType, dd=', '.join(dd), hv=height_var)
        name = "pyop2_kernel_radial_extrusion"
    elif extrusion_type == 'radial_hedgehog':
        # Only implemented for interval in 2D and triangle in 3D.
        # gdim != tdim already checked in ExtrudedMesh constructor.
        tdim = base_coords.ufl_domain().ufl_cell().topological_dimension()
        if tdim not in [1, 2]:
            raise NotImplementedError(
                "Hedgehog extrusion not implemented for %s" %
                base_coords.ufl_domain().ufl_cell())
        # tdim == 1:
        #
        # normal is:
        # (0 -1) (x2 - x1)
        # (1  0) (y2 - y1)
        #
        # tdim == 2:
        # normal is
        # v0 x v1
        #
        #    /\
        # v0/  \
        #  /    \
        # /------\
        #    v1
        domains = []
        dd = _get_arity_axis_inames('d')
        _dd = _get_arity_axis_inames('_d')
        domains.extend(_get_lp_domains(dd, ext_shape[:adim]))
        domains.extend(_get_lp_domains(_dd, ext_shape[:adim]))
        domains.extend(
            _get_lp_domains(('c0', 'c1', 'c2', 'c3', 'k', 'l'),
                            (base_coord_dim, ) * 5 + (2, )))
        # Formula for normal, n — one variant per (tdim, adim) combination.
        n_1_1 = """
        n[0] = -bc[1, 1] + bc[0, 1]
        n[1] = bc[1, 0] - bc[0, 0]
        """
        n_2_1 = """
        v0[c3] = bc[1, c3] - bc[0, c3]
        v1[c3] = bc[2, c3] - bc[0, c3]
        n[0] = v0[1] * v1[2] - v0[2] * v1[1]
        n[1] = v0[2] * v1[0] - v0[0] * v1[2]
        n[2] = v0[0] * v1[1] - v0[1] * v1[0]
        """
        n_2_2 = """
        v0[c3] = bc[0, 1, c3] - bc[0, 0, c3]
        v1[c3] = bc[1, 0, c3] - bc[0, 0, c3]
        n[0] = v0[1] * v1[2] - v0[2] * v1[1]
        n[1] = v0[2] * v1[0] - v0[0] * v1[2]
        n[2] = v0[0] * v1[1] - v0[1] * v1[0]
        """
        n_dict = {1: {1: n_1_1}, 2: {1: n_2_1, 2: n_2_2}}
        # Flip the normal to point outward (dot < 0), then extrude along
        # the unit normal scaled by the layer height.
        instructions = """
        <{RealType}> dot = 0
        <{RealType}> norm = 0
        <{RealType}> v0[c2] = 0
        <{RealType}> v1[c2] = 0
        <{RealType}> n[c2] = 0
        <{RealType}> x[c2] = 0
        <{RealType}> bc[{_dd}, c1] = real(base_coords[{_dd}, c1])
        for {_dd}
            x[c1] = x[c1] + bc[{_dd}, c1]
        end
        {ninst}
        for k
            dot = dot + x[k] * n[k]
            norm = norm + n[k] * n[k]
        end
        norm = sqrt(norm)
        norm = -norm if dot < 0 else norm
        ext_coords[{dd}, l, c0] = base_coords[{dd}, c0] + n[c0] * ({hv}) / norm
        """.format(RealType=RealType,
                   dd=', '.join(dd),
                   _dd=', '.join(_dd),
                   ninst=n_dict[tdim][adim],
                   hv=height_var)
        name = "pyop2_kernel_radial_hedgehog_extrusion"
    else:
        raise NotImplementedError('Unsupported extrusion type "%s"' %
                                  extrusion_type)

    # Assemble the loopy kernel and execute it over the extruded cells.
    ast = lp.make_function(domains,
                           instructions,
                           data,
                           name=name,
                           target=lp.CTarget(),
                           seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"])
    kernel = op2.Kernel(ast, name)
    op2.ParLoop(kernel,
                ext_coords.cell_set,
                ext_coords.dat(op2.WRITE, ext_coords.cell_node_map()),
                base_coords.dat(op2.READ, base_coords.cell_node_map()),
                layer_height(op2.READ),
                pass_layer_arg=True,
                is_loopy_kernel=True).compute()
示例#16
0
def generate(builder, wrapper_name=None):
    """Generate the loopy wrapper kernel for a PyOP2 parallel loop.

    :arg builder: wrapper builder holding the user kernel, arguments and
        loop/layer indices.
    :arg wrapper_name: optional wrapper name; defaults to
        ``wrap_<kernel name>``.
    :returns: the loopy wrapper kernel with the user kernel registered
        (inlined when it is itself a loopy kernel).
    """
    # Extruded meshes iterate over (cell, layer); otherwise cells only.
    if builder.layer_index is not None:
        outer_inames = frozenset(
            [builder._loop_index.name, builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    # Accumulator for everything the final make_kernel call needs.
    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise nodes, collecting their initialisers on the mapper
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices (both in the instructions and in the initialisers)
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [
        tuple(merger(i) for i in inits) for inits in mapper.initialisers
    ]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node,
                      (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            # Fresh numbering per (prefix, postfix) pair.
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [
        rename_nodes(inits, replacements) for inits in mapper.initialisers
    ]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    # Layer extents may also have been renamed; update their names.
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        # Pin the cell index so the wrapper processes exactly one cell.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(
                    d.add_constraint(
                        isl.Constraint.eq_from_names(d.space, {
                            "n": 1,
                            "start": -1
                        })))
            else:
                new_domains.append(d)
        domains = new_domains
        if builder.extruded:
            # Likewise pin the layer index to the last layer.
            new_domains = []
            for d in domains:
                if d.get_dim_name(isl.dim_type.set,
                                  0) == builder.layer_index.name:
                    # layer = t1 - 1
                    t1 = parameters.layer_end
                    new_domains.append(
                        d.add_constraint(
                            isl.Constraint.eq_from_names(
                                d.space, {
                                    "layer": 1,
                                    t1: -1,
                                    1: 1
                                })))
                else:
                    new_domains.append(d)
        # NOTE(review): this rebind is inside `single_cell` but outside
        # `extruded`; when extruded is False it re-assigns the same list —
        # appears intentional, but verify the indentation is as intended.
        domains = new_domains

    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Augment the assumptions with bounds relating start/end and the
    # layer extents, then flatten back into a single basic set.
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(
        ["#include <math.h>", "#include <complex.h>", "#include <petsc.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # Loopy user kernel: register, match argument dims, then inline.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string (or a COFFEE AST), add its code to the preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper
def build_loopy_kernel_A_auto():
    """Build and emit C code for the element-matrix kernel
    ``A[i,j] = c * sum(k, B[k,i]*B[k,j])``.

    The kernel is constructed programmatically — loop domains via islpy,
    expressions via pymbolic — instead of from a loopy source string.

    :returns: tuple ``(knl_name, knl_call, knl_c, knl_h)`` of kernel name,
        an example C call string, the kernel C source and its header.
    """
    knl_name = "kernel_tensor_A"

    # Inputs to the kernel
    arg_names = ["A", "B", "c"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the loop domains, each expressing 0 <= iname < extent
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '<' restrictions
        isl_domains.append(
            (vs[0].le_set(vs[idx])) & (vs[idx].lt_set(vs[0] + vs[extent])))

    print("ISL loop domains:")
    print(isl_domains)
    print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, _ in loops}

    # Input arguments for the loopy kernel
    lp_args = {
        "A": lp.GlobalArg("A",
                          dtype=np.double,
                          shape=(params["n"], params["n"])),
        "B": lp.GlobalArg("B",
                          dtype=np.double,
                          shape=(params["m"], params["n"])),
        "c": lp.ValueArg("c", dtype=np.double)
    }

    # Arguments & parameters passed to loopy; reuse param_names so the
    # parameter list cannot drift out of sync with the loop extents above.
    data = list(lp_args.values())
    data += [lp.ValueArg(param) for param in param_names]

    def build_ass():
        """Build the instruction A[i,j] = c*sum(k, B[k,i]*B[k,j])."""
        # The target of the assignment
        target = pb.Subscript(args["A"], (inames["i"], inames["j"]))

        # The rhs expression: a sum-reduction over the rows of B
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(args["B"],
                                   (inames["k"], inames["i"])) * pb.Subscript(
                                       args["B"], (inames["k"], inames["j"]))
        expr = args["c"] * lp.Reduction(reduce_op, inames["k"], reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()
    print("Assignment expression:")
    print(ass)
    print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains,
                         instructions,
                         data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

    # Fix the symbolic extents to concrete values and pin the loop order
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    print(knl)
    print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])
    print(knl_c)
    print("")

    # Postprocess kernel code: emit C99 'restrict' instead of the GCC extension
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
示例#18
0
文件: loopy.py 项目: jmv2009/tsfc
def generate(impero_c,
             args,
             scalar_type,
             kernel_name="loopy_kernel",
             index_names=(),
             return_increments=True):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names (iterable of pairs; the
        previous mutable-list default was replaced by an empty tuple to
        avoid the shared-mutable-default pitfall — behavior is unchanged)
    :arg return_increments: Does codegen for Return nodes increment the lvalue, or assign?
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # Any index without a pre-assigned name falls back to "i".
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.epsilon = numpy.finfo(scalar_type).resolution
    ctx.scalar_type = scalar_type
    ctx.return_increments = return_increments

    # Create arguments: constants become read-only initialized temporaries;
    # other temporaries get shapes extended by the extents of their indices.
    data = list(args)
    for i, (temp, dtype) in enumerate(
            assign_dtypes(impero_c.temporaries, scalar_type)):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            data.append(
                lp.TemporaryVariable(name,
                                     shape=temp.shape,
                                     dtype=dtype,
                                     initializer=temp.array,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=True))
        else:
            # Do not shadow the enumerate counter 'i' inside the comprehension.
            shape = tuple(index.extent
                          for index in ctx.indices[temp]) + temp.shape
            data.append(
                lp.TemporaryVariable(name,
                                     shape=shape,
                                     dtype=dtype,
                                     initializer=None,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = create_domains(ctx.index_extent.items())

    # Create loopy kernel
    knl = lp.make_function(domains,
                           instructions,
                           data,
                           name=kernel_name,
                           target=lp.CTarget(),
                           seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"],
                           lang_version=(2018, 2))

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    return knl
    def __generate_loopy(self, knl_name: str, verbose: bool = False, **kwargs):
        """Generate cell kernel for the Laplace operator using Loopy.

        Builds ``A_T[i,j] = sum(k, A0[i,j,k] * G_T[k])`` with extents
        fixed to ``self.n_dof`` and ``self.n_dim**2``.

        :arg knl_name: function name of the generated kernel
        :arg verbose: print intermediate representations and generated code
        :arg kwargs: unused; accepted for call-site compatibility
        :returns: tuple ``(knl_c, knl_h)`` of kernel C source and header
        """

        n_dof, n_dim = self.n_dof, self.n_dim

        # Inputs to the kernel
        arg_names = ["A_T", "A0", "G_T"]
        # Kernel parameters that will be fixed later
        param_names = ["n", "m"]
        # Tuples of inames and extents of their loops
        loops = [("i", "n"), ("j", "n"), ("k", "m")]

        # Generate the loop domains, each expressing 0 <= iname < extent
        isl_domains = []
        for idx, extent in loops:
            # Create dict of loop variables (inames) and parameters
            vs = isl.make_zero_and_vars([idx], [extent])
            # Create the loop domain using '<=' and '<' restrictions
            isl_domains.append(((vs[0].le_set(vs[idx])) &
                                (vs[idx].lt_set(vs[0] + vs[extent]))))

        if verbose:
            print("ISL loop domains:")
            print(isl_domains)
            print("")

        # Generate pymbolic variables for all used symbols
        args = {arg: pb.Variable(arg) for arg in arg_names}
        params = {param: pb.Variable(param) for param in param_names}
        inames = {iname: pb.Variable(iname) for iname, _ in loops}

        # Input arguments for the loopy kernel.
        # NOTE: G_T's shape was previously written as (m), which is the bare
        # scalar m, not a 1-tuple; loopy normalizes scalar shapes, but make
        # the 1-d shape explicit with (m,).
        n, m = params["n"], params["m"]
        lp_args = {
            "A_T": lp.GlobalArg("A_T", dtype=np.double, shape=(n, n)),
            "A0": lp.GlobalArg("A0", dtype=np.double, shape=(n, n, m)),
            "G_T": lp.GlobalArg("G_T", dtype=np.double, shape=(m,))
        }

        # Generate the list of arguments & parameters that will be passed to loopy
        data = []
        data += [arg for arg in lp_args.values()]
        data += [lp.ValueArg(param) for param in param_names]

        # Build the kernel instruction: computation and assignment of the element matrix
        def build_ass():
            # A_T[i,j] = sum(k, A0[i,j,k] * G_T[k]);

            # Get variable symbols for all required variables
            i, j, k = inames["i"], inames["j"], inames["k"]
            A_T, A0, G_T = args["A_T"], args["A0"], args["G_T"]

            # The target of the assignment
            target = pb.Subscript(A_T, (i, j))

            # The rhs expression: Frobenius inner product <A0[i,j],G_T>
            reduce_op = lp.library.reduction.SumReductionOperation()
            reduce_expr = pb.Subscript(A0, (i, j, k)) * pb.Subscript(G_T, (k))
            expr = lp.Reduction(reduce_op, k, reduce_expr)

            return lp.Assignment(target, expr)

        ass = build_ass()

        if verbose:
            print("Assignment expression:")
            print(ass)
            print("")

        instructions = [ass]

        # Construct the kernel
        knl = lp.make_kernel(isl_domains,
                             instructions,
                             data,
                             name=knl_name,
                             target=lp.CTarget(),
                             lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

        # Fix the symbolic extents to concrete values and pin the loop order
        knl = lp.fix_parameters(knl, n=n_dof, m=n_dim**2)
        knl = lp.prioritize_loops(knl, "i,j")

        if verbose:
            print("")
            print(knl)
            print("")

        # Generate kernel code
        knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
            lp.generate_header(knl)[0])

        if verbose:
            print(knl_c)
            print("")

        # Postprocess kernel code: emit C99 'restrict' instead of the GCC extension
        knl_c = knl_c.replace("__restrict__", "restrict")
        knl_h = knl_h.replace("__restrict__", "restrict")

        return knl_c, knl_h
示例#20
0
def _form_loopy_kernel(kernel_domains, instructions, measure, args, **kwargs):
    """Build (and cache) a pyop2 Kernel wrapping a loopy kernel for a par_loop.

    :arg kernel_domains: ISL-style domain string; occurrences of
        "<var>.dofs" are textually replaced by the concrete dof count.
    :arg instructions: loopy instructions for the kernel body.
    :arg measure: integration measure; only its ``integral_type()`` is read
        (doubling ndof for 'interior_facet').
    :arg args: mapping of variable name -> (function, access intent).
    :arg kwargs: forwarded to ``pyop2.Kernel`` and folded into the cache key.
    """

    kargs = []

    # Translate each par_loop argument into a loopy GlobalArg and substitute
    # its symbolic ".dofs" extent in the domain string.
    for var, (func, intent) in args.items():
        if isinstance(func, constant.Constant):
            if intent is not READ:
                raise RuntimeError("Only READ access is allowed to Constant")
            # Constants modelled as Globals, so no need for double
            # indirection
            ndof = func.dat.cdim
            kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
        else:
            # Do we have a component of a mixed function?
            if isinstance(func, Indexed):
                c, i = func.ufl_operands
                idx = i._indices[0]._value
                ndof = c.function_space()[idx].finat_element.space_dimension()
                cdim = c.dat[idx].cdim
                dtype = c.dat[idx].dtype
            else:
                if func.function_space().ufl_element().family() == "Real":
                    ndof = func.function_space().dim()  # == 1
                    kargs.append(loopy.GlobalArg(var, dtype=func.dat.dtype, shape=(ndof,)))
                    continue
                else:
                    if len(func.function_space()) > 1:
                        raise NotImplementedError("Must index mixed function in par_loop.")
                    ndof = func.function_space().finat_element.space_dimension()
                    cdim = func.dat.cdim
                    dtype = func.dat.dtype
            if measure.integral_type() == 'interior_facet':
                # Interior facets see dofs from both adjacent cells.
                ndof *= 2
            # FIXME: shape for facets [2][ndof]?
            kargs.append(loopy.GlobalArg(var, dtype=dtype, shape=(ndof, cdim)))
        kernel_domains = kernel_domains.replace(var+".dofs", str(ndof))

    # An empty domain string still needs a syntactically valid ISL set.
    if kernel_domains == "":
        kernel_domains = "[] -> {[]}"
    # EAFP cache lookup: a miss in kernel_cache raises KeyError, and a
    # disabled cache raises KeyError deliberately, so both fall through to
    # the build path below.
    # NOTE(review): the key uses tuple(instructions) — assumes instructions
    # is an iterable of hashable items; verify against callers.
    try:
        key = (kernel_domains, tuple(instructions), tuple(map(tuple, kwargs.items())))
        if kernel_cache is not None:
            return kernel_cache[key]
        else:
            raise KeyError("No cache")
    except KeyError:
        # NOTE(review): this appends the Ellipsis object as a kernel
        # argument — it looks like a placeholder for elided code (perhaps a
        # catch-all TemporaryVariable/ValueArg); confirm what belongs here.
        kargs.append(...)
        knl = loopy.make_function(kernel_domains, instructions, kargs, seq_dependencies=True,
                                  name="par_loop_kernel", silenced_warnings=["summing_if_branches_ops"], target=loopy.CTarget())
        knl = pyop2.Kernel(knl, "par_loop_kernel", **kwargs)
        if kernel_cache is not None:
            return kernel_cache.setdefault(key, knl)
        else:
            return knl