Пример #1
0
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=()):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names, an iterable of
        (index, name) pairs (default changed from a mutable ``[]`` to
        an immutable ``()`` -- same behavior, no shared-state hazard)
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # Any index without a pre-assigned name falls back to "i".
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constant data becomes a read-only, initialized temporary.
            data.append(lp.TemporaryVariable(name, shape=temp.shape,
                                             dtype=temp.array.dtype,
                                             initializer=temp.array,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=True))
        else:
            # Prepend extents of the temporary's free indices to its shape.
            # (Renamed comprehension variable: previously shadowed loop 'i'.)
            shape = tuple(index.extent for index in ctx.indices[temp]) + temp.shape
            data.append(lp.TemporaryVariable(name, shape=shape,
                                             dtype=numpy.float64,
                                             initializer=None,
                                             address_space=lp.AddressSpace.LOCAL,
                                             read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains: one 1-D ISL set "0 <= idx < extent" per loop index
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx])) & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        # Zero-dimensional kernel still needs a (trivial) domain.
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name, target=lp.CTarget(),
                           seq_dependencies=True, silenced_warnings=["summing_if_branches_ops"])

    # Prevent loopy interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions
    insn_new = []
    for i, insn in enumerate(knl.instructions):
        insn_new.append(insn.copy(priority=len(knl.instructions) - i))
    knl = knl.copy(instructions=insn_new)

    return knl
Пример #2
0
def test_c_execution_with_global_temporaries():
    # ensure that the "host" code of a bare ExecutableCTarget with
    # global constant temporaries is None

    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes

    n = 10
    table = lp.TemporaryVariable('b',
                                 shape=(n, ),
                                 initializer=np.arange(n, dtype=np.int32),
                                 dtype=np.int32,
                                 read_only=True,
                                 scope=scopes.GLOBAL)

    knl = lp.make_kernel(
        '{[i]: 0 <= i < n}',
        """
            a[i] = b[i]
        """,
        [lp.GlobalArg('a', shape=(n, ), dtype=np.int32), table],
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    # the initialized constant table must not be declared in host code
    assert ('int b[%d]' % n) not in lp.generate_code_v2(knl).host_code()
    # executing the kernel copies the table into the output
    assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))
Пример #3
0
def test_forced_iname_deps_and_reduction():
    # See https://github.com/inducer/loopy/issues/24

    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.

    from pymbolic.primitives import Subscript, Variable

    # C instruction writing the bare temporary 'phi' inside the 'i' loop
    phi_insn = lp.CInstruction("i", "doSomethingToGetPhi();", assignees="phi")

    # reduction over 'j' that must NOT be pulled into the 'i' loop
    update_insn = lp.Assignment(
        "a",
        lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
        forced_iname_deps=frozenset(),
        forced_iname_deps_is_final=True)

    knl = lp.make_kernel(
        "{[i,j] : 0<=i,j<n}",
        [phi_insn, update_insn],
        [
            lp.GlobalArg("a", dtype=np.float32, shape=()),
            lp.ValueArg("n", dtype=np.int32),
            lp.TemporaryVariable("phi", dtype=np.float32, shape=("n", )),
        ],
        target=lp.CTarget(),
    )

    knl = lp.preprocess_kernel(knl)

    assert 'i' not in knl.insn_inames("insn_0_j_update")
    print(knl.stringify(with_dependencies=True))
Пример #4
0
    def make_kernels(self, seq_dependencies):
        """Build one loopy kernel per translated subprogram and return
        them as a list."""
        kernels = []

        for sub in self.kernels:
            # {{{ figure out arguments

            kernel_data = []
            for arg_name in sub.arg_names:
                if sub.dim_map.get(arg_name) is not None:
                    # array argument; default order is set to "F" in kernel
                    # creation below
                    kernel_data.append(lp.GlobalArg(
                        arg_name,
                        dtype=sub.get_type(arg_name),
                        shape=sub.get_loopy_shape(arg_name),
                        ))
                else:
                    # scalar argument
                    kernel_data.append(lp.ValueArg(
                        arg_name, dtype=sub.get_type(arg_name)))

            # }}}

            # {{{ figure out temporary variables

            temp_names = (sub.known_names()
                          - set(sub.arg_names)
                          - sub.all_inames())
            for var_name in temp_names:
                dtype = sub.get_type(var_name, none_ok=True)
                if sub.implicit_types is None and dtype is None:
                    # no way to determine a type for this name: skip it
                    continue

                kernel_data.append(lp.TemporaryVariable(
                    var_name, dtype=dtype,
                    shape=sub.get_loopy_shape(var_name)))

            # }}}

            knl = lp.make_kernel(
                    sub.index_sets,
                    sub.instructions,
                    kernel_data,
                    name=sub.subprogram_name,
                    default_order="F",
                    index_dtype=self.index_dtype,
                    target=self.target,
                    seq_dependencies=seq_dependencies,
                    )

            from loopy.loop import fuse_loop_domains
            knl = fuse_loop_domains(knl)
            knl = lp.fold_constants(knl)

            kernels.append(knl)

        return kernels
Пример #5
0
def test_c_execution_with_global_temporaries():
    # ensure that the "host" code of a bare ExecutableCTarget with
    # global constant temporaries is None

    from loopy.target.c import ExecutableCTarget
    AS = lp.AddressSpace  # noqa

    n = 10
    table = lp.TemporaryVariable("b",
                                 shape=(n, ),
                                 initializer=np.arange(n, dtype=np.int32),
                                 dtype=np.int32,
                                 read_only=True,
                                 address_space=AS.GLOBAL)

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n}",
        """
            a[i] = b[i]
        """,
        [lp.GlobalArg("a", shape=(n, ), dtype=np.int32), table],
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    # the initialized constant table must not be declared in host code
    assert ("int b[%d]" % n) not in lp.generate_code_v2(knl).host_code()
    # executing the kernel copies the table into the output
    assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))
Пример #6
0
def test_barrier_counter_barriers():
    # count the local barriers inserted for the dependent accesses below
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
            c[i,j,k] = 2*a[i,j,k] {id=first}
            e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
            """
            ], [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")

    sync_map = lp.get_synchronization_map(knl)
    print(sync_map)

    params = {'n': 512, 'm': 256, 'ell': 128}
    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
    # two barriers for each of the 50*10 (i, j) iterations
    assert barrier_count == 50*10*2
Пример #7
0
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    cnst_temp = lp.TemporaryVariable("cnst",
                                     dtype=np.float32,
                                     shape=(),
                                     scope=lp.temp_var_scope.PRIVATE)
    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''', ''' cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])''', [cnst_temp, '...'])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))

    # reference kernel with the constant folded in directly
    ref_knl = lp.make_kernel(''' { [i,j]: 0<=i,j<n } ''',
                             '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl, ctx, bknl,
                        parameters=dict(a=a, x=x, n=5, nbatches=7))
Пример #8
0
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    cnst_temp = lp.TemporaryVariable("cnst",
                                     dtype=np.float32,
                                     shape=(),
                                     address_space=lp.AddressSpace.PRIVATE)
    knl = lp.make_kernel(
        """ { [i,j]: 0<=i,j<n } """, """ cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])""", [cnst_temp, "..."])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))

    # reference kernel with the constant folded in directly
    ref_knl = lp.make_kernel(""" { [i,j]: 0<=i,j<n } """,
                             """out[i] = sum(j, 2.0*a[i,j]*x[j])""")
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl, ctx, bknl,
                        parameters=dict(a=a, x=x, n=5, nbatches=7))
Пример #9
0
    def generate_wrapper_kernel_args(self, tensor2temp, templated_subkernels):
        """Assemble the argument list for the Slate wrapper kernel.

        :arg tensor2temp: mapping from Slate tensors to loopy temporaries;
            the temporaries are appended verbatim at the end.
        :arg templated_subkernels: loopy kernels whose cell-orientation /
            cell-size arguments must be forwarded to the wrapper.
        :returns: list of loopy kernel data (GlobalArgs / TemporaryVariables).
        """
        # Coordinates always come first.
        coords_extent = self.extent(self.expression.ufl_domain().coordinates)
        args = [loopy.GlobalArg(self.coordinates_arg, shape=coords_extent,
                                dtype=self.tsfc_parameters["scalar_type"])]

        # Forward cell-orientation / cell-size arguments from the subkernels,
        # skipping duplicates. (args[1:] skips the subkernel's first argument
        # -- presumably its output; TODO confirm.)
        for loopy_inner in templated_subkernels:
            for arg in loopy_inner.args[1:]:
                if arg.name == self.cell_orientations_arg or\
                   arg.name == self.cell_size_arg:
                    if arg not in args:
                        args.append(arg)

        # One GlobalArg per coefficient; a mixed coefficient is an
        # OrderedDict of (name, extent) pairs, one per sub-space.
        for coeff in self.bag.coefficients.values():
            if isinstance(coeff, OrderedDict):
                for (name, extent) in coeff.values():
                    arg = loopy.GlobalArg(name, shape=extent,
                                          dtype=self.tsfc_parameters["scalar_type"])
                    args.append(arg)
            else:
                (name, extent) = coeff
                arg = loopy.GlobalArg(name, shape=extent,
                                      dtype=self.tsfc_parameters["scalar_type"])
                args.append(arg)

        if self.bag.needs_cell_facets:
            # Arg for is exterior (==0)/interior (==1) facet or not
            args.append(loopy.GlobalArg(self.cell_facets_arg, shape=(self.num_facets, 2),
                                        dtype=np.int8))

            # Read-only table [0, 1, ..., num_facets-1] of local facet numbers.
            args.append(
                loopy.TemporaryVariable(self.local_facet_array_arg,
                                        shape=(self.num_facets,),
                                        dtype=np.uint32,
                                        address_space=loopy.AddressSpace.LOCAL,
                                        read_only=True,
                                        initializer=np.arange(self.num_facets, dtype=np.uint32),))

        if self.bag.needs_mesh_layers:
            # Layer count input plus a scalar temporary for the current layer.
            args.append(loopy.GlobalArg(self.layer_count, shape=(),
                        dtype=np.int32))
            args.append(loopy.TemporaryVariable(self.layer_arg, shape=(),
                        dtype=np.int32, address_space=loopy.AddressSpace.GLOBAL))

        # Finally, the temporaries backing the Slate tensors.
        for tensor_temp in tensor2temp.values():
            args.append(tensor_temp)

        return args
Пример #10
0
    def initialise_terminals(self, var2terminal, coefficients):
        """ Initilisation of the variables in which coefficients
            and the Tensors coming from TSFC are saved.

            :arg var2terminal: dictionary that maps Slate Tensors to gem Variables
            :arg coefficients: mapping from forms to coefficient data, used
                to copy assembled-vector values into the temporaries
            :returns: (list of loopy init Assignments, OrderedDict mapping
                Slate tensors to their loopy temporaries)
        """

        tensor2temp = OrderedDict()
        inits = []
        for gem_tensor, slate_tensor in var2terminal.items():
            assert slate_tensor.terminal, "Only terminal tensors need to be initialised in Slate kernels."
            # Pick the dtype matching the requested scalar type.
            (_, dtype), = assign_dtypes([gem_tensor],
                                        self.tsfc_parameters["scalar_type"])
            loopy_tensor = loopy.TemporaryVariable(
                gem_tensor.name,
                dtype=dtype,
                shape=gem_tensor.shape,
                address_space=loopy.AddressSpace.LOCAL)
            tensor2temp[slate_tensor] = loopy_tensor

            if not slate_tensor.assembled:
                # Plain terminal tensor: zero-initialise the whole temporary.
                indices = self.bag.index_creator(self.shape(slate_tensor))
                inames = {var.name for var in indices}
                var = pym.Subscript(pym.Variable(loopy_tensor.name), indices)
                inits.append(
                    loopy.Assignment(var,
                                     "0.",
                                     id="init%d" % len(inits),
                                     within_inames=frozenset(inames)))

            else:
                # Assembled vector: copy coefficient data into the temporary.
                f = slate_tensor.form if isinstance(
                    slate_tensor.form, tuple) else (slate_tensor.form, )
                coeff = tuple(coefficients[c] for c in f)
                offset = 0
                ismixed = tuple(
                    (type(c.ufl_element()) == MixedElement) for c in f)
                names = []
                for (im, c) in zip(ismixed, coeff):
                    names += [name
                              for (name, ext) in c.values()] if im else [c[0]]

                # Mixed coefficients come as seperate parameter (one per space)
                for i, shp in enumerate(*slate_tensor.shapes.values()):
                    indices = self.bag.index_creator((shp, ))
                    inames = {var.name for var in indices}
                    # Destination index shifted by the running offset of the
                    # previously-copied sub-spaces.
                    offset_index = (pym.Sum((offset, indices[0])), )
                    name = names[i] if ismixed else names
                    var = pym.Subscript(pym.Variable(loopy_tensor.name),
                                        offset_index)
                    c = pym.Subscript(pym.Variable(name), indices)
                    inits.append(
                        loopy.Assignment(var,
                                         c,
                                         id="init%d" % len(inits),
                                         within_inames=frozenset(inames)))
                    offset += shp

        return inits, tensor2temp
Пример #11
0
def expression_variable(expr, parameters):
    """Translate a gem Variable into a pymbolic Variable, registering a
    loopy temporary for it on first encounter."""
    name = expr.name
    if name not in parameters.temporaries:
        parameters.temporaries[name] = loopy.TemporaryVariable(
            name, dtype=expr.dtype, shape=expr.shape,
            address_space=loopy.auto)
    return pym.Variable(name)
Пример #12
0
def get_loopy_temporary(name: str, expr: Array) -> lp.TemporaryVariable:
    """Make a loopy temporary for *expr*, forced global if its shape is
    symbolic."""
    if all(isinstance(dim, int) for dim in expr.shape):
        # concrete shape: let loopy choose the address space
        address_space = lp.auto
    else:
        # Only global variables can have symbolic shape.
        address_space = lp.AddressSpace.GLOBAL
    return lp.TemporaryVariable(name,
                                dtype=expr.dtype,
                                shape=expr.shape,
                                address_space=address_space)
Пример #13
0
    def get_kernel(self, **kwargs):
        """Build the loopy kernel that evaluates the expression once per
        target point.

        :arg extra_kernel_kwarg_types: (in ``kwargs``) extra loopy kernel
            data appended to the argument list.
        :returns: the loopy kernel.
        """

        extra_kernel_kwarg_types = ()
        if "extra_kernel_kwarg_types" in kwargs:
            extra_kernel_kwarg_types = kwargs["extra_kernel_kwarg_types"]

        # The scalar evaluation must run inside the 'itgt' loop.
        eval_inames = frozenset(["itgt"])
        scalar_assignment = lp.Assignment(
            id=None,
            assignee="expr_val",
            expression=self.get_normalised_expr(),
            temp_var_type=None,
        )
        eval_insns = [
            insn.copy(within_inames=insn.within_inames | eval_inames)
            for insn in [scalar_assignment]
        ]

        # Instruction list: variable assignments (textual template),
        # then the evaluation instruction, then writing the result.
        loopy_knl = lp.make_kernel(  # NOQA
            "{ [itgt]: 0<=itgt<n_targets }",
            [
                """
                for itgt
                    VAR_ASSIGNMENT
                end
                """.replace("VAR_ASSIGNMENT",
                            self.get_variable_assignment_code())
            ] + eval_insns + [
                """
                for itgt
                    result[itgt] = expr_val
                end
                """
            ],
            [
                lp.ValueArg("dim, n_targets", np.int32),
                lp.GlobalArg("target_points", np.float64, "dim, n_targets"),
                lp.TemporaryVariable("expr_val", None, ()),
            ] + list(extra_kernel_kwarg_types) + [
                "...",
            ],
            name="eval_expr",
            lang_version=(2018, 2),
        )

        loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)
        loopy_knl = lp.set_options(loopy_knl, write_cl=False)
        loopy_knl = lp.set_options(loopy_knl, return_dict=True)

        # Optionally register user-supplied manglers / preamble generators.
        if self.function_manglers is not None:
            loopy_knl = lp.register_function_manglers(loopy_knl,
                                                      self.function_manglers)

        if self.preamble_generators is not None:
            loopy_knl = lp.register_preamble_generators(
                loopy_knl, self.preamble_generators)

        return loopy_knl
Пример #14
0
def test_pyopencl_target_with_global_temps_with_base_storage(ctx_factory):
    # Global temporaries sharing one base_storage must be allocated once:
    # total allocation should be base (shared) + y + z, not one buffer per tmp.
    from pyopencl.tools import ImmediateAllocator

    class RecordingAllocator(ImmediateAllocator):
        # Allocator that tallies the number of bytes it hands out.
        def __init__(self, queue):
            super().__init__(queue)
            self.allocated_nbytes = 0

        def __call__(self, size):
            self.allocated_nbytes += size
            return super().__call__(size)

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i, j]: 0<=i, j<10}",
        """
        tmp1[i] = 2*i    {id=w_tmp1}
        y[i] = tmp1[i] {nosync=w_tmp1}
        ... gbarrier
        tmp2[j] = 3*j    {id=w_tmp2}
        z[j] = tmp2[j] {nosync=w_tmp2}
        """, [
            lp.TemporaryVariable("tmp1",
                                 base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL),
            lp.TemporaryVariable("tmp2",
                                 base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL), ...
        ],
        seq_dependencies=True)
    knl = lp.tag_inames(knl, {"i": "g.0", "j": "g.0"})
    knl = lp.set_options(knl, "return_dict")

    my_allocator = RecordingAllocator(cq)
    _, out = knl(cq, allocator=my_allocator)

    np.testing.assert_allclose(out["y"].get(), 2 * np.arange(10))
    np.testing.assert_allclose(out["z"].get(), 3 * np.arange(10))
    # 40 bytes per 10-element buffer; tmp1/tmp2 share "base" so it is
    # counted only once.
    assert my_allocator.allocated_nbytes == (
        40  # base
        + 40  # y
        + 40  # z
    )
Пример #15
0
def expression_namedliteral(expr, parameters):
    """Translate a gem NamedLiteral into a pymbolic Variable, registering
    a read-only initialized temporary that holds its value."""
    name = expr.name
    parameters.temporaries[name] = loopy.TemporaryVariable(
        name,
        dtype=expr.dtype,
        shape=expr.shape,
        address_space=loopy.AddressSpace.LOCAL,
        read_only=True,
        initializer=expr.value)
    return pym.Variable(name)
Пример #16
0
def get_loopy_temporary(name: str, expr: Array, cgen_mapper: CodeGenMapper,
                        state: CodeGenState) -> lp.TemporaryVariable:
    """Make a global-address-space loopy temporary for *expr*."""
    # always allocating to global address space to avoid stack overflow
    shape = shape_to_scalar_expression(expr.shape, cgen_mapper, state)
    tags = _filter_tags_not_of_type(expr,
                                    cgen_mapper.array_tag_t_to_not_propagate)
    return lp.TemporaryVariable(name,
                                shape=shape,
                                dtype=expr.dtype,
                                address_space=lp.AddressSpace.GLOBAL,
                                tags=tags)
Пример #17
0
    def __call__(self, preamble_info):
        """Preamble generator: yield one ``(description, code)`` pair
        containing a global constant lookup table plus a C helper that
        scans it for the last index matching a value.

        Yields nothing when our function is unused or its argument dtypes
        do not match.

        Bug fix: the original fell through to the code-generation section
        even when no match was found (or dtypes differed), in which case
        ``var`` and ``code`` were never bound and a NameError was raised.
        """
        from loopy.kernel.data import temp_var_scope as scopes

        # find a function matching our name
        func_match = next((x for x in preamble_info.seen_functions
                           if x.name == self.func_name), None)
        desc = 'custom_funcs_indirect'
        if func_match is None:
            # our function is not used in this kernel: nothing to emit
            return

        from loopy.types import to_loopy_type
        # check types; bail out (emit nothing) on mismatch
        if tuple(to_loopy_type(x) for x in self.func_arg_dtypes) != \
                func_match.arg_dtypes:
            return

        # if match, create our temporary
        var = lp.TemporaryVariable('lookup',
                                   initializer=self.arr,
                                   dtype=self.arr.dtype,
                                   shape=self.arr.shape,
                                   scope=scopes.GLOBAL,
                                   read_only=True)
        # and code
        code = """
        int {name}(int start, int end, int match)
        {{
            int result = start;
            for (int i = start + 1; i < end; ++i)
            {{
                if (lookup[i] == match)
                    result = i;
            }}
            return result;
        }}
        """.format(name=self.func_name)

        # generate temporary variable code
        from cgen import Initializer
        from loopy.target.c import generate_array_literal
        codegen_state = preamble_info.codegen_state.copy(
            is_generating_device_code=True)
        kernel = preamble_info.kernel
        ast_builder = codegen_state.ast_builder
        target = kernel.target
        decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype)
        decl = ast_builder.wrap_global_constant(
            ast_builder.get_temporary_decl(codegen_state, None, var,
                                           decl_info))
        if var.initializer is not None:
            decl = Initializer(
                decl,
                generate_array_literal(codegen_state, var, var.initializer))
        # return generated code
        yield (desc, '\n'.join([str(decl), code]))
Пример #18
0
    def get_vars(self, cname):
        """Collect the loopy kernel data (constant args, value args, global
        args and temporaries) for the component template *cname*."""
        comp = self.templates[cname]
        kernel_vars = []

        # Parameters are constant arguments
        for p in list(comp.parameters):
            p_edge = comp.edge_info[p]
            p_dtype = hu.get_attribute_value(p_edge.attributes['type'])
            kernel_vars.append(
                lp.ConstantArg(p, p_dtype, shape=self.get_edge_dims(p_edge)))

        # Need to add read only option for inputs
        # Inputs, outputs, and states represent
        global_args = list(comp.input) + list(comp.output) + list(comp.state)
        seen_dims = []
        for g in global_args:
            g_edge = comp.edge_info[g]
            g_dtype = hu.get_attribute_value(g_edge.attributes['type'])
            g_dims = self.get_edge_dims(g_edge)
            # each dimension name becomes an integer ValueArg, once
            for d in g_dims:
                if d not in seen_dims:
                    seen_dims.append(d)
                    kernel_vars.append(lp.ValueArg(d, dtype=np.int32))
            if len(g_dims) == 0:
                # dimensionless entries are plain scalars
                kernel_vars.append(lp.ValueArg(g, dtype=g_dtype))
            else:
                kernel_vars.append(lp.GlobalArg(g, dtype=g_dtype, shape=g_dims))

        # Each flow declaration is a temporary variable declared in the comp scope
        for s in comp.statements:
            if s.op_type == 'declaration' and s.op_cat == 'declaration':
                for d in list(s.output):
                    edge_decl = comp.edge_info[d]
                    d_dtype = hu.get_attribute_value(
                        edge_decl.attributes['type'])
                    kernel_vars.append(lp.TemporaryVariable(
                        d, dtype=d_dtype, shape=self.get_edge_dims(edge_decl)))

        return kernel_vars
Пример #19
0
def test_get_kernel_input_and_output():
    # a kernel with no global inputs or outputs
    knl = lp.make_kernel('{[i]: 0 <= i < 2}',
                         '<> a = 1')
    assert not len(find_inputs_and_outputs(knl))

    # a single global input argument
    knl = lp.make_kernel('{[i]: 0 <= i < 2}',
                         '<> a = b[i]',
                         [lp.GlobalArg('b', shape=(2,))])
    assert find_inputs_and_outputs(knl) == {'b'}

    # a global argument and a global temporary both count
    knl = lp.make_kernel('{[i]: 0 <= i < 2}',
                         '<> a = b[i] + c[i]',
                         [lp.GlobalArg('b', shape=(2,)),
                          lp.TemporaryVariable('c', shape=(2,), scope=scopes.GLOBAL)],
                         silenced_warnings=['read_no_write(c)'])
    assert find_inputs_and_outputs(knl) == {'b', 'c'}
Пример #20
0
    def initialise_terminals(self, var2terminal, coefficients):
        """ Initilisation of the variables in which coefficients
            and the Tensors coming from TSFC are saved.

            :arg var2terminal: dictionary that maps Slate Tensors to gem Variables
            :arg coefficients: mapping from functions to coefficient data,
                used to copy assembled-vector values into the temporaries
            :returns: (list of loopy init Assignments, OrderedDict mapping
                Slate tensors to their loopy temporaries)
        """

        tensor2temp = OrderedDict()
        inits = []
        for gem_tensor, slate_tensor in var2terminal.items():
            loopy_tensor = loopy.TemporaryVariable(gem_tensor.name,
                                                   shape=gem_tensor.shape,
                                                   address_space=loopy.AddressSpace.LOCAL)
            tensor2temp[slate_tensor] = loopy_tensor

            if isinstance(slate_tensor, slate.Tensor):
                # Plain terminal tensor: zero-initialise the whole temporary.
                indices = self.bag.index_creator(self.shape(slate_tensor))
                inames = {var.name for var in indices}
                var = pym.Subscript(pym.Variable(loopy_tensor.name), indices)
                inits.append(loopy.Assignment(var, "0.", id="init%d" % len(inits),
                                              within_inames=frozenset(inames)))

            elif isinstance(slate_tensor, slate.AssembledVector):
                # Assembled vector: copy coefficient data into the temporary.
                f = slate_tensor._function
                coeff = coefficients[f]
                offset = 0
                ismixed = (type(f.ufl_element()) == MixedElement)
                names = [name for (name, ext) in coeff.values()] if ismixed else coeff[0]

                # Mixed coefficients come as seperate parameter (one per space)
                for i, shp in enumerate(*slate_tensor.shapes.values()):
                    indices = self.bag.index_creator((shp,))
                    inames = {var.name for var in indices}
                    # Destination index shifted by the running offset of the
                    # previously-copied sub-spaces.
                    offset_index = (pym.Sum((offset, indices[0])),)
                    name = names[i] if ismixed else names
                    var = pym.Subscript(pym.Variable(loopy_tensor.name), offset_index)
                    c = pym.Subscript(pym.Variable(name), indices)
                    inits.append(loopy.Assignment(var, c, id="init%d" % len(inits),
                                                  within_inames=frozenset(inames)))
                    offset += shp

        return inits, tensor2temp
Пример #21
0
def test_memory_tools_defn():
    # Check that the memory manager emits the expected host/device variable
    # definitions for each test-case language (opencl and c).
    wrapper = __test_cases()
    for opts in wrapper:
        # create a dummy callgen
        callgen = CallgenResult(order=opts.order, lang=opts.lang,
                                dev_mem_type=wrapper.state['dev_mem_type'],
                                type_map=type_map(opts.lang))
        # create a memory manager
        mem = get_memory(callgen, host_namer=HostNamer(), device_namer=DeviceNamer())

        # a spread of array / scalar / host-constant variables to define
        a1 = lp.GlobalArg('a1', shape=(arc.problem_size), dtype=np.int32)
        a2 = lp.GlobalArg('a2', shape=(arc.problem_size, 10), dtype=np.int64)
        d3 = lp.GlobalArg('d3', shape=(arc.problem_size, 10, 10), dtype=np.float64)
        a4 = lp.ValueArg('a4', dtype=np.int64)
        a5 = lp.ValueArg('a5', dtype=np.int32)
        a6 = lp.TemporaryVariable('a6', initializer=np.array([0, 1, 2]),
                                  read_only=True)

        if opts.lang == 'opencl':
            # device buffers are opaque cl_mem handles in OpenCL
            assert mem.define(True, a1) == 'cl_mem d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'cl_mem d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'cl_uint d_a5;'
            assert mem.define(True, a5) == 'cl_uint d_a5;'
            # host constants may not be defined on the device
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert mem.define(False, a6, host_constant=True) == \
                'const long int h_a6[3] = {0, 1, 2};'

        elif opts.lang == 'c':
            # plain pointers for device buffers in C
            assert mem.define(True, a1) == 'int* d_a1;'
            assert mem.define(False, a2) == 'long int* h_a2;'
            assert mem.define(True, d3) == 'double* d_d3;'
            assert mem.define(False, a4) == 'long int h_a4;'
            assert mem.define(True, a5) == 'int d_a5;'
            # host constants may not be defined on the device
            with assert_raises(Exception):
                mem.define(True, a6, host_constant=True)
            assert mem.define(False, a6, host_constant=True) == \
                'const long int h_a6[3] = {0, 1, 2};'
        else:
            raise NotImplementedError
Пример #22
0
def test_c_instruction(ctx_factory):
    """Smoke-test a kernel containing a raw C instruction.

    Builds a kernel in which ``x`` is produced by a literal C statement
    (a :class:`lp.CInstruction`) and consumed by an ordinary assignment,
    then splits an iname and compiles/prints the generated code.
    """
    ctx = ctx_factory()

    instructions = [
        lp.CInstruction(
            "i,j",
            """
                    x = sin((float) i*j);
                    """,
            assignees="x"),
        "a[i,j] = x",
    ]
    kernel_data = [
        lp.GlobalArg("a", shape=lp.auto, dtype=np.float32),
        lp.TemporaryVariable("x", np.float32),
        "...",
    ]

    knl = lp.make_kernel("{[i,j]: 0<=i,j<n }", instructions, kernel_data,
                         assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    print(knl)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
# Example #23 (score: 0)
    def get_diff_var(self, var_name):
        """
        :return: a string containing the name of a new variable
            holding the derivative of *var_name* by the desired
            *diff_context.by_name*, or *None* if no dependency exists.
        """
        # Name the derivative variable, e.g. "u_dx" for d(u)/d(x).
        new_var_name = self.rule_mapping_context.make_unique_var_name(
            var_name + "_d" + self.by_name)

        writers = self.kernel.writer_map().get(var_name, [])

        if not writers:
            # FIXME: There should be hooks to supply earlier dvar_dby
            # This would be the spot to think about them.
            return None

        # Differentiation rewrites "the" defining instruction, so the
        # variable must have exactly one writer.
        if len(writers) > 1:
            raise LoopyError("%s is written in more than one place" % var_name)

        orig_writer_id, = writers
        orig_writer_insn = self.kernel.id_to_insn[orig_writer_id]

        # Extra inames indexing the new differentiation axes.
        diff_inames = self.add_diff_inames()
        diff_iname_exprs = tuple(var(diname) for diname in diff_inames)

        # {{{ write code

        diff_mapper = LoopyDiffMapper(self.rule_mapping_context, self,
                                      diff_inames)

        diff_expr = diff_mapper(orig_writer_insn.expression, self.kernel,
                                orig_writer_insn)

        # NOTE(review): any falsy derivative (including the constant 0)
        # is treated as "no dependency" here.
        if not diff_expr:
            return None

        # Only plain assignments are supported; determine the original
        # LHS index tuple so the derivative keeps the same leading axes.
        assert isinstance(orig_writer_insn, lp.Assignment)
        if isinstance(orig_writer_insn.assignee, p.Subscript):
            lhs_ind = orig_writer_insn.assignee.index_tuple
        elif isinstance(orig_writer_insn.assignee, p.Variable):
            lhs_ind = ()
        else:
            raise LoopyError("Unrecognized LHS type in differentiation: %s" %
                             type(orig_writer_insn.assignee).__name__)

        # New assignment: original LHS indices plus differentiation inames,
        # executed within the original inames plus the new ones.
        new_insn_id = self.generate_instruction_id()
        insn = lp.Assignment(id=new_insn_id,
                             assignee=var(new_var_name)[lhs_ind +
                                                        diff_iname_exprs],
                             expression=diff_expr,
                             within_inames=(orig_writer_insn.within_inames
                                            | frozenset(diff_inames)))

        self.new_instructions.append(insn)

        # }}}

        # {{{ manage variable declaration

        # Find the shape of the differentiated variable, whether it is a
        # kernel argument or a temporary.
        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            orig_shape = arg.shape

        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            orig_shape = tv.shape

        else:
            raise ValueError("%s: variable not found" % var_name)

        # Derivative variable: original shape extended by the new axes.
        shape = orig_shape + self.additional_shape
        dim_tags = ("c", ) * len(shape)

        # Declare the derivative in the same category (arg vs. temporary)
        # as the original variable.
        if var_name in self.kernel.arg_dict:
            self.new_args.append(
                lp.GlobalArg(
                    new_var_name,
                    arg.dtype,
                    shape=shape,
                    dim_tags=dim_tags,
                ))

        elif var_name in self.kernel.temporary_variables:
            self.new_temporary_variables[new_var_name] = lp.TemporaryVariable(
                new_var_name, tv.dtype, shape=shape, dim_tags=dim_tags)

        # }}}

        return new_var_name
# Example #24 (score: 0)
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* with ones to a temporary, which is
    created and acts as a buffer. To perform this transformation, the access
    footprint to *var_name* is determined and a temporary of a suitable
    :class:`loopy.AddressSpace` and shape is created.

    By default, the value of the buffered cells in *var_name* are read prior to
    any (read/write) use, and the modified values are written out after use has
    concluded, but for special use cases (e.g. additive accumulation), the
    behavior can be modified using *init_expression* and *store_expression*.

    :arg buffer_inames: The inames across which the buffer should be usable--i.e.
        all possible values of these inames will be covered by the buffer footprint.
        A tuple of inames or a comma-separated string.
    :arg init_expression: Either *None* (indicating the prior value of the buffered
        array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur
        at all.)
    :arg within: If not None, limit the action of the transformation to
        matching contexts.  See :func:`loopy.match.parse_stack_match`
        for syntax.
    :arg temporary_scope: If given, override the choice of
        :class:`AddressSpace` for the created temporary.
    :arg default_tag: The default :ref:`iname-tags` to be assigned to the
        inames used for fetching and storing
    :arg fetch_bounding_box: If the access footprint is non-convex
        (resulting in an error), setting this argument to *True* will force a
        rectangular (and hence convex) superset of the footprint to be
        fetched.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import AddressSpace
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = AddressSpace.LOCAL
        else:
            temporary_scope = AddressSpace.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    # Hoisted out of the loop below: these imports are loop-invariant.
    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript

    # Collect all write accesses to var_name within the matched contexts.
    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'" % var_name)

                # Linear writes to *other* variables are irrelevant here.
                # (Previously this fell through with assignee_name
                # unset/stale, causing a NameError or a spurious match.)
                continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    # One fresh init iname and one fresh store iname per array axis.
    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen(f"{var_name}_init_{dim_name}")
        store_iname = var_name_gen(f"{var_name}_store_{dim_name}")

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Drop inames for axes whose footprint has length 1.
        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    # Subscript of the source array: footprint base offset per axis, plus
    # the init iname for axes with a non-trivial footprint.
    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        # Substitute the referenced array location for 'base' in the
        # user-provided init expression.
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                within_inames=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    # Rewrite accesses to var_name so they hit the buffer instead.
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    # A store instruction is only needed if something wrote to the buffer.
    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if buf_var_name in insn.assignee_var_names():
            did_write = True

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    if store_expression is not False:
        from loopy.kernel.data import Assignment
        # no_sync_with: the store may legitimately overlap the init of the
        # next sweep iteration.
        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([(init_insn_id, "any")]),
                    assignee=store_target,
                    expression=store_expression,
                    within_inames=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))
    else:
        did_write = False

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        # No store: the store inames are unused, so drop their tags.
        for iname in store_inames:
            del new_iname_to_tag[iname]

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        from loopy.preprocess import prepare_for_caching
        buffer_array_cache.store_if_not_present(
                cache_key, prepare_for_caching(kernel))

    return kernel
# Example #25 (score: 0)
def generate(impero_c,
             args,
             scalar_type,
             kernel_name="loopy_kernel",
             index_names=(),
             return_increments=True):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :arg return_increments: Does codegen for Return nodes increment the lvalue, or assign?
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # An immutable default for index_names avoids the shared-mutable-default
    # pitfall; unnamed indices fall back to "i".
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.epsilon = numpy.finfo(scalar_type).resolution
    ctx.scalar_type = scalar_type
    ctx.return_increments = return_increments

    # Create arguments
    data = list(args)
    for i, (temp, dtype) in enumerate(
            assign_dtypes(impero_c.temporaries, scalar_type)):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constant temporaries become read-only, pre-initialized locals.
            data.append(
                lp.TemporaryVariable(name,
                                     shape=temp.shape,
                                     dtype=dtype,
                                     initializer=temp.array,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=True))
        else:
            # Other temporaries get the loop extents prepended to their
            # shape, one axis per free index.
            shape = tuple(index.extent
                          for index in ctx.indices[temp]) + temp.shape
            data.append(
                lp.TemporaryVariable(name,
                                     shape=shape,
                                     dtype=dtype,
                                     initializer=None,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = create_domains(ctx.index_extent.items())

    # Create loopy kernel
    knl = lp.make_function(domains,
                           instructions,
                           data,
                           name=kernel_name,
                           target=lp.CTarget(),
                           seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"],
                           lang_version=(2018, 2))

    # Prevent loop interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    return knl
# Example #26 (score: 0)
    def __init__(self, fft, effective_k, dk, dx):
        """Precompute effective momenta and build the Fourier-space
        projection/decomposition kernels (transverse projection,
        vector <-> polarization, and tensor transverse-traceless
        projection).

        :arg fft: FFT wrapper supplying ``sub_k`` momentum arrays,
            ``rdtype``/``cdtype``, ``grid_shape`` and ``shape()``.
        :arg effective_k: either a callable ``effective_k(k, dx)``
            returning effective wavenumbers, or a number: ``0`` uses the
            wavenumbers unmodified, any other value is taken as the
            spacing of a first-order centered difference whose
            eigenvalues are used instead.
        :arg dk: wavenumber spacing for each axis.
        :arg dx: grid spacing for each axis.
        """
        self.fft = fft

        if not callable(effective_k):
            if effective_k != 0:
                from pystella.derivs import FirstCenteredDifference
                h = effective_k
                effective_k = FirstCenteredDifference(h).get_eigenvalues
            else:

                def effective_k(k, dx):  # pylint: disable=function-redefined
                    return k

        queue = self.fft.sub_k["momenta_x"].queue
        sub_k = list(x.get().astype("int") for x in self.fft.sub_k.values())
        eff_mom_names = ("eff_mom_x", "eff_mom_y", "eff_mom_z")
        self.eff_mom = {}
        for mu, (name, kk) in enumerate(zip(eff_mom_names, sub_k)):
            # Apply effective_k, then zero the Nyquist and zero modes
            # along this axis.
            eff_k = effective_k(dk[mu] * kk.astype(fft.rdtype), dx[mu])
            eff_k[abs(sub_k[mu]) == fft.grid_shape[mu] // 2] = 0.
            eff_k[sub_k[mu] == 0] = 0.

            import pyopencl.array as cla
            self.eff_mom[name] = cla.to_device(queue, eff_k)

        from pymbolic import var, parse
        from pymbolic.primitives import If, Comparison, LogicalAnd
        from pystella import Field
        indices = parse("i, j, k")
        # Symbolic effective-momentum components, looked up per grid point.
        eff_k = tuple(
            var(array)[mu] for array, mu in zip(eff_mom_names, indices))
        fabs, sqrt, conj = parse("fabs, sqrt, conj")
        kmag = sqrt(sum(kk**2 for kk in eff_k))

        from pystella import ElementWiseMap
        vector = Field("vector", shape=(3, ))
        vector_T = Field("vector_T", shape=(3, ))

        # True where all three momentum components (numerically) vanish.
        kvec_zero = LogicalAnd(
            tuple(Comparison(fabs(eff_k[mu]), "<", 1e-14) for mu in range(3)))

        # note: write all output via private temporaries to allow for in-place

        div = var("div")
        div_insn = [(div, sum(eff_k[mu] * vector[mu] for mu in range(3)))]
        # Removes the longitudinal part: v_T = v - k (k.v) / |k|^2.
        self.transversify_knl = ElementWiseMap(
            {
                vector_T[mu]: If(kvec_zero, 0,
                                 vector[mu] - eff_k[mu] / kmag**2 * div)
                for mu in range(3)
            },
            tmp_instructions=div_insn,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        import loopy as lp

        def assign(asignee, expr, **kwargs):
            # Default: loop over the full grid and disable synchronization
            # against other instructions; overridable via kwargs.
            default = dict(within_inames=frozenset(("i", "j", "k")),
                           no_sync_with=[("*", "any")])
            default.update(kwargs)
            return lp.Assignment(asignee, expr, **default)

        kmag, Kappa = parse("kmag, Kappa")
        eps_insns = [
            assign(kmag, sqrt(sum(kk**2 for kk in eff_k))),
            assign(Kappa, sqrt(sum(kk**2 for kk in eff_k[:2])))
        ]

        zero = fft.cdtype.type(0)
        kx_ky_zero = LogicalAnd(
            tuple(Comparison(fabs(eff_k[mu]), "<", 1e-10) for mu in range(2)))
        kz_nonzero = Comparison(fabs(eff_k[2]), ">", 1e-10)

        # Polarization vector eps; the k_x = k_y = 0 axis (where Kappa
        # vanishes) is special-cased to avoid division by zero.
        eps = var("eps")
        eps_insns.extend([
            assign(
                eps[0],
                If(kx_ky_zero, If(kz_nonzero, fft.cdtype.type(1 / 2**.5),
                                  zero),
                   (eff_k[0] * eff_k[2] / kmag - 1j * eff_k[1]) / Kappa /
                   2**.5)),
            assign(
                eps[1],
                If(kx_ky_zero,
                   If(kz_nonzero, fft.cdtype.type(1j / 2**(1 / 2)),
                      zero), (eff_k[1] * eff_k[2] / kmag + 1j * eff_k[0]) /
                   Kappa / 2**.5)),
            assign(eps[2], If(kx_ky_zero, zero, -Kappa / kmag / 2**.5))
        ])

        plus, minus, lng = Field("plus"), Field("minus"), Field("lng")

        plus_tmp, minus_tmp = parse("plus_tmp, minus_tmp")
        pol_isns = [(plus_tmp,
                     sum(vector[mu] * conj(eps[mu]) for mu in range(3))),
                    (minus_tmp, sum(vector[mu] * eps[mu] for mu in range(3)))]

        args = [
            lp.TemporaryVariable("kmag"),
            lp.TemporaryVariable("Kappa"),
            lp.TemporaryVariable("eps", shape=(3, )), ...
        ]

        self.vec_to_pol_knl = ElementWiseMap(
            {
                plus: plus_tmp,
                minus: minus_tmp
            },
            tmp_instructions=eps_insns + pol_isns,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        vector_tmp = var("vector_tmp")
        vec_insns = [(vector_tmp[mu], plus * eps[mu] + minus * conj(eps[mu]))
                     for mu in range(3)]

        self.pol_to_vec_knl = ElementWiseMap(
            {vector[mu]: vector_tmp[mu]
             for mu in range(3)},
            tmp_instructions=eps_insns + vec_insns,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        ksq = sum(kk**2 for kk in eff_k)
        # Longitudinal component; zero where the whole k-vector vanishes.
        lng_rhs = If(kvec_zero, 0, -div / ksq * 1j)
        self.vec_decomp_knl = ElementWiseMap(
            {
                plus: plus_tmp,
                minus: minus_tmp,
                lng: lng_rhs
            },
            tmp_instructions=eps_insns + pol_isns + div_insn,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )
        # Variant with the longitudinal part scaled by |k| (note ksq**.5).
        lng_rhs = If(kvec_zero, 0, -div / ksq**.5 * 1j)
        self.vec_decomp_knl_times_abs_k = ElementWiseMap(
            {
                plus: plus_tmp,
                minus: minus_tmp,
                lng: lng_rhs
            },
            tmp_instructions=eps_insns + pol_isns + div_insn,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        from pystella.sectors import tensor_index as tid

        eff_k_hat = tuple(kk / sqrt(sum(kk**2 for kk in eff_k))
                          for kk in eff_k)
        hij = Field("hij", shape=(6, ))
        hij_TT = Field("hij_TT", shape=(6, ))

        # Transverse projector P_ab = delta_ab - k_hat_a k_hat_b over the
        # six independent (a <= b) components.
        Pab = var("P")
        Pab_insns = [(Pab[tid(a, b)], (If(Comparison(a, "==", b), 1, 0) -
                                       eff_k_hat[a - 1] * eff_k_hat[b - 1]))
                     for a in range(1, 4) for b in range(a, 4)]

        hij_TT_tmp = var("hij_TT_tmp")
        TT_insns = [(hij_TT_tmp[tid(a, b)],
                     sum((Pab[tid(a, c)] * Pab[tid(d, b)] -
                          Pab[tid(a, b)] * Pab[tid(c, d)] / 2) * hij[tid(c, d)]
                         for c in range(1, 4) for d in range(1, 4)))
                    for a in range(1, 4) for b in range(a, 4)]
        # note: where conditionals (branch divergence) go can matter:
        # this kernel is twice as fast when putting the branching in the global
        # write, rather than when setting hij_TT_tmp
        write_insns = [(hij_TT[tid(a,
                                   b)], If(kvec_zero, 0, hij_TT_tmp[tid(a,
                                                                        b)]))
                       for a in range(1, 4) for b in range(a, 4)]
        self.tt_knl = ElementWiseMap(
            write_insns,
            tmp_instructions=Pab_insns + TT_insns,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        tensor_to_pol_insns = {
            plus:
            sum(hij[tid(c, d)] * conj(eps[c - 1]) * conj(eps[d - 1])
                for c in range(1, 4) for d in range(1, 4)),
            minus:
            sum(hij[tid(c, d)] * eps[c - 1] * eps[d - 1] for c in range(1, 4)
                for d in range(1, 4))
        }
        self.tensor_to_pol_knl = ElementWiseMap(
            tensor_to_pol_insns,
            tmp_instructions=eps_insns,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )

        pol_to_tensor_insns = {
            hij[tid(a, b)]: (plus * eps[a - 1] * eps[b - 1] +
                             minus * conj(eps[a - 1]) * conj(eps[b - 1]))
            for a in range(1, 4) for b in range(a, 4)
        }
        self.pol_to_tensor_knl = ElementWiseMap(
            pol_to_tensor_insns,
            tmp_instructions=eps_insns,
            args=args,
            lsize=(32, 1, 1),
            rank_shape=fft.shape(True),
        )
# Example #27 (score: 0)
    def __init__(self, decomp, histograms, num_bins, dtype, **kwargs):
        """Construct a histogramming kernel.

        :arg decomp: domain-decomposition object (stored on the instance).
        :arg histograms: mapping whose values are ``(bin_expr, weight_expr)``
            pairs; each pair defines one histogram.
        :arg num_bins: number of bins per histogram.
        :arg dtype: dtype of the histogram weights.

        Remaining keyword arguments are forwarded to the superclass; the
        ``args``, ``fixed_parameters`` and ``silenced_warnings`` entries are
        extended (new containers are built, so caller-supplied ones are not
        mutated in place).
        """
        self.decomp = decomp
        self.histograms = histograms
        self.num_bins = num_bins
        num_hists = len(histograms)

        from pymbolic import var
        _bin = var("bin")
        b = var("b")
        bb = var("bb")
        hist = var("hist")
        temp = var("temp")
        weight_val = var("weight")

        # Build a new list (rather than "+=") so a caller-supplied args
        # list popped from kwargs is not mutated in place.
        args = kwargs.pop("args", []) + [
            lp.TemporaryVariable("temp",
                                 dtype,
                                 shape=(
                                     num_hists,
                                     self.num_bins,
                                 ),
                                 for_atomic=True,
                                 address_space=lp.AddressSpace.LOCAL),
            lp.TemporaryVariable("bin", "int", shape=(num_hists, )),
            lp.TemporaryVariable("weight", dtype, shape=(num_hists, )),
            lp.GlobalArg("hist",
                         dtype,
                         shape=(
                             num_hists,
                             self.num_bins,
                         ),
                         for_atomic=True),
        ]

        # Copy before adding the counts so the caller's dict is untouched;
        # num_bins/num_hists intentionally override caller-supplied values.
        fixed_pars = dict(kwargs.pop("fixed_parameters", {}),
                          num_bins=num_bins, num_hists=num_hists)

        silenced_warnings = kwargs.pop("silenced_warnings", []) + [
            "write_race(tmp*)", "write_race(glb*)"]

        domains = """
        [Nx, Ny, Nz, num_bins] ->
           {[i, j, k, b, bb]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz and 0<=b<num_bins
                              and 0<=bb<num_bins}
        """

        # Zero the global histograms.  Note frozenset(("bb",)), not
        # frozenset("bb"): the latter iterates the string's characters and
        # yields {"b"}, the wrong iname for the bb-indexed write.
        insns = [
            lp.Assignment(hist[j, bb],
                          0,
                          id=f"zero_hist_{j}",
                          within_inames=frozenset(("bb",)),
                          atomicity=(lp.AtomicInit(str(hist)), ))
            for j in range(num_hists)
        ]
        insns.append(
            lp.BarrierInstruction("post_zero_barrier",
                                  synchronization_kind="global"))
        # Zero the local accumulators.
        insns.extend([
            lp.Assignment(temp[j, bb],
                          0,
                          id=f"zero_temp_{j}",
                          within_inames=frozenset(("j", "bb")),
                          atomicity=(lp.AtomicInit(str(temp)), ))
            for j in range(num_hists)
        ])
        # Per-point binning: compute bin index and weight, then atomically
        # accumulate into the local histogram.
        for j, (bin_expr, weight_expr) in enumerate(histograms.values()):
            insns.extend([
                lp.Assignment(_bin[j],
                              var("floor")(bin_expr),
                              id=f"set_bin_{j}",
                              within_inames=frozenset(("i", "j", "k"))),
                lp.Assignment(weight_val[j],
                              weight_expr,
                              id=f"set_weight_{j}",
                              within_inames=frozenset(("i", "j", "k"))),
                lp.Assignment(temp[j, _bin[j]],
                              temp[j, _bin[j]] + weight_val[j],
                              id=f"tmp_{j}",
                              within_inames=frozenset(("i", "j", "k")),
                              atomicity=(lp.AtomicUpdate(str(temp)), ))
            ])

        # Fold the local accumulators into the global histograms.
        insns.extend([
            lp.Assignment(hist[j, b],
                          hist[j, b] + temp[j, b],
                          id=f"glb_{j}",
                          within_inames=frozenset(("j", "b")),
                          atomicity=(lp.AtomicUpdate(str(hist)), ))
            for j in range(num_hists)
        ])

        lsize = [min(256, self.num_bins)]

        super().__init__(insns,
                         args=args,
                         lsize=lsize,
                         fixed_parameters=fixed_pars,
                         domains=domains,
                         silenced_warnings=silenced_warnings,
                         **kwargs)
# Example #28 (score: 0)
def precompute(
        kernel,
        subst_use,
        sweep_inames=(),
        within=None,
        storage_axes=None,
        temporary_name=None,
        precompute_inames=None,
        precompute_outer_inames=None,
        storage_axis_to_tag=None,

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,
        dtype=None,
        fetch_bounding_box=False,
        temporary_address_space=None,
        compute_insn_id=None,
        **kwargs):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

        The following objects may be given for *subst_use*:

        * The name of the substitution rule.

        * The tagged name ("name$tag") of the substitution rule.

        * A list of invocations of the substitution rule.
          This list of invocations, when swept across *sweep_inames*, then serves
          to define the footprint of the precomputation.

          Invocations may be tagged ("name$tag") to filter out a subset of the
          usage sites of the substitution rule. (Namely those usage sites that
          use the same tagged name.)

          Invocations may be given as a string or as a
          :class:`pymbolic.primitives.Expression` object.

          If only one invocation is to be given, then the only entry of the list
          may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames to be swept.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
        the compute instruction is nested. If *None*, make an educated guess.
        May also be specified as a comma-separated string.

    :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
        inames created to perform the precomputation. The current default will
        make them local axes and automatically split them to fit the work
        group size, but this default will disappear in favor of simply leaving them
        untagged in 2019. For 2018, a warning will be issued if no *default_tag* is
        specified.

    :arg compute_insn_id: The ID of the instruction generated to perform the
        precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # Avoid shared mutable default arguments: materialize a fresh dict per
    # call instead of defaulting to a module-lifetime {}.
    if storage_axis_to_tag is None:
        storage_axis_to_tag = {}

    # {{{ unify temporary_address_space / temporary_scope

    temporary_scope = kwargs.pop("temporary_scope", None)

    from loopy.kernel.data import AddressSpace
    if temporary_scope is not None:
        from warnings import warn
        warn(
            "temporary_scope is deprecated. Use temporary_address_space instead",
            DeprecationWarning,
            stacklevel=2)

        if temporary_address_space is not None:
            raise LoopyError(
                "may not specify both temporary_address_space and "
                "temporary_scope")

        temporary_address_space = temporary_scope

    del temporary_scope

    # }}}

    if kwargs:
        raise TypeError("unrecognized keyword arguments: %s" %
                        ", ".join(kwargs.keys()))

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname" % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [
            iname.strip() for iname in precompute_inames.split(",")
        ]

    if isinstance(precompute_outer_inames, str):
        precompute_outer_inames = frozenset(
            iname.strip() for iname in precompute_outer_inames.split(","))

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    # All uses must agree on the (name, tag) of the substitution rule;
    # Call-shaped uses additionally serve as footprint generators.
    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                                 "on rule name and tag")

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    try:
        subst = kernel.substitutions[subst_name]
    except KeyError:
        raise LoopyError("substitution rule '%s' not found" % subst_name)

    c_subst_name = subst_name.replace(".", "_")

    # {{{ handle default_tag

    from loopy.transform.data import _not_provided \
            as transform_data_not_provided

    if default_tag is _not_provided or default_tag is transform_data_not_provided:
        # no need to warn for scalar precomputes
        if sweep_inames:
            from warnings import warn
            warn(
                "Not specifying default_tag is deprecated, and default_tag "
                "will become mandatory in 2019.x. "
                "Pass default_tag=\"l.auto\" to match the current default, "
                "or pass default_tag=None to leave the loops untagged, which "
                "is the recommended behavior.",
                DeprecationWarning,
                stacklevel=(

                    # In this case, we came here through add_prefetch. Increase
                    # the stacklevel.
                    3 if default_tag is transform_data_not_provided else 2))

        default_tag = "l.auto"

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    # }}}

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        # Variable and Call were already imported above.
        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                                 "be substitution rule invocation")

            access_descriptors.append(
                RuleAccessDescriptor(identifier=access_descriptor_id(
                    args, None),
                                     args=args))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(rule_mapping_context, kernel, subst_name,
                                      subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.MultiAssignmentBase):
                for assignee in insn.assignees:
                    invg(assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = (get_dependencies(submap(subst.expression)) -
                    frozenset(subst.arguments)) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: " +
                           ", ".join(extra_storage_axes - value_inames -
                                     expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError(
                    "must specify a sufficient number of "
                    "storage_axes to uniquely determine the meaning "
                    "of the given precompute_inames. (%d storage_axes "
                    "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (set(precompute_inames)
                                         & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                                   "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
        accdesc.copy(storage_axis_exprs=storage_axis_exprs(
            storage_axis_sources, accdesc.args))
        for accdesc in access_descriptors
    ]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(
                        iname) != domch.leaf_domain_index:
                    raise RuntimeError(
                        "sweep iname '%s' is not 'at home' in the "
                        "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                               access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError(
                        "precompute axis %d (1-based) was "
                        "eliminated as "
                        "having length 1 but also mapped to existing "
                        "iname '%s'" % (i + 1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
            iname + "'" for iname in non1_storage_axis_names
        ]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain,
            primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                  dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                    dim_idx)

                mod_domain = mod_domain.add_constraint(
                    isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        def add_assumptions(d):
            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
            assumptions, domain = isl.align_two(assumption_non_param, d)
            return assumptions & domain

        # {{{ check that we got the desired domain

        check_domain = add_assumptions(
            check_domain.project_out_except(primed_non1_saxis_names,
                                            [isl.dim_type.set]))

        mod_check_domain = add_assumptions(mod_domain)

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(
                dt, dim_idx, saxis + "'")

        mod_check_domain = mod_check_domain.project_out_except(
            primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                             "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = add_assumptions(
            mod_domain.project_out_except(orig_domain_inames,
                                          [isl.dim_type.set]))

        check_domain = add_assumptions(domch.domain)

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError(
                "original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee[tuple(
            var(iname) for iname in non1_storage_axis_names)]

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[prior_storage_axis_name_dict.get(
            arg_name, arg_name)] = arg + bi

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())

    # parse_stack_match was already imported above.
    expr_subst_map = RuleAwareSubstitutionMapper(
        rule_mapping_context,
        make_subst_func(storage_axis_subst_dict),
        within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name)

    compute_insn = Assignment(
        id=compute_insn_id,
        assignee=assignee,
        expression=compute_expression,
        # within_inames determined below
    )
    compute_dep_id = compute_insn_id
    added_compute_insns = [compute_insn]

    if temporary_address_space == AddressSpace.GLOBAL:
        barrier_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name + "_barrier")
        from loopy.kernel.instruction import BarrierInstruction
        barrier_insn = BarrierInstruction(id=barrier_insn_id,
                                          depends_on=frozenset(
                                              [compute_insn_id]),
                                          synchronization_kind="global",
                                          mem_kind="global")
        compute_dep_id = barrier_insn_id

        added_compute_insns.append(barrier_insn)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    # SubstitutionRuleExpander was already imported above; build a fresh
    # expander for the (possibly updated) substitutions.
    expander = SubstitutionRuleExpander(kernel.substitutions)

    invr = RuleInvocationReplacer(rule_mapping_context,
                                  subst_name,
                                  subst_tag,
                                  within,
                                  access_descriptors,
                                  abm,
                                  storage_axis_names,
                                  storage_axis_sources,
                                  non1_storage_axis_names,
                                  temporary_name,
                                  compute_insn_id,
                                  compute_dep_id,
                                  compute_read_variables=get_dependencies(
                                      expander(compute_expression)))

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(instructions=added_compute_insns +
                         kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ add dependencies to compute insn

    kernel = kernel.copy(instructions=[
        insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) if insn.
        id == compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ propagate storage iname subst to dependencies of compute instructions

    from loopy.kernel.tools import find_recursive_dependencies
    compute_deps = find_recursive_dependencies(kernel,
                                               frozenset([compute_insn_id]))

    # FIXME: Need to verify that there are no outside dependencies
    # on compute_deps

    prior_storage_axis_names = frozenset(storage_axis_subst_dict)

    new_insns = []
    for insn in kernel.instructions:
        if (insn.id in compute_deps
                and insn.within_inames & prior_storage_axis_names):
            insn = (insn.with_transformed_expressions(
                lambda expr: expr_subst_map(expr, kernel, insn)).copy(
                    within_inames=frozenset(
                        storage_axis_subst_dict.get(iname, var(iname)).name
                        for iname in insn.within_inames)))

            new_insns.append(insn)
        else:
            new_insns.append(insn)

    kernel = kernel.copy(instructions=new_insns)

    # }}}

    # {{{ determine inames for compute insn

    if precompute_outer_inames is None:
        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
        precompute_outer_inames = (
            frozenset(non1_storage_axis_names)
            | frozenset((expanding_usage_arg_deps | value_inames) -
                        sweep_inames_set)
            | guess_iname_deps_based_on_var_use(kernel, compute_insn))
    else:
        if not isinstance(precompute_outer_inames, frozenset):
            raise TypeError("precompute_outer_inames must be a frozenset")

        precompute_outer_inames = precompute_outer_inames \
                | frozenset(non1_storage_axis_names)

    kernel = kernel.copy(instructions=[
        insn.copy(within_inames=precompute_outer_inames) if insn.id ==
        compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is not None:
        dtype = np.dtype(dtype)

    if temporary_address_space is None:
        temporary_address_space = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
            name=temporary_name,
            dtype=dtype,
            base_indices=(0, ) * len(new_temp_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_address_space,
            dim_names=tuple(non1_storage_axis_names))

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                                 "do not match (existing: %s, new: %s)" %
                                 (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0, ) * len(new_temp_shape):
            # Fixed error message: this check is about base indices, not
            # dimension counts.
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching base indices (existing: %s, new: %s) " %
                (temporary_name, temp_var.base_indices,
                 (0, ) * len(new_temp_shape)))

        new_temp_shape = tuple(
            max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_address_space == temp_var.address_space:
            pass
        elif temporary_address_space is lp.auto:
            temporary_address_space = temp_var.address_space
        elif temp_var.address_space is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                             "have matching scopes (existing: %s, new: %s)" %
                             (temporary_name,
                              AddressSpace.stringify(temp_var.address_space),
                              AddressSpace.stringify(temporary_address_space)))

        temp_var = temp_var.copy(address_space=temporary_address_space)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type

    if filter_iname_tags_by_type(new_iname_to_tag.values(),
                                 AutoFitLocalIndexTag):
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel
Пример #29
0
def test_fuzz_expression_code_gen(ctx_factory, expr_type, random_seed):
    """Fuzz-test loopy's expression code generation.

    Draws random expressions of the given *expr_type*, evaluates each one
    with pymbolic as a reference, compiles all of them into a single loopy
    kernel, runs it, and checks the device results against the references.

    :arg ctx_factory: callable returning a PyOpenCL context
    :arg expr_type: one of ``"real"``, ``"complex"``, ``"int"``,
        ``"int_nonneg"``
    :arg random_seed: seed for the random expression generator
    :raises AssertionError: if a kernel result disagrees with its
        reference value beyond a 1e-10 tolerance
    """
    from pymbolic import evaluate

    def get_numpy_type(x):
        # Dtype for the temporary holding value *x*, wide enough for the
        # requested expression type.
        if expr_type in ["real", "complex"]:
            if isinstance(x, (complex, np.complexfloating)):
                return np.complex128
            else:
                return np.float64

        elif expr_type in ["int", "int_nonneg"]:
            return np.int64

        else:
            raise ValueError("unknown expr_type: %s" % expr_type)

    from random import seed

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    seed(random_seed)

    data = []
    instructions = []

    ref_values = {}

    # Storage type of each result argument: wide enough for any value of
    # the expression type.
    if expr_type in ["real", "complex"]:
        result_type = np.complex128
    elif expr_type in ["int", "int_nonneg"]:
        result_type = np.int64
    else:
        raise ValueError("unknown expr_type: %s" % expr_type)

    var_names = []

    fuzz_iter = iter(generate_random_fuzz_examples(expr_type))

    # Collect ten expressions whose reference evaluation succeeds;
    # expressions that overflow or divide by zero are skipped.
    count = 0
    while count < 10:
        i, expr, var_values = next(fuzz_iter)

        var_name = "expr%d" % i

        print(expr)
        #assert_parse_roundtrip(expr)

        if expr_type in ["int", "int_nonneg"]:
            # Reject expressions with intermediate values outside int32
            # range: the generated code would wrap around and the
            # comparison against the arbitrary-precision reference would
            # fail spuriously.
            result_type_iinfo = np.iinfo(np.int32)
            bceval_mapper = BoundsCheckingEvaluationMapper(
                var_values,
                lbound=result_type_iinfo.min,
                ubound=result_type_iinfo.max)
            print(expr)
            try:
                ref_values[var_name] = bceval_mapper(expr)
            except BoundsCheckError:
                print(expr)
                print("BOUNDS CHECK FAILED")
                continue
        else:
            try:
                ref_values[var_name] = evaluate(expr, var_values)
            except ZeroDivisionError:
                continue

        count += 1

        data.append(lp.GlobalArg(var_name, result_type, shape=()))
        data.extend([
            lp.TemporaryVariable(name, get_numpy_type(val))
            for name, val in var_values.items()
        ])
        instructions.extend([
            lp.Assignment(name,
                          get_numpy_type(val)(val))
            for name, val in var_values.items()
        ])
        instructions.append(lp.Assignment(var_name, expr))

        if expr_type == "int_nonneg":
            var_names.extend(var_values)

    knl = lp.make_kernel("{ : }", instructions, data, seq_dependencies=True)

    if var_names:
        # Record the non-negativity of the int_nonneg inputs so loopy may
        # exploit it during code generation. Guarded: with no variables the
        # isl set string would degenerate to "[] -> { : }".
        import islpy as isl
        knl = lp.assume(
            knl,
            isl.BasicSet(
                "[%s] -> { : %s}" %
                (", ".join(var_names), " and ".join("%s >= 0" % name
                                                    for name in var_names))))

    knl = lp.set_options(knl, return_dict=True)
    print(knl)
    evt, lp_values = knl(queue, out_host=True)

    for name, ref_value in ref_values.items():
        lp_value = lp_values[name]
        if expr_type in ["real", "complex"]:
            # Relative error for floating-point results.
            err = abs(ref_value - lp_value) / abs(ref_value)
        elif expr_type in ["int", "int_nonneg"]:
            # Exact comparison (as absolute error) for integer results.
            err = abs(ref_value - lp_value)
        else:
            raise ValueError("unknown expr_type: %s" % expr_type)

        if abs(err) > 1e-10:
            print(80 * "-")
            print(knl)
            print(80 * "-")
            print(lp.generate_code_v2(knl).device_code())
            print(80 * "-")
            print(f"WRONG: {name} rel error={err:g}")
            print("reference=%r" % ref_value)
            print("loopy=%r" % lp_value)
            print(80 * "-")
            # Fail with a real assertion instead of the original's
            # deliberate "1 / 0" (which surfaced as ZeroDivisionError).
            raise AssertionError(
                f"result mismatch for {name}: rel error={err:g}")

    print(lp.generate_code_v2(knl).device_code())
# Example #30
    def get_kernel(self):
        """Build and return the loopy kernel for the near-field evaluation.

        The kernel loops over target boxes (``tbox``), the targets in each
        box (``tid``), the neighboring source boxes (``sbox``), and the
        sources in each source box (``sid``); it looks up precomputed table
        entries, rescales them, and sums the contributions into ``result``.

        The ALL-CAPS placeholder tokens in the kernel template (e.g.
        ``COMPUTE_VEC_ID``, ``COMPUTE_SCALING``, ``EXTERIOR_PART``) are
        substituted with code snippets produced by the ``codegen_*``
        methods before the kernel is constructed.
        """

        # Table data / result dtype follows the integral kernel being used.
        if self.integral_kernel.is_complex_valued:
            potential_dtype = np.complex128
        else:
            potential_dtype = np.float64

        # NOTE(review): the template reads "box_source_counts_cuml" (sic,
        # vs. "box_target_counts_cumul" above) — the name must match the
        # array supplied by the caller; confirm before renaming.
        lpknl = loopy.make_kernel(  # NOQA
            [
                "{ [ tbox ] : 0 <= tbox < n_tgt_boxes }",
                "{ [ tid, sbox ] : 0 <= tid < n_box_targets and \
                        sbox_begin <= sbox < sbox_end }",
                "{ [ sid ] : 0 <= sid < n_box_sources }",
            ],
            """
            for tbox
                <> target_box_id    = target_boxes[tbox]
                <> box_target_beg   = box_target_starts[target_box_id]
                <> n_box_targets    = box_target_counts_cumul[target_box_id]

                <> sbox_begin = neighbor_source_boxes_starts[tbox]
                <> sbox_end   = neighbor_source_boxes_starts[tbox+1]

                <> tbox_level  = box_levels[target_box_id]
                <> tbox_extent = root_extent * (1.0 / (2**tbox_level))

                for tid
                    <> target_id = box_target_beg + tid
                end

                for tid, sbox
                    <> source_box_id  = source_boxes[sbox]
                    <> n_box_sources  = box_source_counts_cuml[source_box_id]
                    <> box_source_beg = box_source_starts[source_box_id]

                    <> sbox_level  = box_levels[source_box_id]
                    <> sbox_extent = root_extent * (1.0 / (2**sbox_level))

                    table_lev_tmp = GET_TABLE_LEVEL {id=tab_lev_tmp}
                    table_lev = round(table_lev_tmp) {id=tab_lev,dep=tab_lev_tmp}

                    vec_id_tmp = COMPUTE_VEC_ID {id=vec_id_tmp}
                    vec_id = round(vec_id_tmp) {id=vec_id,dep=vec_id_tmp}
                    <> case_id = case_indices[vec_id] {dep=vec_id}

                    <> scaling = COMPUTE_SCALING

                    for sid

                        <> tgt_scaling = COMPUTE_TGT_SCALING
                        <> tgt_displacement = COMPUTE_TGT_DISPLACEMENT
                        tgt_table_lev_tmp = GET_TGT_TABLE_LEVEL {id=tgttab_lev_tmp}
                        tgt_table_lev = round(tgt_table_lev_tmp) \
                                {id=tgttab_lev,dep=tgttab_lev_tmp}
                        <> ext_nmlz = exterior_mode_nmlz[tgt_table_lev, tid] \
                                * tgt_scaling + tgt_displacement \
                                {id=extnmlz,dep=tgttab_lev}

                        <> source_id = box_source_beg + sid
                        <> pair_id = sid * n_box_targets + tid
                        <> entry_id = case_id * \
                                      (n_box_targets * n_box_sources) \
                                      + pair_id

                        <> displacement = COMPUTE_DISPLACEMENT

                        <> integ = table_data[table_lev, entry_id] * scaling \
                                   + displacement {id=integ,dep=tab_lev}
                        # <> source_id_tree = user_source_ids[source_id]
                        <> coef = source_coefs[source_id] {id=coef}

                        # <> target_id_user = sorted_target_ids[target_id]

                        #db_table_lev[target_id] = table_lev_tmp {dep=tab_lev}
                        #db_case_id[target_id] = case_id
                        #db_vec_id[target_id] = vec_id
                        #db_n_box_targets[target_id] = n_box_targets
                        #db_n_box_sources[target_id] = n_box_sources
                        #db_entry_id[target_id] = entry_id

                    end
                end

                for tid

                    result[target_id] = sum((sbox, sid),
                        coef * integ) + EXTERIOR_PART {dep=integ:coef:extnmlz}

                    # Try inspecting case_id if something goes wrong
                    # (like segmentation fault) and look for -1's
                    # result[target_id] = min((sbox, sid), case_id)
                    # result[target_id] = vec_id_tmp

                end
            end
            """.replace("COMPUTE_VEC_ID", self.codegen_vec_id()).replace(
                "COMPUTE_SCALING", self.codegen_compute_scaling()).replace(
                    "COMPUTE_DISPLACEMENT",
                    self.codegen_compute_displacement()).replace(
                        "COMPUTE_TGT_SCALING",
                        self.codegen_compute_scaling('tbox')).replace(
                            "COMPUTE_TGT_DISPLACEMENT",
                            self.codegen_compute_displacement('tbox')).replace(
                                "GET_TABLE_LEVEL",
                                self.codegen_get_table_level()).replace(
                                    "GET_TGT_TABLE_LEVEL",
                                    self.codegen_get_table_level(
                                        'tbox')).replace(
                                            "EXTERIOR_PART",
                                            self.codegen_exterior_part()),
            [
                # Explicit declarations; remaining arguments are inferred
                # via the trailing "...".
                loopy.TemporaryVariable("vec_id", np.int32),
                loopy.TemporaryVariable("vec_id_tmp", np.float64),
                loopy.TemporaryVariable("table_lev", np.int32),
                loopy.TemporaryVariable("table_lev_tmp", np.float64),
                loopy.TemporaryVariable("tgt_table_lev", np.int32),
                loopy.TemporaryVariable("tgt_table_lev_tmp", np.float64),
                loopy.ValueArg("encoding_base", np.int32),
                loopy.GlobalArg("mode_nmlz", potential_dtype,
                                "n_tables, n_q_points"),
                loopy.GlobalArg("exterior_mode_nmlz", potential_dtype,
                                "n_tables, n_q_points"),
                loopy.GlobalArg("table_data", potential_dtype,
                                "n_tables, n_table_entries"),
                loopy.GlobalArg("source_boxes", np.int32, "n_source_boxes"),
                loopy.GlobalArg("box_centers", None, "dim, aligned_nboxes"),
                loopy.ValueArg("aligned_nboxes", np.int32),
                loopy.ValueArg("table_root_extent", np.float64),
                # One ValueArg with a comma-separated name declares several
                # scalar arguments at once.
                loopy.ValueArg(
                    "dim, n_source_boxes, n_tables, "
                    "n_q_points, n_table_entries",
                    np.int32,
                ),
                "...",
            ],
            name="near_field",
            lang_version=(2018, 2))

        # Return results as a dict keyed by argument name.
        # lpknl = loopy.set_options(lpknl, write_code=True)
        lpknl = loopy.set_options(lpknl, return_dict=True)

        return lpknl