Exemplo n.º 1
0
Arquivo: data.py Projeto: shwina/loopy
def _add_kernel_axis(kernel, axis_name, start, stop, base_inames):
    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, base_inames)

    domain = domch.domain
    new_dim_idx = domain.dim(dim_type.set)
    domain = (domain
            .insert_dims(dim_type.set, new_dim_idx, 1)
            .set_dim_name(dim_type.set, new_dim_idx, axis_name))

    from loopy.symbolic import get_dependencies
    deps = get_dependencies(start) | get_dependencies(stop)
    assert deps <= kernel.all_params()

    param_names = domain.get_var_names(dim_type.param)
    for dep in deps:
        if dep not in param_names:
            new_dim_idx = domain.dim(dim_type.param)
            domain = (domain
                    .insert_dims(dim_type.param, new_dim_idx, 1)
                    .set_dim_name(dim_type.param, new_dim_idx, dep))

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), axis_name, start, stop)

    domain = domain & slab

    return kernel.copy(domains=domch.get_domains_with(domain))
Exemplo n.º 2
0
def _add_kernel_axis(kernel, axis_name, start, stop, base_inames):
    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, base_inames)

    domain = domch.domain
    new_dim_idx = domain.dim(dim_type.set)
    domain = (domain
            .insert_dims(dim_type.set, new_dim_idx, 1)
            .set_dim_name(dim_type.set, new_dim_idx, axis_name))

    from loopy.symbolic import get_dependencies
    deps = get_dependencies(start) | get_dependencies(stop)
    assert deps <= kernel.all_params()

    param_names = domain.get_var_names(dim_type.param)
    for dep in deps:
        if dep not in param_names:
            new_dim_idx = domain.dim(dim_type.param)
            domain = (domain
                    .insert_dims(dim_type.param, new_dim_idx, 1)
                    .set_dim_name(dim_type.param, new_dim_idx, dep))

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), axis_name, start, stop)

    domain = domain & slab

    return kernel.copy(domains=domch.get_domains_with(domain))
Exemplo n.º 3
0
    def process_set(s):
        var_dict = s.get_var_dict()

        if split_iname not in var_dict:
            return s

        orig_dim_type, _ = var_dict[split_iname]

        outer_var_nr = s.dim(orig_dim_type)
        inner_var_nr = s.dim(orig_dim_type)+1

        s = s.add_dims(orig_dim_type, 2)
        s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
        s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)

        from loopy.isl_helpers import make_slab

        space = s.get_space()
        inner_constraint_set = (
                make_slab(space, inner_iname, 0, inner_length)
                # name = inner + length*outer
                .add_constraint(isl.Constraint.eq_from_names(
                    space, {
                        split_iname: 1,
                        inner_iname: -1,
                        outer_iname: -inner_length})))

        name_dim_type, name_idx = space.get_var_dict()[split_iname]
        s = s.intersect(inner_constraint_set)

        if within is None:
            s = s.project_out(name_dim_type, name_idx, 1)

        return s
Exemplo n.º 4
0
    def cumsum(self, arg):
        """
        Registers  a substitution rule in order to cumulatively sum the
        elements of array ``arg`` along ``axis``. Mimics :func:`numpy.cumsum`.

        :return: An instance of :class:`numloopy.ArraySymbol` which is
            which is registered as the cumulative summed-substitution rule.
        """
        # Note: this can remain as a substitution but loopy does not have
        # support for translating inames for substitutions to the kernel
        # domains
        assert len(arg.shape) == 1
        i_iname = self.name_generator(based_on="i")
        j_iname = self.name_generator(based_on="i")

        space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
                                            [i_iname, j_iname])
        domain = isl.BasicSet.universe(space)
        arg_name = self.name_generator(based_on="arr")
        subst_name = self.name_generator(based_on="subst")
        domain = domain & make_slab(space, i_iname, 0, arg.shape[0])
        domain = domain.add_constraint(
            isl.Constraint.ineq_from_names(space, {j_iname: 1}))
        domain = domain.add_constraint(
            isl.Constraint.ineq_from_names(space, {
                j_iname: -1,
                i_iname: 1,
                1: -1
            }))
        cumsummed_arg = ArraySymbol(stack=self,
                                    name=arg_name,
                                    shape=arg.shape,
                                    dtype=arg.dtype)
        cumsummed_subst = ArraySymbol(stack=self,
                                      name=subst_name,
                                      shape=arg.shape,
                                      dtype=arg.dtype)
        subst_iname = self.name_generator(based_on="i")
        rule = lp.SubstitutionRule(
            subst_name, (subst_iname, ),
            Subscript(Variable(arg_name), (Variable(subst_iname), )))

        from loopy.library.reduction import SumReductionOperation

        insn = lp.Assignment(assignee=Subscript(Variable(arg_name),
                                                (Variable(i_iname), )),
                             expression=lp.Reduction(
                                 SumReductionOperation(), (j_iname, ),
                                 parse('{}({})'.format(arg.name, j_iname))))
        self.data.append(cumsummed_arg)
        self.substs_to_arrays[subst_name] = arg_name
        self.register_implicit_assignment(insn)
        self.domains.append(domain)

        self.register_substitution(rule)
        return cumsummed_subst
Exemplo n.º 5
0
    def sum(self, arg, axis=None):
        """
        Registers  a substitution rule in order to sum the elements of array
        ``arg`` along ``axis``.

        :return: An instance of :class:`numloopy.ArraySymbol` which is
            which is registered as the sum-substitution rule.
        """
        if isinstance(axis, int):
            axis = (axis, )

        if not axis:
            axis = tuple(range(len(arg.shape)))

        inames = [self.name_generator(based_on="i") for _ in arg.shape]

        space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, inames)
        domain = isl.BasicSet.universe(space)
        for axis_len, iname in zip(arg.shape, inames):
            domain &= make_slab(space, iname, 0, axis_len)

        self.domains.append(domain)

        reduction_inames = tuple(iname for i, iname in enumerate(inames)
                                 if i in axis)
        left_inames = tuple(iname for i, iname in enumerate(inames)
                            if i not in axis)

        def _one_if_empty(t):
            if t:
                return t
            else:
                return (1, )

        subst_name = self.name_generator(based_on="subst")

        summed_arg = ArraySymbol(
            stack=self,
            name=subst_name,
            shape=_one_if_empty(
                tuple(axis_len for i, axis_len in enumerate(arg.shape)
                      if i not in axis)),
            dtype=arg.dtype)

        from loopy.library.reduction import SumReductionOperation

        rule = lp.SubstitutionRule(
            subst_name, left_inames,
            lp.Reduction(SumReductionOperation(), reduction_inames,
                         parse('{}({})'.format(arg.name, ', '.join(inames)))))
        self.register_substitution(rule)

        return summed_arg
Exemplo n.º 6
0
def generate_vectorize_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Exemplo n.º 7
0
Arquivo: loop.py Projeto: tj-sun/loopy
def generate_vectorize_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Exemplo n.º 8
0
    def process_set(s):
        var_dict = s.get_var_dict()

        if split_iname not in var_dict:
            return s

        orig_dim_type, _ = var_dict[split_iname]

        outer_var_nr = s.dim(orig_dim_type)
        inner_var_nr = s.dim(orig_dim_type) + 1

        s = s.add_dims(orig_dim_type, 2)
        s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
        s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)

        from loopy.isl_helpers import make_slab

        if fixed_length_is_inner:
            fixed_iname, var_length_iname = inner_iname, outer_iname
        else:
            fixed_iname, var_length_iname = outer_iname, inner_iname

        space = s.get_space()
        fixed_constraint_set = (
            make_slab(space, fixed_iname, 0, fixed_length)
            # name = fixed_iname + fixed_length*var_length_iname
            .add_constraint(
                isl.Constraint.eq_from_names(
                    space, {
                        split_iname: 1,
                        fixed_iname: -1,
                        var_length_iname: -fixed_length
                    })))

        name_dim_type, name_idx = space.get_var_dict()[split_iname]
        s = s.intersect(fixed_constraint_set)

        if within is None:
            s = s.project_out(name_dim_type, name_idx, 1)

        return s
Exemplo n.º 9
0
    def add_diff_inames(self):
        diff_inames = tuple(
            self.rule_mapping_context.make_unique_var_name(
                self.diff_iname_prefix + str(i))
            for i in range(len(self.additional_shape)))

        diff_parameters = set()
        from loopy.symbolic import get_dependencies
        for s in self.additional_shape:
            diff_parameters.update(get_dependencies(s))

        diff_domain = isl.BasicSet(
            "[%s] -> {[%s]}" %
            (", ".join(diff_parameters), ", ".join(diff_inames)))

        for i, diff_iname in enumerate(diff_inames):
            diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, self.additional_shape[i])

        self.new_domains.append(diff_domain)

        return diff_inames
Exemplo n.º 10
0
    def add_diff_inames(self):
        diff_inames = tuple(
            self.rule_mapping_context.make_unique_var_name(
                self.diff_iname_prefix+str(i))
            for i in range(len(self.additional_shape)))

        diff_parameters = set()
        from loopy.symbolic import get_dependencies
        for s in self.additional_shape:
            diff_parameters.update(get_dependencies(s))

        diff_domain = isl.BasicSet(
                "[%s] -> {[%s]}"
                % (", ".join(diff_parameters), ", ".join(diff_inames)))

        for i, diff_iname in enumerate(diff_inames):
            diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, self.additional_shape[i])

        self.new_domains.append(diff_domain)

        return diff_inames
Exemplo n.º 11
0
    def map_subscript(self, expr):
        WalkMapper.map_subscript(self, expr)

        from pymbolic.primitives import Variable
        assert isinstance(expr.aggregate, Variable)

        shape = None
        var_name = expr.aggregate.name
        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            shape = arg.shape
        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            shape = tv.shape

        if shape is not None:
            subscript = expr.index

            if not isinstance(subscript, tuple):
                subscript = (subscript,)

            from loopy.symbolic import (get_dependencies, get_access_range,
                    UnableToDetermineAccessRange)

            available_vars = set(self.domain.get_var_dict())
            shape_deps = set()
            for shape_axis in shape:
                if shape_axis is not None:
                    shape_deps.update(get_dependencies(shape_axis))

            if not (get_dependencies(subscript) <= available_vars
                    and shape_deps <= available_vars):
                return

            if len(subscript) != len(shape):
                raise LoopyError("subscript to '%s' in '%s' has the wrong "
                        "number of indices (got: %d, expected: %d)" % (
                            expr.aggregate.name, expr,
                            len(subscript), len(shape)))

            try:
                access_range = get_access_range(self.domain, subscript,
                        self.kernel.assumptions)
            except UnableToDetermineAccessRange:
                # Likely: index was non-affine, nothing we can do.
                return

            shape_domain = isl.BasicSet.universe(access_range.get_space())
            for idim in range(len(subscript)):
                shape_axis = shape[idim]

                if shape_axis is not None:
                    from loopy.isl_helpers import make_slab
                    slab = make_slab(
                            shape_domain.get_space(), (dim_type.in_, idim),
                            0, shape_axis)

                    shape_domain = shape_domain.intersect(slab)

            if not access_range.is_subset(shape_domain):
                raise LoopyError("'%s' in instruction '%s' "
                        "accesses out-of-bounds array element (could not"
                        " establish '%s' is a subset of '%s')."
                        % (expr, self.insn_id, access_range, shape_domain))
Exemplo n.º 12
0
def set_up_hw_parallel_loops(codegen_state,
                             schedule_index,
                             next_func,
                             hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
                                                   schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            iname for iname in all_inames_by_insns
            if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
            and not kernel.iname_tags_of_type(iname, VectorizeTag)
        ]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname and any(
                _tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname) if _tag))
    ]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
                                       constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname, lower_bound,
                     lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                codegen_state.ast_builder.emit_comment("%s slab for '%s'" %
                                                       (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state.copy_and_assign(
            iname, hw_axis_expr).copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(new_codegen_state, schedule_index,
                                         next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)
Exemplo n.º 13
0
    def end_computation_stack(self, evaluate=(), transform=False):
        """
        Returns an instance :class:`loopy.LoopKernel` corresponding to the
        computations pushed in the computation stack.

        :arg variables_needed: An instance of :class:`tuple` of the variables
            that must be computed

        :return: An instance of :class:`loopy.LoopKerneel` for the computations
            registered on the stack. If ``transform=True`` the transformation
            data is also returned through the tuple ``knl, tf_data``. The
            transformation data ``tf_data`` is a mapping from names of the
            variables which are to be evaluated to the tuple of inames which
            are involved in their respective assignments.
        """
        statements = []
        tf_data = {}
        domains = self.domains[:]
        data = self.data[:]
        substitutions = {}
        substitutions_needed = [
            array_sym.name for array_sym in evaluate
            if array_sym.name not in self.substs_to_arrays
        ]

        substs_to_arrays = self.substs_to_arrays.copy()

        for i, rule in enumerate(self.registered_substitutions):
            substs_to_arg_mapper = SubstToArrayExapander(
                substs_to_arrays.copy())
            statements.extend([
                insn.with_transformed_expressions(substs_to_arg_mapper)
                for insn in self.implicit_assignments.pop(i, [])
            ])
            if rule.name in substitutions_needed:
                rule = rule.copy(
                    expression=substs_to_arg_mapper(rule.expression))
                arg_name = self.name_generator(based_on="arr")
                arg = evaluate[substitutions_needed.index(rule.name)]
                data.append(arg.copy(name=arg_name))
                substs_to_arrays[arg.name] = arg_name

                if arg.shape != (1, ) and arg.shape != (1):
                    inames = tuple(
                        self.name_generator(based_on='i') for _ in arg.shape)
                    space = isl.Space.create_from_names(
                        isl.DEFAULT_CONTEXT, inames)
                    domain = isl.BasicSet.universe(space)

                    for iname_name, axis_length in zip(inames, arg.shape):
                        domain &= make_slab(space, iname_name, 0, axis_length)

                    assignee = substs_to_arg_mapper(
                        parse('{}[{}]'.format(arg_name, ', '.join(inames))))
                    stmnt = lp.Assignment(assignee=assignee,
                                          expression=parse('{}({})'.format(
                                              arg.name, ', '.join(inames))))
                    domains.append(domain)
                    tf_data[arg.name] = inames
                else:
                    assignee = parse('{}[0]'.format(arg_name))
                    stmnt = lp.Assignment(assignee=assignee,
                                          expression=parse('{}()'.format(
                                              arg.name)))
                    tf_data[arg.name] = ()
                statements.append(
                    stmnt.with_transformed_expressions(substs_to_arg_mapper))

            substitutions[rule.name] = rule.copy(
                expression=substs_to_arg_mapper(rule.expression))

        substs_to_arg_mapper = SubstToArrayExapander(substs_to_arrays.copy())

        statements.extend([
            insn.with_transformed_expressions(substs_to_arg_mapper)
            for insn in self.implicit_assignments.pop(i + 1, [])
        ])

        knl = lp.make_kernel(domains=domains,
                             instructions=statements,
                             kernel_data=data,
                             seq_dependencies=True,
                             lang_version=(2018, 2))
        knl = knl.copy(substitutions=substitutions)
        if transform:
            return knl, tf_data
        else:
            return knl
Exemplo n.º 14
0
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [iname
                for iname in kernel.all_inames()
                if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
            and kernel.iname_to_tag.get(other_iname).key == tag.key
            and other_iname != iname]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    from loopy.codegen import add_comment

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, iname)
        if len(slabs) == 1:
            cmt = None

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, hw_inames_left)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)
Exemplo n.º 15
0
def pack_and_unpack_args_for_call_for_single_kernel(kernel,
                                                    callables_table,
                                                    call_name,
                                                    args_to_pack=None,
                                                    args_to_unpack=None):
    """
    Returns a a copy of *kernel* with instructions appended to copy the
    arguments in *args* to match the alignment expected by the *call_name* in
    the kernel. The arguments are copied back to *args* with the appropriate
    data layout.

    :arg call_name: An instance of :class:`str` denoting the function call in
        the *kernel*.
    :arg args_to_unpack: A list of the arguments as instances of :class:`str` which
        must be packed. If set *None*, it is interpreted that all the array
        arguments would be packed.
    :arg args_to_unpack: A list of the arguments as instances of :class:`str`
        which must be unpacked. If set *None*, it is interpreted that
        all the array arguments should be unpacked.
    """
    assert isinstance(kernel, LoopKernel)
    new_domains = []
    new_tmps = kernel.temporary_variables.copy()
    old_insn_to_new_insns = {}

    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            # pack and unpack call only be done for CallInstructions.
            continue
        if insn.expression.function.name not in callables_table:
            continue

        in_knl_callable = callables_table[insn.expression.function.name]

        if in_knl_callable.name != call_name:
            # not the function we're looking for.
            continue
        in_knl_callable = in_knl_callable.with_packing_for_args()

        vng = kernel.get_var_name_generator()
        ing = kernel.get_instruction_id_generator()

        parameters = insn.expression.parameters
        if args_to_pack is None:
            args_to_pack = [
                par.subscript.aggregate.name
                for par in parameters + insn.assignees
                if isinstance(par, SubArrayRef) and (par.swept_inames)
            ]
        if args_to_unpack is None:
            args_to_unpack = [
                par.subscript.aggregate.name
                for par in parameters + insn.assignees
                if isinstance(par, SubArrayRef) and (par.swept_inames)
            ]

        # {{{ sanity checks for args

        assert isinstance(args_to_pack, list)
        assert isinstance(args_to_unpack, list)

        for arg in args_to_pack:
            found_sub_array_ref = False

            for par in parameters + insn.assignees:
                # checking that the given args is a sub array ref
                if isinstance(par,
                              SubArrayRef) and (par.subscript.aggregate.name
                                                == arg):
                    found_sub_array_ref = True
                    break
            if not found_sub_array_ref:
                raise LoopyError(
                    "No match found for packing arg '%s' of call '%s' "
                    "at insn '%s'." % (arg, call_name, insn.id))
        for arg in args_to_unpack:
            if arg not in args_to_pack:
                raise LoopyError("Argument %s should be packed in order to be "
                                 "unpacked." % arg)

        # }}}

        packing_insns = []
        unpacking_insns = []

        # {{{ handling ilp tags

        from loopy.kernel.data import IlpBaseTag, VectorizeTag
        import islpy as isl
        from pymbolic import var

        dim_type = isl.dim_type.set
        ilp_inames = {
            iname
            for iname in insn.within_inames if all(
                isinstance(tag, (IlpBaseTag, VectorizeTag))
                for tag in kernel.iname_to_tags.get(iname, []))
        }
        new_ilp_inames = set()
        ilp_inames_map = {}
        for iname in ilp_inames:
            new_iname_name = vng(iname + "_ilp")
            ilp_inames_map[var(iname)] = var(new_iname_name)
            new_ilp_inames.add(new_iname_name)
        for iname in ilp_inames:
            new_domain = kernel.get_inames_domain(iname).copy()
            for i in range(new_domain.n_dim()):
                old_iname = new_domain.get_dim_name(dim_type, i)
                if old_iname in ilp_inames:
                    new_domain = new_domain.set_dim_name(
                        dim_type, i, ilp_inames_map[var(old_iname)].name)
            new_domains.append(new_domain)

        # }}}

        from pymbolic.mapper.substitutor import make_subst_func
        from loopy.symbolic import SubstitutionMapper

        # dict to store the new assignees and parameters, the mapping pattern
        # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype
        id_to_parameters = tuple(enumerate(parameters)) + tuple(
            (-i - 1, assignee) for i, assignee in enumerate(insn.assignees))
        new_id_to_parameters = {}

        for arg_id, p in id_to_parameters:
            if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name
                                               in args_to_pack):
                new_pack_inames = ilp_inames_map.copy(
                )  # packing-specific inames
                new_unpack_inames = ilp_inames_map.copy(
                )  # unpacking-specific iname

                new_pack_inames = {
                    iname: var(vng(iname.name + "_pack"))
                    for iname in p.swept_inames
                }
                new_unpack_inames = {
                    iname: var(vng(iname.name + "_unpack"))
                    for iname in p.swept_inames
                }

                # Updating the domains corresponding to the new inames.
                for iname in p.swept_inames:
                    new_domain_pack = kernel.get_inames_domain(
                        iname.name).copy()
                    new_domain_unpack = kernel.get_inames_domain(
                        iname.name).copy()
                    for i in range(new_domain_pack.n_dim()):
                        old_iname = new_domain_pack.get_dim_name(dim_type, i)
                        if var(old_iname) in new_pack_inames:
                            new_domain_pack = new_domain_pack.set_dim_name(
                                dim_type, i,
                                new_pack_inames[var(old_iname)].name)
                            new_domain_unpack = new_domain_unpack.set_dim_name(
                                dim_type, i,
                                new_unpack_inames[var(old_iname)].name)
                    new_domains.append(new_domain_pack)
                    new_domains.append(new_domain_unpack)

                arg = p.subscript.aggregate.name
                pack_name = vng(arg + "_pack")

                from loopy.kernel.data import (TemporaryVariable,
                                               temp_var_scope)

                if arg in kernel.arg_dict:
                    arg_in_caller = kernel.arg_dict[arg]
                else:
                    arg_in_caller = kernel.temporary_variables[arg]

                pack_tmp = TemporaryVariable(
                    name=pack_name,
                    dtype=arg_in_caller.dtype,
                    dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags,
                    shape=in_knl_callable.arg_id_to_descr[arg_id].shape,
                    scope=temp_var_scope.PRIVATE,
                )

                new_tmps[pack_name] = pack_tmp

                from loopy import Assignment
                pack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_pack_inames))
                unpack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_unpack_inames))

                # {{{ getting the lhs for packing and rhs for unpacking

                from loopy.isl_helpers import simplify_via_aff, make_slab

                flatten_index = simplify_via_aff(
                    sum(dim_tag.stride * idx for dim_tag, idx in zip(
                        arg_in_caller.dim_tags, p.subscript.index_tuple)))

                new_indices = []
                for dim_tag in in_knl_callable.arg_id_to_descr[
                        arg_id].dim_tags:
                    ind = flatten_index // dim_tag.stride
                    flatten_index -= (dim_tag.stride * ind)
                    new_indices.append(ind)

                new_indices = tuple(simplify_via_aff(i) for i in new_indices)

                pack_lhs_assignee = pack_subst_mapper(
                    var(pack_name).index(new_indices))
                unpack_rhs = unpack_subst_mapper(
                    var(pack_name).index(new_indices))

                # }}}

                packing_insns.append(
                    Assignment(
                        assignee=pack_lhs_assignee,
                        expression=pack_subst_mapper.map_subscript(
                            p.subscript),
                        within_inames=insn.within_inames - ilp_inames
                        | {new_pack_inames[i].name
                           for i in p.swept_inames} | (new_ilp_inames),
                        depends_on=insn.depends_on,
                        id=ing(insn.id + "_pack"),
                        depends_on_is_final=True))

                if p.subscript.aggregate.name in args_to_unpack:
                    unpacking_insns.append(
                        Assignment(
                            expression=unpack_rhs,
                            assignee=unpack_subst_mapper.map_subscript(
                                p.subscript),
                            within_inames=insn.within_inames - ilp_inames | {
                                new_unpack_inames[i].name
                                for i in p.swept_inames
                            } | (new_ilp_inames),
                            id=ing(insn.id + "_unpack"),
                            depends_on=frozenset([insn.id]),
                            depends_on_is_final=True))

                # {{{ creating the sweep inames for the new sub array refs

                updated_swept_inames = []

                for _ in in_knl_callable.arg_id_to_descr[arg_id].shape:
                    updated_swept_inames.append(var(vng("i_packsweep_" + arg)))

                ctx = kernel.isl_context
                space = isl.Space.create_from_names(
                    ctx, set=[iname.name for iname in updated_swept_inames])
                iname_set = isl.BasicSet.universe(space)
                for iname, axis_length in zip(
                        updated_swept_inames,
                        in_knl_callable.arg_id_to_descr[arg_id].shape):
                    iname_set = iname_set & make_slab(space, iname.name, 0,
                                                      axis_length)
                new_domains = new_domains + [iname_set]

                # }}}

                new_id_to_parameters[arg_id] = SubArrayRef(
                    tuple(updated_swept_inames),
                    (var(pack_name).index(tuple(updated_swept_inames))))
            else:
                new_id_to_parameters[arg_id] = p

        if packing_insns:
            subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map))
            new_call_insn = insn.with_transformed_expressions(subst_mapper)
            new_params = tuple(
                subst_mapper(new_id_to_parameters[i])
                for i, _ in enumerate(parameters))
            new_assignees = tuple(
                subst_mapper(new_id_to_parameters[-i - 1])
                for i, _ in enumerate(insn.assignees))
            new_call_insn = new_call_insn.copy(
                depends_on=new_call_insn.depends_on
                | {pack.id
                   for pack in packing_insns},
                within_inames=new_call_insn.within_inames - ilp_inames |
                (new_ilp_inames),
                expression=new_call_insn.expression.function(*new_params),
                assignees=new_assignees)
            old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] +
                                              unpacking_insns)

    if old_insn_to_new_insns:
        new_instructions = []
        for insn in kernel.instructions:
            if insn.id in old_insn_to_new_insns:
                # Replacing the current instruction with the group of
                # instructions including the packing and unpacking instructions
                new_instructions.extend(old_insn_to_new_insns[insn.id])
            else:
                # for the instructions that depend on the call instruction that
                # are to be packed and unpacked, we need to add the complete
                # instruction block as a dependency for them.
                new_depends_on = insn.depends_on
                if insn.depends_on & set(old_insn_to_new_insns):
                    # need to add the unpack instructions on dependencies.
                    for old_insn_id in insn.depends_on & set(
                            old_insn_to_new_insns):
                        new_depends_on |= frozenset(
                            i.id for i in old_insn_to_new_insns[old_insn_id])
                new_instructions.append(insn.copy(depends_on=new_depends_on))
        kernel = kernel.copy(domains=kernel.domains + new_domains,
                             instructions=new_instructions,
                             temporary_variables=new_tmps)

    return kernel
Exemplo n.º 16
0
    def map_subscript(self, expr):
        WalkMapper.map_subscript(self, expr)

        from pymbolic.primitives import Variable
        assert isinstance(expr.aggregate, Variable)

        shape = None
        var_name = expr.aggregate.name
        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            shape = arg.shape
        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            shape = tv.shape

        if shape is not None:
            subscript = expr.index

            if not isinstance(subscript, tuple):
                subscript = (subscript,)

            from loopy.symbolic import get_dependencies, get_access_range

            available_vars = set(self.domain.get_var_dict())
            shape_deps = set()
            for shape_axis in shape:
                if shape_axis is not None:
                    shape_deps.update(get_dependencies(shape_axis))

            if not (get_dependencies(subscript) <= available_vars
                    and shape_deps <= available_vars):
                return

            if len(subscript) != len(shape):
                raise LoopyError("subscript to '%s' in '%s' has the wrong "
                        "number of indices (got: %d, expected: %d)" % (
                            expr.aggregate.name, expr,
                            len(subscript), len(shape)))

            try:
                access_range = get_access_range(self.domain, subscript,
                        self.kernel.assumptions)
            except isl.Error:
                # Likely: index was non-linear, nothing we can do.
                return
            except TypeError:
                # Likely: index was non-linear, nothing we can do.
                return

            shape_domain = isl.BasicSet.universe(access_range.get_space())
            for idim in range(len(subscript)):
                shape_axis = shape[idim]

                if shape_axis is not None:
                    from loopy.isl_helpers import make_slab
                    slab = make_slab(
                            shape_domain.get_space(), (dim_type.in_, idim),
                            0, shape_axis)

                    shape_domain = shape_domain.intersect(slab)

            if not access_range.is_subset(shape_domain):
                raise LoopyError("'%s' in instruction '%s' "
                        "accesses out-of-bounds array element"
                        % (expr, self.insn_id))
Exemplo n.º 17
0
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [iname for iname in all_inames_by_insns
                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname
            and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                    codegen_state.ast_builder.emit_comment(
                        "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
                .copy_and_assign(iname, hw_axis_expr)
                .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
                new_codegen_state, schedule_index, next_func,
                hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)
Exemplo n.º 18
0
    def __setitem__(self, index, value):
        """
        Registers an assignment in order to make the indices represented by
        ``index`` to ``value``.

        :arg index: An instance of :class:`int`, or :class:`slice` or
            :class:`numloopy.ArraySymbol`.

        :arg value: An instance of :class:`numloopy.ArraySymbol`, with the same
            shape as represented by ``index``.
        """
        if isinstance(index, (Number, slice, ArraySymbol)):
            index = (index, )
        assert isinstance(index, tuple)

        # current heuristic: assumes that the dereferenced guys are
        # always arguments and not temporary variables, maybe  we need to fix
        # this later?
        try:
            arg_name = self.stack.substs_to_arrays[self.name]
        except KeyError:
            inames = tuple(
                self.stack.name_generator(based_on="i") for _ in self.shape)
            arg_name = self.stack.name_generator(based_on="arr")
            insn = lp.Assignment(
                assignee=parse('{}[{}]'.format(arg_name, ', '.join(inames))),
                expression=parse('{}({})'.format(self.name,
                                                 ', '.join(inames))))
            self.stack.register_implicit_assignment(insn)

            space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, inames)
            domain = isl.BasicSet.universe(space)

            for iname_name, axis_length in zip(inames, self.shape):
                domain &= make_slab(space, iname_name, 0, axis_length)

            self.stack.domains.append(domain)

        # now handling the second assignment

        try:
            inames, iname_lens = zip(*tuple(
                (self.stack.name_generator(based_on="i"), axis_len)
                for idx, axis_len in zip(index, self.shape)
                if isinstance(idx, slice) or isinstance(idx, ArraySymbol)))
            space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, inames)
            domain = isl.BasicSet.universe(space)

            for iname_name, axis_length in zip(inames, iname_lens):
                domain &= make_slab(space, iname_name, 0, axis_length)

            self.stack.domains.append(domain)
        except ValueError:
            inames = ()
            iname_lens = ()

        indices = []
        _k = 0
        for idx in index:
            if isinstance(idx, slice):
                indices.append(Variable(inames[_k]))
                _k += 1
            elif isinstance(idx, ArraySymbol):
                indices.append(Variable(idx.name)(Variable(inames[_k])))
                _k += 1
            else:
                indices.append(idx)
        assert _k == len(inames)
        indices = tuple(indices)

        if isinstance(value, ArraySymbol):
            insn = lp.Assignment(assignee=Subscript(Variable(arg_name),
                                                    indices),
                                 expression='{}({})'.format(
                                     value.name,
                                     ', '.join(str(iname)
                                               for iname in inames)))
        elif isinstance(value, Number):
            insn = lp.Assignment(assignee=Subscript(Variable(arg_name),
                                                    indices),
                                 expression=value)
        else:
            raise TypeError("arrays can be only assigned with number or other "
                            "arrays")
        self.stack.register_implicit_assignment(insn)
        if self.name not in self.stack.substs_to_arrays:
            subst_name = self.stack.name_generator(based_on="subst")
            inames = tuple(
                self.stack.name_generator(based_on='i') for _ in self.shape)
            rule = lp.SubstitutionRule(
                subst_name,
                inames,
                expression=Subscript(
                    Variable(arg_name),
                    tuple(Variable(iname) for iname in inames)))
            self.stack.register_substitution(rule)
            self.stack.data.append(self.copy(name=arg_name))

            self.stack.substs_to_arrays[subst_name] = arg_name
            self.name = subst_name
Exemplo n.º 19
0
 def _make_slab(self, space, iname, start, stop):
     from loopy.isl_helpers import make_slab
     return make_slab(space, iname, start, stop)