示例#1
0
文件: tools.py 项目: damian-666/loopy
    def base_index_and_length(self, set, iname, context=None):
        """Return ``(base_index, size)`` expressions for one dimension of *set*.

        :arg set: the set whose dimension is examined.
        :arg iname: either an integer dimension index (used as-is) or a
            name that is resolved through ``set.space.get_var_dict()``.
        :arg context: forwarded unchanged to the static-bound helpers.
        :returns: a tuple ``(base_index, size)``, each produced by
            :func:`loopy.symbolic.pw_aff_to_expr`.
        """
        dim_index = (iname if isinstance(iname, int)
                     else set.space.get_var_dict()[iname][1])

        lbound = self.dim_min(set, dim_index)
        ubound = self.dim_max(set, dim_index)

        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (static_max_of_pw_aff,
                                       static_min_of_pw_aff,
                                       static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        # Every static-bound query below uses the same keyword arguments.
        static_kwargs = {"constants_only": False, "context": context}

        # {{{ first choice: a static value for the lower bound

        try:
            base_aff = static_value_of_pw_aff(lbound, **static_kwargs)
        except StaticValueFindingError:
            base_aff = None

        # }}}

        # {{{ fallback: a static minimum of the lower bound

        if base_aff is None:
            base_aff = static_min_of_pw_aff(lbound, **static_kwargs)

        # }}}

        base_index = pw_aff_to_expr(base_aff)
        size = pw_aff_to_expr(
            static_max_of_pw_aff(ubound - base_aff + 1, **static_kwargs))

        return base_index, size
示例#2
0
文件: tools.py 项目: cmsquared/loopy
    def base_index_and_length(self, set, iname, context=None):
        """Compute the base index and length of one dimension of *set*.

        :arg set: the set whose dimension is examined.
        :arg iname: an integer dimension index, or a name looked up in
            ``set.space.get_var_dict()``.
        :arg context: passed through to the static-bound helpers.
        :returns: ``(base_index, size)`` as expressions produced by
            :func:`loopy.symbolic.pw_aff_to_expr`.
        """
        if isinstance(iname, int):
            idx = iname
        else:
            idx = set.space.get_var_dict()[iname][1]

        lower = self.dim_min(set, idx)
        upper = self.dim_max(set, idx)

        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (
                static_max_of_pw_aff,
                static_min_of_pw_aff,
                static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        def finish(base_aff):
            # Shared tail of both strategies: convert the base and derive
            # the size relative to it.
            base_expr = pw_aff_to_expr(base_aff)
            size_expr = pw_aff_to_expr(static_max_of_pw_aff(
                    upper - base_aff + 1, constants_only=False,
                    context=context))
            return base_expr, size_expr

        # {{{ strategy 1: static value of the lower bound

        try:
            exact_base_aff = static_value_of_pw_aff(
                    lower, constants_only=False,
                    context=context)
        except StaticValueFindingError:
            exact_base_aff = None

        if exact_base_aff is not None:
            return finish(exact_base_aff)

        # }}}

        # {{{ strategy 2: static minimum of the lower bound

        return finish(static_min_of_pw_aff(
                lower, constants_only=False,
                context=context))

        # }}}
示例#3
0
文件: loop.py 项目: navjotk/loopy
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for the loop at *sched_index* by unrolling it.

    One copy of the inner loop nest is generated per iteration, each with
    the loop's iname fixed to ``lower_bound + i``.

    :arg kernel: the kernel being generated; its schedule entry at
        *sched_index* supplies the iname.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :arg codegen_state: state whose ``fix`` method pins the iname.
    :returns: the result of ``gen_code_block`` over all unrolled copies.
    :raises LoopyError: if the static maximum of the loop length is not
        a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message has a '%s' placeholder; fill it with the iname.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Keep the exception type for callers, but do not lose the
        # original message when adding context.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = [
            build_loop_nest(
                kernel, sched_index+1,
                codegen_state.fix(iname, lower_bound_aff + i))
            for i in range(length)]

    return gen_code_block(result)
示例#4
0
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* by unrolling it.

    One copy of the inner loop nest is generated per iteration, each with
    the loop's iname fixed to ``lower_bound + i``.

    :arg codegen_state: current code generation state; its ``kernel``
        supplies the schedule and iname bounds.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :returns: the result of ``merge_codegen_results`` over all copies.
    :raises LoopyError: if the static maximum of the loop length is not
        a constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (static_max_of_pw_aff,
                                   static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message has a '%s' placeholder; fill it with the iname.
        raise LoopyError("length of unrolled loop '%s' is not a constant, "
                         "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
            bounds.lower_bound_pw_aff.coalesce(), constants_only=False)
    except Exception as e:
        # Keep the exception type for callers, but do not lose the
        # original message when adding context.
        raise type(e)("while finding lower bound of '%s': %s"
                      % (iname, str(e)))

    result = [
        build_loop_nest(
            codegen_state.fix(iname, lower_bound_aff + i), sched_index + 1)
        for i in range(length)]

    return merge_codegen_results(codegen_state, result)
示例#5
0
 def _get_int_iname_size(iname):
     """Return the static maximum size of *iname*, asserted to be an integer.

     Reads ``kernel`` from the enclosing scope.
     """
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
     iname_size_pw_aff = kernel.get_iname_bounds(iname).size
     max_size_aff = static_max_of_pw_aff(iname_size_pw_aff,
                                         constants_only=True)
     size = pw_aff_to_expr(max_size_aff)
     assert isinstance(size, six.integer_types)
     return size
示例#6
0
 def _get_int_iname_size(iname):
     """Look up the bounds of *iname* on the enclosing ``kernel`` and return
     the static maximum of their size, which must convert to an integer.
     """
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
     bounds = kernel.get_iname_bounds(iname)
     result = pw_aff_to_expr(
             static_max_of_pw_aff(bounds.size, constants_only=True))
     assert isinstance(result, six.integer_types)
     return result
示例#7
0
def get_arg_descriptor_for_expression(kernel, expr):
    """
    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
        describing the argument expression *expr* which occurs
        in a call in the code of *kernel*.
    :raises LoopyError: if *expr* is a :class:`SubArrayRef` whose underlying
        variable is neither a :class:`TemporaryVariable` nor an
        :class:`ArrayArg`.
    """
    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
                                SweptInameStrideCollector)
    from loopy.kernel.data import TemporaryVariable, ArrayArg

    if not isinstance(expr, SubArrayRef):
        # Not a sub-array reference: run the scalar check and describe
        # the argument as a plain value.
        ExpressionIsScalarChecker(kernel)(expr)
        return ValueArgDescriptor()

    name = expr.subscript.aggregate.name
    arg = kernel.get_var_descriptor(name)

    if not isinstance(arg, (TemporaryVariable, ArrayArg)):
        # Report the variable name that was just looked up, rather than
        # reading a 'name' attribute off the sub-array reference itself.
        raise LoopyError("unsupported argument type "
                         "'%s' of '%s' in call statement" %
                         (type(arg).__name__, name))

    aspace = arg.address_space

    from loopy.kernel.array import FixedStrideArrayDimTag as DimTag

    # This helps in identifying identities like
    # "2*(i//2) + i%2" := "i"
    # See the kernel in
    # test_callables.py::test_shape_translation_through_sub_array_refs

    from loopy.symbolic import simplify_using_aff
    linearized_index = simplify_using_aff(
        kernel,
        sum(dim_tag.stride * iname for dim_tag, iname in zip(
            arg.dim_tags, expr.subscript.index_tuple)))

    strides_as_dict = SweptInameStrideCollector(
        tuple(iname.name for iname in expr.swept_inames))(linearized_index)
    sub_dim_tags = tuple(
        # Not all swept inames necessarily occur in the expression.
        DimTag(strides_as_dict.get(iname, 0))
        for iname in expr.swept_inames)

    def _extent(iname_name):
        # Query the bounds once per iname and reuse them for both ends.
        bounds = kernel.get_iname_bounds(iname_name)
        return pw_aff_to_expr(
            bounds.upper_bound_pw_aff - bounds.lower_bound_pw_aff) + 1

    sub_shape = tuple(_extent(iname.name) for iname in expr.swept_inames)

    return ArrayArgDescriptor(address_space=aspace,
                              dim_tags=sub_dim_tags,
                              shape=sub_shape)
示例#8
0
文件: loop.py 项目: inducer/loopy
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop`, after issuing a warning,
    when the static maximum of the loop length is not a constant or when
    the static lower bound is not plainly zero.

    :arg codegen_state: current code generation state; its ``kernel``
        supplies the schedule, bounds and domain.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :returns: the result of ``build_loop_nest`` for the next schedule
        index with vectorization info attached, or the unrolled fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message has a '%s' placeholder; fill it with the iname.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Keep the exception type for callers, but do not lose the
        # original message when adding context.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        # The message has a '%s' placeholder; fill it with the iname.
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
示例#9
0
文件: loop.py 项目: tj-sun/loopy
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop`, after issuing a warning,
    when the static maximum of the loop length is not a constant or when
    the static lower bound is not plainly zero.

    :arg codegen_state: current code generation state; its ``kernel``
        supplies the schedule, bounds and domain.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :returns: the result of ``build_loop_nest`` for the next schedule
        index with vectorization info attached, or the unrolled fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message has a '%s' placeholder; fill it with the iname.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Keep the exception type for callers, but do not lose the
        # original message when adding context.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        # The message has a '%s' placeholder; fill it with the iname.
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
示例#10
0
文件: tools.py 项目: rckirby/loopy
    def base_index_and_length(self, set, iname, context=None):
        """Return ``(base_index, size)`` expressions for one dimension of *set*.

        :arg set: the set whose dimension is examined.
        :arg iname: an integer dimension index, or a name resolved through
            ``set.space.get_var_dict()``.
        :arg context: passed through to the static-bound helpers.
        :raises: any exception raised while determining the lower bound is
            re-raised as the same type with the iname prepended to its
            message.
        """
        idx = (iname if isinstance(iname, int)
                else set.space.get_var_dict()[iname][1])

        lower = self.dim_min(set, idx)
        upper = self.dim_max(set, idx)

        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        # The size is computed before the base index.
        extent_aff = static_max_of_pw_aff(
                upper - lower + 1, constants_only=False,
                context=context)
        size = pw_aff_to_expr(extent_aff)

        try:
            base_aff = static_value_of_pw_aff(
                    lower, constants_only=False, context=context)
            base_index = pw_aff_to_expr(base_aff)
        except Exception as e:
            raise type(e)("while finding lower bound of '%s': %s" % (iname, str(e)))

        return base_index, size
示例#11
0
    def make_new_loop_index(inner, outer):
        """Build the index expression combining *inner* and *outer*.

        Uses ``chunk_floor``, ``chunk_ceil``, ``chunk_mod`` (and, in the
        disabled branch, ``chunk_diff``) from the enclosing scope, each
        converted with ``pw_aff_to_expr``.

        :returns: ``inner + ceil * Min((outer, mod))
            + floor * (outer - Min((outer, mod)))``.
        """
        # These two expressions are equivalent. Benchmarking between the
        # two was inconclusive, although one is shorter.

        # The first form is deliberately disabled; it is kept for reference.
        if 0:
            # Triggers isl issues in check pass.
            return (inner + pw_aff_to_expr(chunk_floor) * outer +
                    pw_aff_to_expr(chunk_diff) * Min(
                        (outer, pw_aff_to_expr(chunk_mod))))
        else:
            return (inner + pw_aff_to_expr(chunk_ceil) * Min(
                (outer, pw_aff_to_expr(chunk_mod))) +
                    pw_aff_to_expr(chunk_floor) * (outer - Min(
                        (outer, pw_aff_to_expr(chunk_mod)))))
示例#12
0
文件: __init__.py 项目: rckirby/loopy
    def fix(self, iname, aff):
        """Return a copy of this state in which *iname* is pinned to *aff*.

        If *iname* is not yet a variable of the implemented domain's space,
        a new set dimension carrying that name is appended first. An
        equality constraint relating *iname* to *aff* is then added to the
        implemented domain, and *iname* is assigned the expression form of
        *aff*.
        """
        domain = self.implemented_domain
        space = self.implemented_domain.get_space()

        if iname not in space.get_var_dict():
            set_dt = isl.dim_type.set
            # Index of the dimension about to be appended.
            new_dim_pos = domain.dim(set_dt)
            domain = domain.add_dims(set_dt, 1).set_dim_name(
                set_dt, new_dim_pos, iname)
            space = domain.get_space()

        from loopy.isl_helpers import iname_rel_aff

        equality_aff = iname_rel_aff(space, iname, "==", aff)

        from loopy.symbolic import pw_aff_to_expr

        constraint = isl.Constraint.equality_from_aff(equality_aff)
        value_expr = pw_aff_to_expr(aff)

        constrained_domain = domain.add_constraint(constraint)
        assigned_state = self.copy_and_assign(iname, value_expr)
        return assigned_state.copy(implemented_domain=constrained_domain)
示例#13
0
    def fix(self, iname, aff):
        """Pin *iname* to the value *aff* and return the resulting state.

        The implemented domain gains an equality constraint between *iname*
        and *aff*; a set dimension named *iname* is appended beforehand when
        the domain's space does not already know that name. The returned
        state also has *iname* assigned the expression form of *aff*.
        """
        impl_domain = self.implemented_domain
        impl_space = self.implemented_domain.get_space()

        needs_new_dim = iname not in impl_space.get_var_dict()
        if needs_new_dim:
            # The current set-dimension count is the index of the new one.
            n_set_dims = impl_domain.dim(isl.dim_type.set)
            widened = impl_domain.add_dims(isl.dim_type.set, 1)
            impl_domain = widened.set_dim_name(
                isl.dim_type.set, n_set_dims, iname)
            impl_space = impl_domain.get_space()

        from loopy.isl_helpers import iname_rel_aff
        rel_aff = iname_rel_aff(impl_space, iname, "==", aff)

        from loopy.symbolic import pw_aff_to_expr
        equality = isl.Constraint.equality_from_aff(rel_aff)
        fixed_expr = pw_aff_to_expr(aff)

        impl_domain = impl_domain.add_constraint(equality)
        return self.copy_and_assign(iname, fixed_expr).copy(
            implemented_domain=impl_domain)
示例#14
0
def duplicate_private_temporaries_for_ilp_and_vec(kernel):
    """Give temporaries written under ILP/vectorize inames one extra axis
    per such iname.

    For every temporary variable, the inames of its writer instructions
    that carry an :class:`IlpBaseTag` or :class:`VectorizeTag` and are not
    among the instruction's write dependency names are collected. The
    temporary's shape is extended by the static length of each of those
    inames, and all instructions are rewritten through
    ``ExtraInameIndexInserter``.

    :returns: a copy of *kernel* with updated temporary variables and
        instructions.
    :raises LoopyError: if two writers of the same temporary require
        different sets of extra inames.
    """
    logger.debug("%s: duplicate temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    # Maps temporary name -> set of inames that need a new axis.
    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]
            ilp_inames = frozenset(iname
                    for iname in kernel.insn_inames(writer_insn)
                    if isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)))

            # Inames already named in the write dependencies need no new axis.
            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            # NOTE: this stops looking at any further writers of this
            # temporary as soon as one writer needs no new inames.
            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id, ", ".join(new_ilp_inames),
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            # The static maximum of the lower bound is required to be zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        # Every axis is tagged "c", except the new axes that belong to
        # vectorize-tagged inames, which are tagged "vec".
        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags)

    # }}}

    from pymbolic import var
    # The inserter is given, per temporary, the tuple of iname variables
    # collected above.
    eiii = ExtraInameIndexInserter(
            dict((var_name, tuple(var(iname) for iname in inames))
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
示例#15
0
def test_pw_aff_to_conditional_expr():
    """Check the string form of a two-piece PwAff converted to an expression."""
    from loopy.symbolic import pw_aff_to_expr
    piecewise = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }")
    rendered = str(pw_aff_to_expr(piecewise))
    assert rendered == "0 if i == 0 else -1 + i"
示例#16
0
文件: creation.py 项目: rckirby/loopy
def guess_arg_shape_if_requested(kernel, default_order):
    """Fill in the shape (and strides) of arguments whose shape is ``lp.auto``.

    For each such :class:`ArrayBase` argument, the access range over all
    expression instructions is gathered with ``AccessRangeMapper`` and the
    shape along each axis is taken from the static maximum of that range's
    ``dim_max`` plus one. Arguments without any subscripted access get the
    shape ``()``.

    :arg default_order: passed to ``make_strides`` when strides are
        ``lp.auto``.
    :returns: a copy of *kernel* with the updated argument list.
    :raises LoopyError: if a :class:`TypeError` occurs while mapping
        access ranges.
    :raises RuntimeError: if no access range was found but undetermined
        subscripts were recorded.
    """
    new_args = []

    import loopy as lp
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    submap = SubstitutionRuleExpander(kernel.substitutions)

    for arg in kernel.args:
        if isinstance(arg, ArrayBase) and arg.shape is lp.auto:
            armap = AccessRangeMapper(kernel, arg.name)

            try:
                # Feed both sides of every expression instruction (with
                # substitution rules expanded) to the access range mapper.
                for insn in kernel.instructions:
                    if isinstance(insn, lp.ExpressionInstruction):
                        armap(submap(insn.assignee), kernel.insn_inames(insn))
                        armap(submap(insn.expression), kernel.insn_inames(insn))
            except TypeError as e:
                # Print the original traceback before raising the
                # user-facing error.
                from traceback import print_exc
                print_exc()

                from loopy.diagnostic import LoopyError
                raise LoopyError(
                        "Failed to (automatically, as requested) find "
                        "shape/strides for argument '%s'. "
                        "Specifying the shape manually should get rid of this. "
                        "The following error occurred: %s"
                        % (arg.name, str(e)))

            if armap.access_range is None:
                if armap.bad_subscripts:
                    raise RuntimeError("cannot determine access range for '%s': "
                            "undetermined index in subscript(s) '%s'"
                            % (arg.name, ", ".join(
                                    str(i) for i in armap.bad_subscripts)))

                # no subscripts found, let's call it a scalar
                shape = ()
            else:
                from loopy.isl_helpers import static_max_of_pw_aff
                from loopy.symbolic import pw_aff_to_expr

                shape = []
                for i in range(armap.access_range.dim(dim_type.set)):
                    try:
                        shape.append(
                                pw_aff_to_expr(static_max_of_pw_aff(
                                    kernel.cache_manager.dim_max(
                                        armap.access_range, i) + 1,
                                    constants_only=False)))
                    except:
                        # Only adds context on stderr; the exception is
                        # re-raised unchanged below.
                        print("While trying to find shape axis %d of "
                                "argument '%s', the following "
                                "exception occurred:" % (i, arg.name),
                                file=sys.stderr)
                        raise

                shape = tuple(shape)

            if arg.shape is lp.auto:
                arg = arg.copy(shape=shape)

            # Strides are only guessed here, i.e. for arguments whose
            # shape was auto; arguments without a 'strides' attribute are
            # left alone.
            try:
                arg.strides
            except AttributeError:
                pass
            else:
                if arg.strides is lp.auto:
                    from loopy.kernel.data import make_strides
                    arg = arg.copy(strides=make_strides(shape, default_order))

        new_args.append(arg)

    return kernel.copy(args=new_args)
示例#17
0
文件: loop.py 项目: navjotk/loopy
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Recursively bind hardware-parallel inames to hardware axis indices.

    Each call removes one iname from *hw_inames_left*, assigns it the
    matching group/local hardware axis index plus its static lower bound,
    intersects the codegen state with the slab built from that bound and
    the axis size, and recurses for every slab of the slab decomposition.
    Once no inames are left, the loop nest itself is built.

    :arg hw_inames_left: inames still to be processed; when *None*, all
        inames of *kernel* tagged with a :class:`HardwareParallelTag`.
    :returns: the result of ``build_loop_nest`` (base case) or of
        ``gen_code_block`` over the per-slab results.
    :raises RuntimeError: on an unexpected tag type, or when a slab
        decomposition with more than one slab is needed for an iname that
        shares its tag with other inames.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [iname
                for iname in kernel.all_inames()
                if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]

    # Base case of the recursion: nothing left to bind.
    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Work on a copy so the caller's list is not modified by pop().
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
            and kernel.iname_to_tag.get(other_iname).key == tag.key
            and other_iname != iname]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    # NOTE(review): 'result' is assigned again further down before its
    # first use, so this assignment looks redundant -- confirm.
    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    # The iname's value is the hardware axis index offset by the lower bound.
    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    from loopy.codegen import add_comment

    for slab_name, slab in slabs:
        # Only label the slabs when there is more than one.
        cmt = "%s slab for '%s'" % (slab_name, iname)
        if len(slabs) == 1:
            cmt = None

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        # Recurse with the remaining hardware inames.
        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, hw_inames_left)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)
示例#18
0
文件: loop.py 项目: inducer/loopy
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop at *sched_index*.

    For every slab of the loop iname's slab decomposition, the lower and
    upper bounds of the iname are computed, the inner loop nest is built
    under the implemented loop bounds, and either a single assignment (when
    the implemented bounds are equal) or a sequential loop is emitted.

    :arg codegen_state: current code generation state; its ``kernel``,
        ``expression_to_code_mapper``, ``ast_builder`` and
        ``implemented_domain`` are used.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :returns: the result of ``merge_codegen_results`` over all slabs.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label the slabs when there is more than one.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        # Also restrict by the kernel's assumptions.
        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        # Looked up after the moves above, which may have shifted indices.
        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True,
            across_dim_types=True
            ).params()

        # Both bounds are gisted against the assumptions and the
        # implemented domain, then coalesced.
        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .gist(impl_domain)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space,
                loop_iname,
                impl_lbound,
                impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        # The inner code is generated against the implemented loop and a
        # kernel intersected with this slab.
        new_codegen_state = (
                codegen_state
                .intersect(impl_loop)
                .copy(kernel=intersect_kernel_with_slab(
                    kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            # Wrap the initializer and the inner code in a block scope.
            result.append(
                    inner.with_new_ast(
                        codegen_state,
                        astb.ast_block_scope_class(
                            inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            # The emitted loop uses the bounds simplified under the
            # kernel's assumptions.
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
示例#19
0
文件: iname.py 项目: navjotk/loopy
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join several inames into one new iname.

    A new set dimension *new_iname* is added to the leaf domain of *inames*
    and constrained to equal the linear combination of the joined inames
    (each weighted by the product of the static lengths of the
    faster-varying inames). Each original iname is then rewritten in terms
    of *new_iname* using integer division, modulo and its lower bound.

    :arg kernel: the kernel to transform.
    :arg inames: the inames to join, fastest varying last.
    :arg new_iname: name for the joined iname. If *None*, a unique name is
        generated from the names being joined.
    :arg tag: if not *None*, *new_iname* is tagged with it.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`. The original inames
        are projected out of the domain only when this is *None*.
    :returns: the transformed kernel.
    :raises LoopyError: if one of *inames* is not at home in the leaf domain
        of the join.
    """
    from loopy.context_matching import parse_stack_match
    from loopy.isl_helpers import (
            iname_rel_aff, static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.kernel.tools import DomainChanger
    from loopy.symbolic import pw_aff_to_expr
    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # {{{ add the joined iname as a new set dimension

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    # }}}

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    # product of the lengths of all inames processed so far
    base_divisor = 1

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise with the same type, but keep the underlying message:
            # without it the error gives no hint about what went wrong.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, e))

        # Recover the original iname from the joined one. The slowest-varying
        # iname (processed last) needs no modulo.
        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    if within is None:
        for iname in inames:
            # Indices shift after every projection, so look them up afresh.
            iname_dt, iname_idx = new_domain.get_space().get_var_dict()[iname]
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        # Replace every joined iname by the new iname.
        return frozenset(
                new_iname if dep_iname in inames else dep_iname
                for dep_iname in fid)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
示例#20
0
文件: loop.py 项目: inducer/loopy
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    """Bind the hardware-parallel inames of a schedule block to hardware axes.

    One iname is handled per call: it is assigned the matching group/local
    hardware axis index (offset by a static lower bound), and the function
    recurses on the remaining inames. Once none are left,
    ``next_func(codegen_state)`` is returned.

    :arg codegen_state: current code generation state.
    :arg schedule_index: index into ``kernel.schedule`` identifying the block.
    :arg next_func: callable taking a code generation state, invoked when all
        hardware inames have been processed.
    :arg hw_inames_left: list of hardware inames still to be processed, or
        *None* to collect them from the instructions of the block.
    :raises RuntimeError: on an unexpected tag type, or when slab
        decomposition is requested for an iname sharing its tag with others.
    """
    from loopy.isl_helpers import make_slab, static_min_of_pw_aff
    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag)
    from loopy.schedule import get_insn_ids_for_block_at
    from loopy.symbolic import (GroupHardwareAxisIndex, LocalHardwareAxisIndex,
            pw_aff_to_expr)

    kernel = codegen_state.kernel
    block_insn_ids = get_insn_ids_for_block_at(kernel.schedule, schedule_index)

    if hw_inames_left is None:
        inames_in_block = set()
        for insn_id in block_insn_ids:
            inames_in_block |= kernel.insn_inames(insn_id)

        hw_inames_left = [
                candidate for candidate in inames_in_block
                if kernel.iname_tags_of_type(candidate, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(block_insn_ids)

    # work on a copy so the caller's list is left untouched
    remaining_hw_inames = hw_inames_left[:]
    iname = remaining_hw_inames.pop()

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    is_group_axis = isinstance(tag, GroupIndexTag)
    is_local_axis = isinstance(tag, LocalIndexTag)

    if is_group_axis:
        axis_index_expr = GroupHardwareAxisIndex(tag.axis)
    elif is_local_axis:
        axis_index_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    inames_sharing_tag = []
    for other_iname in kernel.all_inames():
        if not kernel.iname_tags_of_type(other_iname, UniqueTag):
            continue
        if other_iname != iname and any(
                _tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname)
                if _tag):
            inames_sharing_tag.append(other_iname)

    # {{{ 'implement' hardware axis boundaries

    if is_local_axis:
        axis_size = local_size[tag.axis]
    elif is_group_axis:
        axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    iname_bounds = kernel.get_iname_bounds(iname)
    iname_domain = kernel.get_inames_domain(iname)

    # A lower bound that is too "loose" is acceptable here: the conditional
    # generators clean up afterwards.
    axis_start = static_min_of_pw_aff(iname_bounds.lower_bound_pw_aff,
            constants_only=False)

    # The hardware 'implements' these bounds; record that so downstream
    # conditional generators are aware of it.
    if not isinstance(axis_size, int):
        axis_size, axis_start = isl.align_two(axis_size, axis_start)

    hw_slab = make_slab(iname_domain.get_space(), iname,
            axis_start, axis_start+axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    axis_index_expr = axis_index_expr + pw_aff_to_expr(axis_start)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if inames_sharing_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    pieces = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            slab_comment = "%s slab for '%s'" % (slab_name, iname)
            pieces.append(codegen_state.ast_builder.emit_comment(slab_comment))

        # Let the conditional infrastructure generate the slabbing
        # conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, axis_index_expr)
        slab_state = assigned_state.copy(kernel=slabbed_kernel)

        pieces.append(
                set_up_hw_parallel_loops(
                    slab_state, schedule_index, next_func,
                    remaining_hw_inames))

    return merge_codegen_results(codegen_state, pieces)
示例#21
0
文件: test_isl.py 项目: inducer/loopy
def test_pw_aff_to_conditional_expr():
    """Check the string form of the expression built from a two-piece PwAff."""
    from loopy.symbolic import pw_aff_to_expr

    piecewise = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }")
    rendered = str(pw_aff_to_expr(piecewise))

    assert rendered == "If(i == 0, 0, -1 + i)"
示例#22
0
def set_up_hw_parallel_loops(codegen_state,
                             schedule_index,
                             next_func,
                             hw_inames_left=None):
    """Assign hardware axis indices to the hardware-parallel inames of a block.

    Each call processes one iname: the iname is bound to its group/local
    hardware axis index plus a static lower bound, and the function recurses
    on the inames that remain. When no inames are left, the result of
    ``next_func(codegen_state)`` is returned.

    :arg codegen_state: current code generation state.
    :arg schedule_index: index into ``kernel.schedule`` identifying the block.
    :arg next_func: callable taking a code generation state, invoked when all
        hardware inames have been processed.
    :arg hw_inames_left: list of hardware inames still to be processed. If
        *None*, they are gathered from the block's instructions, leaving out
        inames that carry a ``VectorizeTag``.
    :raises RuntimeError: on an unexpected tag type, or when slab
        decomposition is requested for an iname sharing its tag with others.
    """
    from loopy.isl_helpers import make_slab, static_min_of_pw_aff
    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag, VectorizeTag)
    from loopy.schedule import get_insn_ids_for_block_at
    from loopy.symbolic import (GroupHardwareAxisIndex,
                                LocalHardwareAxisIndex, pw_aff_to_expr)

    kernel = codegen_state.kernel
    block_insn_ids = get_insn_ids_for_block_at(kernel.schedule,
                                               schedule_index)

    if hw_inames_left is None:
        block_inames = set()
        for insn_id in block_insn_ids:
            block_inames |= kernel.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in block_inames:
            if (kernel.iname_tags_of_type(candidate, HardwareConcurrentTag)
                    and not kernel.iname_tags_of_type(candidate,
                                                      VectorizeTag)):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        block_insn_ids)

    # copy first, so that the list passed in by the caller is not modified
    pending_inames = hw_inames_left[:]
    iname = pending_inames.pop()

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    tag_is_group = isinstance(tag, GroupIndexTag)
    tag_is_local = isinstance(tag, LocalIndexTag)

    if tag_is_group:
        index_expr = GroupHardwareAxisIndex(tag.axis)
    elif tag_is_local:
        index_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    same_tag_inames = []
    for other_iname in kernel.all_inames():
        if not kernel.iname_tags_of_type(other_iname, UniqueTag):
            continue
        if other_iname != iname and any(
                _tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname) if _tag):
            same_tag_inames.append(other_iname)

    # {{{ 'implement' hardware axis boundaries

    if tag_is_local:
        extent = local_size[tag.axis]
    elif tag_is_group:
        extent = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # An overly "loose" bound is fine: the conditional generators take care
    # of whatever is left over.
    start = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
                                 constants_only=False)

    # The hardware 'implements' these bounds, and the downstream conditional
    # generators need to know that.
    if not isinstance(extent, int):
        extent, start = isl.align_two(extent, start)

    axis_slab = make_slab(domain.get_space(), iname, start, start + extent)
    codegen_state = codegen_state.intersect(axis_slab)

    index_expr = index_expr + pw_aff_to_expr(start)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if same_tag_inames and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    collected = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            comment_text = "%s slab for '%s'" % (slab_name, iname)
            collected.append(
                codegen_state.ast_builder.emit_comment(comment_text))

        # The conditional infrastructure is responsible for generating the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        state_with_axis = codegen_state.copy_and_assign(iname, index_expr)
        state_for_slab = state_with_axis.copy(kernel=slabbed_kernel)

        collected.append(
            set_up_hw_parallel_loops(state_for_slab, schedule_index,
                                     next_func, pending_inames))

    return merge_codegen_results(codegen_state, collected)
示例#23
0
 def tup_to_exprs(tup):
     """Return a tuple holding ``pw_aff_to_expr(entry, int_ok=True)`` for
     each entry of *tup*, in order."""
     from loopy.symbolic import pw_aff_to_expr

     converted = []
     for entry in tup:
         converted.append(pw_aff_to_expr(entry, int_ok=True))

     return tuple(converted)
示例#24
0
def privatize_temporaries_with_inames(
        kernel, privatizing_inames, only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg kernel: the kernel to transform.
    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of iname names.
    :arg only_var_names: *None* to consider every temporary, or a set of
        temporary names (or a comma-separated string of them) to which the
        transformation is restricted.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if writers of one temporary disagree on the inames
        to add, or if an instruction touches a privatized temporary without
        being nested within the required inames.

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
                s.strip()
                for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
                s.strip()
                for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    # temporary name -> set of inames for which an axis must be added
    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames

            # inames already appearing in the write need no additional axis
            referenced_priv_axis_inames = (priv_axis_inames
                    & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) '%s', "
                            "but previous instructions required inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(new_priv_axis_inames),
                                ", ".join(var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname):
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=False))

            # The iname is used directly as the index into the new axis, so
            # its lower bound is required to be zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
            (var_name, tuple(var(iname) for iname in inames))
            for var_name, inames in six.iteritems(var_to_new_priv_axis_iname))

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter per instruction, so that seen_priv_axis_inames
        # reflects this instruction only.
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                    "Kernel '%s': Instruction '%s': touched variable that "
                    "(for privatization, e.g. as performed for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). To remedy this, first promote "
                    "the instruction into the iname."
                    % (kernel.name, insn.id, ", ".join(
                        eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
示例#25
0
    def base_index_and_length(self,
                              set,
                              iname,
                              context=None,
                              n_allowed_params_in_length=None):
        """Return a ``(base_index, length)`` pair of expressions for one
        dimension of *set*.

        The base index is the static value of the dimension's lower bound if
        one can be found, and a static minimum of it otherwise. The length is
        a static maximum of ``upper bound - base index + 1``.

        :arg set: the set whose dimension is examined.
        :arg iname: either the integer index of the dimension or its name.
        :arg context: passed on to the static value/min/max helpers.
        :arg n_allowed_params_in_length: Simplifies the 'length'
            argument so that only the first that many params
            (in the domain of *set*) occur.
        """
        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (static_max_of_pw_aff,
                                       static_min_of_pw_aff,
                                       static_value_of_pw_aff,
                                       find_max_of_pwaff_with_params)
        from loopy.symbolic import pw_aff_to_expr

        if isinstance(iname, int):
            dim_idx = iname
        else:
            dim_idx = set.space.get_var_dict()[iname][1]

        lower_pw_aff = self.dim_min(set, dim_idx)
        upper_pw_aff = self.dim_max(set, dim_idx)

        # {{{ first choice: a static value for the lower bound

        try:
            base_aff = static_value_of_pw_aff(lower_pw_aff,
                                              constants_only=False,
                                              context=context)
        except StaticValueFindingError:
            base_aff = None

        # }}}

        # {{{ fallback: a static minimum of the lower bound

        if base_aff is None:
            base_aff = static_min_of_pw_aff(lower_pw_aff,
                                            constants_only=False,
                                            context=context)

        # }}}

        base_index = pw_aff_to_expr(base_aff)

        extent_pw_aff = find_max_of_pwaff_with_params(
            upper_pw_aff - base_aff + 1,
            n_allowed_params_in_length)
        static_extent = static_max_of_pw_aff(extent_pw_aff,
                                             constants_only=False,
                                             context=context)

        return base_index, pw_aff_to_expr(static_extent)
示例#26
0
文件: ilp.py 项目: spillai/loopy
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    """Add an axis to temporaries for each ILP/vectorize iname they are
    written within but not indexed by.

    :arg kernel: the kernel to transform.
    :arg iname: if *None*, every iname of a writing instruction that is
        tagged with an :class:`IlpBaseTag` or :class:`VectorizeTag` is
        considered. Otherwise only this iname is considered, and it must
        carry one of these tags.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if *iname* is given but not ILP/vectorize-tagged, or
        if writers of one temporary disagree on the inames to add.
    """
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    # temporary name -> set of inames for which an axis must be added
    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(
                    insn_iname
                    for insn_iname in kernel.insn_inames(writer_insn)
                    if isinstance(kernel.iname_to_tag.get(insn_iname), (
                        IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(kernel.iname_to_tag.get(iname),
                                  (IlpBaseTag, VectorizeTag)):
                    raise LoopyError("'%s' is not an ILP iname" % iname)

                ilp_inames = frozenset([iname])

            # inames already appearing in the write need no additional axis
            referenced_ilp_inames = (ilp_inames
                                     & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    # The format string has four placeholders; the variable
                    # name must be supplied for the third one.
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for ILP inames '%s' on var '%s', but previous "
                        "instructions required inames '%s'" %
                        (writer_insn_id, ", ".join(new_ilp_inames), tv.name,
                         ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for ilp_iname in ilp_inames:
            if ilp_iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(ilp_iname, constants_only=True)
            ilp_iname_to_length[ilp_iname] = int(
                pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=True)))

            # The iname is used directly as the index into the new axis, so
            # its lower bound is required to be zero.
            assert static_max_of_pw_aff(bounds.lower_bound_pw_aff,
                                        constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[ilp_iname]
                            for ilp_iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, ilp_iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(ilp_iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
        dict((var_name, tuple(var(ilp_iname) for ilp_iname in inames))
             for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
        insn.with_transformed_expressions(eiii) for insn in kernel.instructions
    ]

    return kernel.copy(temporary_variables=new_temp_vars,
                       instructions=new_insns)
示例#27
0
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join several inames into one new iname.

    A new set dimension *new_iname* is added to the leaf domain of *inames*
    and constrained to equal the linear combination of the joined inames
    (each weighted by the product of the static lengths of the
    faster-varying inames). Each original iname is then rewritten in terms
    of *new_iname* using integer division, modulo and its lower bound.

    :arg kernel: the kernel to transform.
    :arg inames: the inames to join, fastest varying last.
    :arg new_iname: name for the joined iname. If *None*, a unique name is
        generated from the names being joined.
    :arg tag: if not *None*, *new_iname* is tagged with it.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`. The original inames are
        projected out of the domain only when this is *None*.
    :returns: the transformed kernel.
    :raises LoopyError: if one of *inames* is not at home in the leaf domain
        of the join.
    """
    from loopy.isl_helpers import (iname_rel_aff, static_max_of_pw_aff,
                                   static_value_of_pw_aff)
    from loopy.kernel.tools import DomainChanger
    from loopy.match import parse_stack_match
    from loopy.symbolic import pw_aff_to_expr
    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                             "join's leaf domain" % iname)

    # {{{ add the joined iname as a new set dimension

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    # }}}

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    # product of the lengths of all inames processed so far
    base_divisor = 1

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor * iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(
            pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(), constants_only=False)
        except Exception as e:
            # Re-raise with the same type, but keep the underlying message:
            # without it the error gives no hint about what went wrong.
            raise type(e)("while finding lower bound of '%s': %s" %
                          (iname, e))

        # Recover the original iname from the joined one. The slowest-varying
        # iname (processed last) needs no modulo.
        my_val = var(new_iname) // base_divisor
        if i + 1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    new_domain = new_domain.add_constraint(
        isl.Constraint.equality_from_aff(
            iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    if within is None:
        for iname in inames:
            # Indices shift after every projection, so look them up afresh.
            iname_dt, iname_idx = new_domain.get_space().get_var_dict()[iname]
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        # Replace every joined iname by the new iname.
        return frozenset(new_iname if dep_iname in inames else dep_iname
                         for dep_iname in fid)

    new_insns = [
        insn.copy(within_inames=subst_within_inames(insn.within_inames))
        for insn in kernel.instructions
    ]

    kernel = (kernel.copy(
        instructions=new_insns,
        domains=domch.get_domains_with(new_domain),
        applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]))

    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
                         make_subst_func(subst_dict), inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
示例#28
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop over the iname named by the
    schedule item at *sched_index*.

    For every slab returned by :func:`get_slab_decomposition`, the lower and
    upper bound of the loop iname are computed, the loop body is built via
    ``build_loop_nest(..., sched_index + 1)``, and either a loop or (when the
    implemented bounds coincide) a single initializer is emitted.

    :arg codegen_state: current code generation state; supplies the kernel,
        the expression-to-code mapper, the AST builder and the implemented
        domain.
    :arg sched_index: index into ``kernel.schedule`` of the loop being
        generated.
    :returns: the result of :func:`merge_codegen_results` over the code
        generated for all slabs.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label the generated code when there is more than one slab.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        # Lift the kernel assumptions out of parameter space so they can be
        # intersected with the domain.
        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        # Looked up only after the moves above, since those change indices.
        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(codegen_state.implemented_domain,
                                       dom_and_slab,
                                       obj_bigger_ok=True).params()

        # Bounds of the loop iname, simplified against the assumptions and
        # the already-implemented domain.
        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(dom_and_slab.space,
                                                 loop_iname, impl_lbound,
                                                 impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(dim_type.set,
                                            impl_loop.dim(dim_type.set), dt,
                                            idx, 1)

        # State for the loop body: implemented domain restricted to the loop,
        # kernel restricted to the current slab.
        new_codegen_state = (codegen_state.intersect(impl_loop).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(codegen_state,
                                      kernel.index_dtype,
                                      loop_iname,
                                      ecm(pw_aff_to_expr(lbound), PREC_NONE,
                                          "i"),
                                      is_const=True),
                astb.emit_blank_line(),
                inner,
            ])
            # Wrap initializer and body in the builder's block scope class.
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.ast_block_scope_class(
                        inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            # General case: emit a loop from lbound to ubound, both
            # simplified under the kernel assumptions.
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(
                            simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(
                            simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
示例#29
0
文件: __init__.py 项目: arghdos/loopy
 def tup_to_exprs(tup):
     """Apply ``pw_aff_to_expr`` with ``int_ok=True`` to each item of *tup*
     and return the converted items as a tuple."""
     from loopy.symbolic import pw_aff_to_expr

     def convert(item):
         return pw_aff_to_expr(item, int_ok=True)

     return tuple([convert(item) for item in tup])
示例#30
0
def privatize_temporaries_with_inames(kernel,
                                      privatizing_inames,
                                      only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg kernel: the kernel to transform. It is not modified; a copy with
        updated temporaries and instructions is returned.
    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of iname names.
    :arg only_var_names: *None* (consider all temporaries), a collection of
        temporary variable names, or a comma-separated string of such names.
    :returns: a copy of *kernel* with new *temporary_variables* and
        *instructions*.
    :raises LoopyError: if two writers of the same temporary require
        different sets of privatizing inames, or if an instruction touches a
        privatized temporary without being nested within the required
        iname(s).

    .. versionadded:: 2018.1
    """

    # Accept comma-separated strings as a convenience for both name sets.
    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(s.strip()
                                       for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(s.strip()
                                   for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    # temporary name -> set of inames for which a new axis must be added
    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            # Privatizing inames that the write already depends on need no
            # additional axis.
            referenced_priv_axis_inames = (
                priv_axis_inames
                & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                # NOTE(review): this stops looking at the remaining writers
                # of this temporary (rather than skipping just this one) --
                # confirm that 'break' instead of 'continue' is intended.
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    # sorted() keeps the message independent of set
                    # iteration order.
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for privatizing var '%s' on iname(s) '%s', "
                        "but previous instructions required inames '%s'" %
                        (writer_insn_id, tv.name,
                         ", ".join(sorted(new_priv_axis_inames)),
                         ", ".join(sorted(
                             var_to_new_priv_axis_iname[tv.name]))))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                # bounds for this iname were already computed
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=False))
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname]
                            for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        # The new axes are appended after the existing ones; an axis whose
        # iname carries a VectorizeTag gets the "vec" dim tag, all others "c".
        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
        var_name: tuple(var(iname) for iname in inames)
        for var_name, inames in var_to_new_priv_axis_iname.items()
    }

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter is created per instruction; its
        # seen_priv_axis_inames is checked against this instruction's
        # within_inames right after the transformation.
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                "Kernel '%s': Instruction '%s': touched variable that "
                "(for privatization, e.g. as performed for ILP) "
                "required iname(s) '%s', but that the instruction was not "
                "previously within the iname(s). To remedy this, first promote "
                "the instruction into the iname." %
                (kernel.name, insn.id,
                 ", ".join(sorted(
                     eiii.seen_priv_axis_inames - insn.within_inames))))

        new_insns.append(new_insn)

    return kernel.copy(temporary_variables=new_temp_vars,
                       instructions=new_insns)
示例#31
0
def guess_var_shape(kernel, var_name):
    """Try to determine the shape of *var_name* from the way the
    instructions of *kernel* access it.

    :returns: a tuple with one entry per axis. ``()`` is returned if no
        subscripted access was found, and ``(None,)`` if only
        one-dimensional accesses with undeterminable range were found.
    :raises LoopyError: if walking the instructions fails with a
        :class:`TypeError`, or if the access range cannot be determined
        (linear subscripts, or undetermined indices with more than one axis).
    :raises RuntimeError: if the undeterminable subscripts disagree on the
        number of axes.
    """
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    range_mapper = AccessRangeMapper(kernel, var_name)

    rule_expander = SubstitutionRuleExpander(kernel.substitutions)

    def feed_range_mapper(expr):
        # 'insn' refers to the loop variable of the instruction loop below.
        range_mapper(rule_expander(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(feed_range_mapper)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
            "Failed to (automatically, as requested) find "
            "shape/strides for variable '%s'. "
            "Specifying the shape manually should get rid of this. "
            "The following error occurred: %s" % (var_name, str(e)))

    access_range = range_mapper.access_range

    if access_range is not None:
        # An access range is known: one shape entry per set dimension,
        # each being (static max of the largest index) + 1.
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        axis_lengths = []
        for i in range(access_range.dim(dim_type.set)):
            try:
                axis_end = kernel.cache_manager.dim_max(access_range, i) + 1
                axis_lengths.append(
                    pw_aff_to_expr(
                        static_max_of_pw_aff(axis_end,
                                             constants_only=False)))
            except:
                # Only adds advice for the user; the exception itself
                # propagates unchanged.
                print("While trying to find shape axis %d of "
                      "variable '%s', the following "
                      "exception occurred:" % (i, var_name),
                      file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                      "shape of argument '%s'." % (var_name),
                      file=sys.stderr)
                raise

        return tuple(axis_lengths)

    bad_subscripts = range_mapper.bad_subscripts

    if not bad_subscripts:
        # no subscripts found, let's call it a scalar
        return ()

    from loopy.symbolic import LinearSubscript
    if any(isinstance(sub, LinearSubscript) for sub in bad_subscripts):
        raise LoopyError(
            "cannot determine access range for '%s': "
            "linear subscript(s) in '%s'" %
            (var_name, ", ".join(str(i) for i in bad_subscripts)))

    axis_counts = {len(sub.index_tuple) for sub in bad_subscripts}

    if len(axis_counts) != 1:
        raise RuntimeError("subscripts of '%s' with differing "
                           "numbers of axes were found" % var_name)

    n_axes, = axis_counts

    if n_axes != 1:
        raise LoopyError(
            "cannot determine access range for '%s': "
            "undetermined index in subscript(s) '%s'" %
            (var_name, ", ".join(str(i) for i in bad_subscripts)))

    # Leave shape undetermined--we can live with that for 1D.
    return (None, )