Python align_two 예제들, islpy.align_two Python 예제들

예제 #1

0

파일 보기

파일: isl_helpers.py 프로젝트: shwina/loopy

def make_loop_bounds_from_pwaffs(space, iname, lbound, ubound):
    dt, pos = space.get_var_dict()[iname]
    iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos)

    iname_pwaff, lbound = isl.align_two(iname_pwaff, lbound)
    iname_pwaff, ubound = isl.align_two(iname_pwaff, ubound)
    assert iname_pwaff.space == lbound.space
    assert iname_pwaff.space == ubound.space

    return (iname_pwaff.ge_set(lbound) & iname_pwaff.le_set(ubound))

예제 #2

0

파일 보기

파일: isl_helpers.py 프로젝트: navjotk/loopy

def make_slab_from_bound_pwaffs(space, iname, lbound, ubound):
    dt, pos = space.get_var_dict()[iname]
    iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos)

    iname_pwaff, lbound = isl.align_two(iname_pwaff, lbound)
    iname_pwaff, ubound = isl.align_two(iname_pwaff, ubound)
    assert iname_pwaff.space == lbound.space
    assert iname_pwaff.space == ubound.space

    return convexify(
            iname_pwaff.ge_set(lbound)
            &
            iname_pwaff.le_set(ubound))

예제 #3

0

파일 보기

파일: loop.py 프로젝트: navjotk/loopy

def intersect_kernel_with_slab(kernel, slab, iname):
    from loopy.kernel.tools import DomainChanger

    domch = DomainChanger(kernel, (iname,))
    orig_domain = domch.get_original_domain()
    orig_domain, slab = isl.align_two(orig_domain, slab)
    return domch.get_kernel_with(orig_domain & slab)

예제 #4

0

파일 보기

def check_bounds(kernel):
    """
    Performs out-of-bound check for every array access.
    """
    from loopy.kernel.instruction import get_insn_domain
    temp_var_names = set(kernel.temporary_variables)
    acm = _AccessCheckMapper(kernel)
    kernel_assumptions_is_universe = kernel.assumptions.is_universe()
    for insn in kernel.instructions:
        domain = get_insn_domain(insn, kernel)

        # data-dependent bounds? can't do much
        if set(domain.get_var_names(dim_type.param)) & temp_var_names:
            continue

        if kernel_assumptions_is_universe:
            domain_with_assumptions = domain
        else:
            domain, assumptions = isl.align_two(domain, kernel.assumptions)
            domain_with_assumptions = domain & assumptions

        def run_acm(expr):
            acm(expr, domain_with_assumptions, insn.id)
            return expr

        insn.with_transformed_expressions(run_acm)

예제 #5

0

파일 보기

    def augment_domain_with_sweep(self,
                                  domain,
                                  new_non1_storage_axis_names,
                                  boxify_sweep=False):

        renamed_aug_domain = self.aug_domain
        first_storage_index = (renamed_aug_domain.dim(dim_type.set) -
                               len(self.non1_storage_shape))

        inon1 = 0
        for i, old_name in enumerate(self.storage_axis_names):
            if not self.non1_storage_axis_flags[i]:
                continue

            new_name = new_non1_storage_axis_names[inon1]

            assert (renamed_aug_domain.get_dim_name(
                dim_type.set, first_storage_index + inon1) == old_name)
            renamed_aug_domain = renamed_aug_domain.set_dim_name(
                dim_type.set, first_storage_index + inon1, new_name)

            inon1 += 1

        # Order of arguments to align_two matters--'domain' should be the
        # 'guiding' ordering.
        renamed_aug_domain, domain = isl.align_two(renamed_aug_domain, domain)

        domain = domain & renamed_aug_domain

        from loopy.isl_helpers import convexify, boxify
        if boxify_sweep:
            return boxify(self.kernel.cache_manager, domain,
                          new_non1_storage_axis_names, self.kernel.assumptions)
        else:
            return convexify(domain)

예제 #6

0

파일 보기

def intersect_kernel_with_slab(kernel, slab, iname):
    from loopy.kernel.tools import DomainChanger

    domch = DomainChanger(kernel, (iname, ))
    orig_domain = domch.get_original_domain()
    orig_domain, slab = isl.align_two(slab, orig_domain)
    return domch.get_kernel_with(orig_domain & slab)

예제 #7

0

파일 보기

파일: array_buffer_map.py 프로젝트: cmsquared/loopy

    def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
            boxify_sweep=False):

        renamed_aug_domain = self.aug_domain
        first_storage_index = (renamed_aug_domain.dim(dim_type.set)
                - len(self.non1_storage_shape))

        inon1 = 0
        for i, old_name in enumerate(self.storage_axis_names):
            if not self.non1_storage_axis_flags[i]:
                continue

            new_name = new_non1_storage_axis_names[inon1]

            assert (
                    renamed_aug_domain.get_dim_name(
                        dim_type.set, first_storage_index+inon1)
                    == old_name)
            renamed_aug_domain = renamed_aug_domain.set_dim_name(
                    dim_type.set, first_storage_index+inon1, new_name)

            inon1 += 1

        # Order of arguments to align_two matters--'domain' should be the
        # 'guiding' ordering.
        renamed_aug_domain, domain = isl.align_two(renamed_aug_domain, domain)

        domain = domain & renamed_aug_domain

        from loopy.isl_helpers import convexify, boxify
        if boxify_sweep:
            return boxify(self.kernel.cache_manager, domain,
                    new_non1_storage_axis_names, self.kernel.assumptions)
        else:
            return convexify(domain)

예제 #8

0

파일 보기

파일: symbolic.py 프로젝트: dokempf/loopy

def get_access_range(domain, subscript, assumptions):
    domain, assumptions = isl.align_two(domain, assumptions)
    domain = domain & assumptions
    del assumptions

    dims = len(subscript)

    # we build access_map as a set because (idiocy!) Affs
    # cannot live on maps.

    # dims: [domain](dn)[storage]
    access_map = domain

    if isinstance(access_map, isl.BasicSet):
        access_map = isl.Set.from_basic_set(access_map)

    dn = access_map.dim(dim_type.set)
    access_map = access_map.insert_dims(dim_type.set, dn, dims)

    for idim in range(dims):
        idx_aff = aff_from_expr(access_map.get_space(), subscript[idim])
        idx_aff = idx_aff.set_coefficient_val(dim_type.in_, dn + idim, -1)

        access_map = access_map.add_constraint(
            isl.Constraint.equality_from_aff(idx_aff))

    access_map_as_map = isl.Map.universe(access_map.get_space())
    access_map_as_map = access_map_as_map.intersect_range(access_map)
    access_map = access_map_as_map.move_dims(dim_type.in_, 0, dim_type.out, 0,
                                             dn)
    del access_map_as_map

    return access_map.range()

예제 #9

0

파일 보기

파일: instruction.py 프로젝트: cmsquared/loopy

def to_codegen_result(
        codegen_state, insn_id, domain, check_inames, required_preds, ast):
    from loopy.codegen.bounds import get_bounds_checks
    from loopy.symbolic import constraint_to_expr

    bounds_checks = get_bounds_checks(
            domain, check_inames,
            codegen_state.implemented_domain, overapproximate=False)
    bounds_check_set = isl.Set.universe(domain.get_space()) \
            .add_constraints(bounds_checks)
    bounds_check_set, new_implemented_domain = isl.align_two(
            bounds_check_set, codegen_state.implemented_domain)
    new_implemented_domain = new_implemented_domain & bounds_check_set

    if bounds_check_set.is_empty():
        return None

    condition_exprs = [
            constraint_to_expr(cns)
            for cns in bounds_checks]

    condition_exprs.extend(
            required_preds - codegen_state.implemented_predicates)

    if condition_exprs:
        from pymbolic.primitives import LogicalAnd
        from pymbolic.mapper.stringifier import PREC_NONE
        ast = codegen_state.ast_builder.emit_if(
                codegen_state.expression_to_code_mapper(
                    LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                ast)

    return CodeGenerationResult.new(
            codegen_state, insn_id, ast, new_implemented_domain)

예제 #10

0

파일 보기

파일: test_isl.py 프로젝트: egnlife/islpy

def test_isl_align_two():
    a1 = isl.Aff("[t0, t1, t2] -> { [(32)] }")
    a2 = isl.Aff("[t1, t0] -> { [(0)] }")

    a1_aligned, a2_aligned = isl.align_two(a1, a2)
    assert a1_aligned == isl.Aff("[t1, t0, t2] -> { [(32)] }")
    assert a2_aligned == isl.Aff("[t1, t0, t2] -> { [(0)] }")

    b1 = isl.BasicSet("[n0, n1, n2] -> { [i0, i1] : }")
    b2 = isl.BasicSet("[n0, n2, n1, n3] -> { [i1, i0, i2] : }")

    b1_aligned, b2_aligned = isl.align_two(b1, b2)
    assert b1_aligned == isl.BasicSet(
        "[n0, n2, n1, n3] -> { [i1, i0, i2] :  }")
    assert b2_aligned == isl.BasicSet(
        "[n0, n2, n1, n3] -> { [i1, i0, i2] :  }")

예제 #11

0

파일 보기

파일: instruction.py 프로젝트: navjotk/loopy

def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, stmt):
    from loopy.codegen.bounds import get_bounds_checks, constraint_to_code
    bounds_checks = get_bounds_checks(
            domain, check_inames,
            codegen_state.implemented_domain, overapproximate=False)
    bounds_check_set = isl.Set.universe(domain.get_space()) \
            .add_constraints(bounds_checks)
    bounds_check_set, new_implemented_domain = isl.align_two(
            bounds_check_set, codegen_state.implemented_domain)
    new_implemented_domain = new_implemented_domain & bounds_check_set

    if bounds_check_set.is_empty():
        return None, None

    condition_codelets = [
            constraint_to_code(
                codegen_state.expression_to_code_mapper, cns)
            for cns in bounds_checks]

    condition_codelets.extend(
            required_preds - codegen_state.implemented_predicates)

    if condition_codelets:
        from cgen import If
        stmt = If("\n&& ".join(condition_codelets), stmt)

    return stmt, new_implemented_domain

예제 #12

0

파일 보기

파일: __init__.py 프로젝트: arghdos/loopy

    def get_iname_bounds(self, iname, constants_only=False):
        domain = self.get_inames_domain(frozenset([iname]))

        assumptions = self.assumptions.project_out_except(
            set(domain.get_var_dict(dim_type.param)), [dim_type.param])

        aligned_assumptions, domain = isl.align_two(assumptions, domain)

        dom_intersect_assumptions = aligned_assumptions & domain

        if constants_only:
            # Kill all variable dependencies
            dom_intersect_assumptions = dom_intersect_assumptions.project_out_except(
                [iname], [dim_type.param, dim_type.set])

        iname_idx = dom_intersect_assumptions.get_var_dict()[iname][1]

        lower_bound_pw_aff = (self.cache_manager.dim_min(
            dom_intersect_assumptions, iname_idx).coalesce())
        upper_bound_pw_aff = (self.cache_manager.dim_max(
            dom_intersect_assumptions, iname_idx).coalesce())

        class BoundsRecord(ImmutableRecord):
            pass

        size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
        size = size.gist(assumptions)

        return BoundsRecord(lower_bound_pw_aff=lower_bound_pw_aff,
                            upper_bound_pw_aff=upper_bound_pw_aff,
                            size=size)

예제 #13

0

파일 보기

파일: expression.py 프로젝트: KrisJe/loopy

    def map_floor_div(self, expr, type_context):
        from loopy.symbolic import get_dependencies
        iname_deps = get_dependencies(expr) & self.kernel.all_inames()
        domain = self.kernel.get_inames_domain(iname_deps)

        assumption_non_param = isl.BasicSet.from_params(
            self.kernel.assumptions)
        assumptions, domain = isl.align_two(assumption_non_param, domain)
        domain = domain & assumptions

        from loopy.isl_helpers import is_nonnegative
        num_nonneg = is_nonnegative(expr.numerator, domain)
        den_nonneg = is_nonnegative(expr.denominator, domain)

        def seen_func(name):
            idt = self.kernel.index_dtype
            from loopy.codegen import SeenFunction
            self.codegen_state.seen_functions.add(
                SeenFunction(name, name, (idt, idt)))

        if den_nonneg:
            if num_nonneg:
                # parenthesize to avoid negative signs being dragged in from the
                # outside by associativity
                return (self.rec(expr.numerator, type_context) //
                        self.rec(expr.denominator, type_context))
            else:
                seen_func("int_floor_div_pos_b")
                return var("int_floor_div_pos_b")(self.rec(
                    expr.numerator, 'i'), self.rec(expr.denominator, 'i'))
        else:
            seen_func("int_floor_div")
            return var("int_floor_div")(self.rec(expr.numerator, 'i'),
                                        self.rec(expr.denominator, 'i'))

예제 #14

0

파일 보기

def to_codegen_result(codegen_state, insn_id, domain, check_inames,
                      required_preds, ast):
    from loopy.codegen.bounds import get_bounds_checks
    from loopy.symbolic import constraint_to_expr

    bounds_checks = get_bounds_checks(domain,
                                      check_inames,
                                      codegen_state.implemented_domain,
                                      overapproximate=False)
    bounds_check_set = isl.Set.universe(domain.get_space()) \
            .add_constraints(bounds_checks)
    bounds_check_set, new_implemented_domain = isl.align_two(
        bounds_check_set, codegen_state.implemented_domain)
    new_implemented_domain = new_implemented_domain & bounds_check_set

    if bounds_check_set.is_empty():
        return None

    condition_exprs = [constraint_to_expr(cns) for cns in bounds_checks]

    condition_exprs.extend(required_preds -
                           codegen_state.implemented_predicates)

    if condition_exprs:
        from pymbolic.primitives import LogicalAnd
        from pymbolic.mapper.stringifier import PREC_NONE
        ast = codegen_state.ast_builder.emit_if(
            codegen_state.expression_to_code_mapper(
                LogicalAnd(tuple(condition_exprs)), PREC_NONE), ast)

    return CodeGenerationResult.new(codegen_state, insn_id, ast,
                                    new_implemented_domain)

예제 #15

0

파일 보기

def get_bounds_checks(domain, check_inames, implemented_domain,
        overapproximate):
    if isinstance(domain, isl.BasicSet):
        domain = isl.Set.from_basic_set(domain)
    domain = domain.remove_redundancies()
    result = domain.eliminate_except(check_inames, [dim_type.set])

    if overapproximate:
        # This is ok, because we're really looking for the
        # projection, with no remaining constraints from
        # the eliminated variables.
        result = result.remove_divs()
    else:
        result = result.compute_divs()

    result, implemented_domain = isl.align_two(result, implemented_domain)
    result = result.gist(implemented_domain)

    if overapproximate:
        result = result.remove_divs()
    else:
        result = result.compute_divs()

    from loopy.isl_helpers import convexify
    result = convexify(result)
    return result.get_constraints()

예제 #16

0

파일 보기

def _get_new_implemented_domain(kernel, chk_domain, implemented_domain):

    chk_domain, implemented_domain = isl.align_two(chk_domain,
                                                   implemented_domain)
    chk_domain = chk_domain.gist(implemented_domain)

    new_implemented_domain = implemented_domain & chk_domain
    return chk_domain, new_implemented_domain

예제 #17

0

파일 보기

    def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs):
        # Make all inames except the sweep parameters. (The footprint may depend on
        # those.) (I.e. only leave sweep inames as out parameters.)

        global_s2s_par_dom = move_to_par_from_out(
                self.stor2sweep,
                except_inames=frozenset(self.primed_sweep_inames)).domain()

        arg_inames = (
                set(global_s2s_par_dom.get_var_names(dim_type.param))
                & self.kernel.all_inames())

        for arg in storage_axis_exprs:
            arg_inames.update(get_dependencies(arg))
        arg_inames = frozenset(arg_inames)

        from loopy.kernel import CannotBranchDomainTree
        try:
            usage_domain = self.kernel.get_inames_domain(arg_inames)
        except CannotBranchDomainTree:
            return False

        for i in range(usage_domain.dim(dim_type.set)):
            iname = usage_domain.get_dim_name(dim_type.set, i)
            if iname in self.sweep_inames:
                usage_domain = usage_domain.set_dim_name(
                        dim_type.set, i, iname+"'")

        stor2sweep = build_per_access_storage_to_domain_map(
                storage_axis_exprs,
                usage_domain, self.storage_axis_names,
                self.prime_sweep_inames)

        if stor2sweep is None:
            # happens if there are no indices
            # -> yes, in footprint
            return True

        if isinstance(stor2sweep, isl.BasicMap):
            stor2sweep = isl.Map.from_basic_map(stor2sweep)

        stor2sweep = stor2sweep.intersect_range(usage_domain)

        stor2sweep = move_to_par_from_out(stor2sweep,
                except_inames=frozenset(self.primed_sweep_inames))

        s2s_domain = stor2sweep.domain()
        s2s_domain, aligned_g_s2s_parm_dom = isl.align_two(
                s2s_domain, global_s2s_par_dom)

        arg_restrictions = (
                aligned_g_s2s_parm_dom
                .eliminate(dim_type.set, 0,
                    aligned_g_s2s_parm_dom.dim(dim_type.set))
                .remove_divs())

        return (arg_restrictions & s2s_domain).is_subset(
                aligned_g_s2s_parm_dom)

예제 #18

0

파일 보기

파일: array_buffer_map.py 프로젝트: cmsquared/loopy

    def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs):
        # Make all inames except the sweep parameters. (The footprint may depend on
        # those.) (I.e. only leave sweep inames as out parameters.)

        global_s2s_par_dom = move_to_par_from_out(
                self.stor2sweep,
                except_inames=frozenset(self.primed_sweep_inames)).domain()

        arg_inames = (
                set(global_s2s_par_dom.get_var_names(dim_type.param))
                & self.kernel.all_inames())

        for arg in storage_axis_exprs:
            arg_inames.update(get_dependencies(arg))
        arg_inames = frozenset(arg_inames)

        from loopy.kernel import CannotBranchDomainTree
        try:
            usage_domain = self.kernel.get_inames_domain(arg_inames)
        except CannotBranchDomainTree:
            return False

        for i in range(usage_domain.dim(dim_type.set)):
            iname = usage_domain.get_dim_name(dim_type.set, i)
            if iname in self.sweep_inames:
                usage_domain = usage_domain.set_dim_name(
                        dim_type.set, i, iname+"'")

        stor2sweep = build_per_access_storage_to_domain_map(
                storage_axis_exprs,
                usage_domain, self.storage_axis_names,
                self.prime_sweep_inames)

        if stor2sweep is None:
            # happens if there are no indices
            # -> yes, in footprint
            return True

        if isinstance(stor2sweep, isl.BasicMap):
            stor2sweep = isl.Map.from_basic_map(stor2sweep)

        stor2sweep = stor2sweep.intersect_range(usage_domain)

        stor2sweep = move_to_par_from_out(stor2sweep,
                except_inames=frozenset(self.primed_sweep_inames))

        s2s_domain = stor2sweep.domain()
        s2s_domain, aligned_g_s2s_parm_dom = isl.align_two(
                s2s_domain, global_s2s_par_dom)

        arg_restrictions = (
                aligned_g_s2s_parm_dom
                .eliminate(dim_type.set, 0,
                    aligned_g_s2s_parm_dom.dim(dim_type.set))
                .remove_divs())

        return (arg_restrictions & s2s_domain).is_subset(
                aligned_g_s2s_parm_dom)

예제 #19

0

파일 보기

파일: loopy_utils.py 프로젝트: zjuchenll/polymath

def _combine_domains_from_kernel(knl):
    import islpy as isl
    result = None
    for dom in knl.domains:
        if result is None:
            result = dom
        else:
            aligned_dom, aligned_result = isl.align_two(
                    dom, result, across_dim_types=True)
            result = aligned_result & aligned_dom

    return result

예제 #20

0

파일 보기

파일: loopy_utils.py 프로젝트: zjuchenll/polymath

def _combine_domains_from_list(domain_strs):
    from loopy.kernel.creation import parse_domains
    import islpy as isl
    domains = parse_domains(domain_strs, {})
    result = None
    for dom in domains:
        if result is None:
            result = dom
        else:
            aligned_dom, aligned_result = isl.align_two(
                dom, result, across_dim_types=True)
            result = aligned_result & aligned_dom
    return result

예제 #21

0

파일 보기

파일: isl_helpers.py 프로젝트: navjotk/loopy

def make_slab(space, iname, start, stop):
    zero = isl.Aff.zero_on_domain(space)

    if isinstance(start, (isl.Aff, isl.PwAff)):
        start, zero = isl.align_two(pw_aff_to_aff(start), zero)
    if isinstance(stop, (isl.Aff, isl.PwAff)):
        stop, zero = isl.align_two(pw_aff_to_aff(stop), zero)

    space = zero.get_domain_space()

    from pymbolic.primitives import Expression
    from loopy.symbolic import aff_from_expr
    if isinstance(start, Expression):
        start = aff_from_expr(space, start)
    if isinstance(stop, Expression):
        stop = aff_from_expr(space, stop)

    if isinstance(start, int):
        start = zero + start
    if isinstance(stop, int):
        stop = zero + stop

    if isinstance(iname, str):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
    else:
        iname_dt, iname_idx = iname

    iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

    result = (isl.BasicSet.universe(space)
            # start <= iname
            .add_constraint(isl.Constraint.inequality_from_aff(
                iname_aff - start))
            # iname < stop
            .add_constraint(isl.Constraint.inequality_from_aff(
                stop-1 - iname_aff)))

    return result

예제 #22

0

파일 보기

파일: isl_helpers.py 프로젝트: shwina/loopy

def make_slab(space, iname, start, stop):
    zero = isl.Aff.zero_on_domain(space)

    if isinstance(start, (isl.Aff, isl.PwAff)):
        start, zero = isl.align_two(pw_aff_to_aff(start), zero)
    if isinstance(stop, (isl.Aff, isl.PwAff)):
        stop, zero = isl.align_two(pw_aff_to_aff(stop), zero)

    space = zero.get_domain_space()

    from pymbolic.primitives import Expression
    from loopy.symbolic import aff_from_expr
    if isinstance(start, Expression):
        start = aff_from_expr(space, start)
    if isinstance(stop, Expression):
        stop = aff_from_expr(space, stop)

    if isinstance(start, int):
        start = zero + start
    if isinstance(stop, int):
        stop = zero + stop

    if isinstance(iname, str):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
    else:
        iname_dt, iname_idx = iname

    iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

    result = (
        isl.BasicSet.universe(space)
        # start <= iname
        .add_constraint(isl.Constraint.inequality_from_aff(iname_aff - start))
        # iname < stop
        .add_constraint(
            isl.Constraint.inequality_from_aff(stop - 1 - iname_aff)))

    return result

예제 #23

0

파일 보기

파일: iname.py 프로젝트: navjotk/loopy

def assume(kernel, assumptions):
    if isinstance(assumptions, str):
        assumptions_set_str = "[%s] -> { : %s}" \
                % (",".join(s for s in kernel.outer_params()),
                    assumptions)
        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
                assumptions_set_str)

    if not isinstance(assumptions, isl.BasicSet):
        raise TypeError("'assumptions' must be a BasicSet or a string")

    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)

    return kernel.copy(
            assumptions=old_assumptions.params() & new_assumptions.params())

예제 #24

0

파일 보기

파일: parameter.py 프로젝트: dokempf/loopy

def assume(kernel, assumptions):
    """Include an assumption about :ref:`domain-parameters` in the kernel, e.g.
    `n mod 4 = 0`.

    :arg assumptions: a :class:`islpy.BasicSet` or a string representation of
        the assumptions in :ref:`isl-syntax`.
    """
    if isinstance(assumptions, str):
        assumptions_set_str = "[%s] -> { : %s}" % (",".join(s for s in kernel.outer_params()), assumptions)
        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(), assumptions_set_str)

    if not isinstance(assumptions, isl.BasicSet):
        raise TypeError("'assumptions' must be a BasicSet or a string")

    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions, assumptions)

    return kernel.copy(assumptions=old_assumptions.params() & new_assumptions.params())

예제 #25

0

파일 보기

파일: isl_helpers.py 프로젝트: navjotk/loopy

def boxify(cache_manager, domain, box_inames, context):
    var_dict = domain.get_var_dict(dim_type.set)
    box_iname_indices = [var_dict[iname][1] for iname in box_inames]
    n_nonbox_inames = min(box_iname_indices)

    assert box_iname_indices == list(range(
            n_nonbox_inames, domain.dim(dim_type.set)))

    n_old_parameters = domain.dim(dim_type.param)
    domain = domain.move_dims(
            dim_type.param, n_old_parameters, dim_type.set, 0, n_nonbox_inames)

    result = domain
    zero = isl.Aff.zero_on_domain(result.space)

    for i in range(len(box_iname_indices)):
        result = result.eliminate(dim_type.set, i, 1)

        iname_aff = zero.add_coefficient_val(dim_type.in_, i, 1)

        def add_in_dims(aff):
            return aff.add_dims(dim_type.in_, len(box_inames))

        iname_min = add_in_dims(cache_manager.dim_min(domain, i)).coalesce()
        iname_max = add_in_dims(cache_manager.dim_max(domain, i)).coalesce()

        iname_slab = (iname_min.le_set(iname_aff)
                .intersect(iname_max.ge_set(iname_aff)))

        for i, iname in enumerate(box_inames):
            iname_slab = iname_slab.set_dim_name(dim_type.set, i, iname)

        if context is not None:
            iname_slab, context = isl.align_two(iname_slab, context)
            iname_slab = iname_slab.gist(context)
        iname_slab = iname_slab.coalesce()

        result = result & iname_slab

    result = result.move_dims(
            dim_type.set, 0, dim_type.param, n_old_parameters, n_nonbox_inames)

    return convexify(result)

예제 #26

0

파일 보기

파일: isl_helpers.py 프로젝트: shwina/loopy

def boxify(cache_manager, domain, box_inames, context):
    var_dict = domain.get_var_dict(dim_type.set)
    box_iname_indices = [var_dict[iname][1] for iname in box_inames]
    n_nonbox_inames = min(box_iname_indices)

    assert box_iname_indices == list(
        range(n_nonbox_inames, domain.dim(dim_type.set)))

    n_old_parameters = domain.dim(dim_type.param)
    domain = domain.move_dims(dim_type.param, n_old_parameters, dim_type.set,
                              0, n_nonbox_inames)

    result = domain
    zero = isl.Aff.zero_on_domain(result.space)

    for i in range(len(box_iname_indices)):
        result = result.eliminate(dim_type.set, i, 1)

        iname_aff = zero.add_coefficient_val(dim_type.in_, i, 1)

        def add_in_dims(aff):
            return aff.add_dims(dim_type.in_, len(box_inames))

        iname_min = add_in_dims(cache_manager.dim_min(domain, i)).coalesce()
        iname_max = add_in_dims(cache_manager.dim_max(domain, i)).coalesce()

        iname_slab = (iname_min.le_set(iname_aff).intersect(
            iname_max.ge_set(iname_aff)))

        for i, iname in enumerate(box_inames):
            iname_slab = iname_slab.set_dim_name(dim_type.set, i, iname)

        if context is not None:
            iname_slab, context = isl.align_two(iname_slab, context)
            iname_slab = iname_slab.gist(context)
        iname_slab = iname_slab.coalesce()

        result = result & iname_slab

    result = result.move_dims(dim_type.set, 0, dim_type.param,
                              n_old_parameters, n_nonbox_inames)

    return convexify(result)

예제 #27

0

파일 보기

파일: expression.py 프로젝트: navjotk/loopy

    def map_floor_div(self, expr, enclosing_prec, type_context):
        from loopy.symbolic import get_dependencies

        iname_deps = get_dependencies(expr) & self.kernel.all_inames()
        domain = self.kernel.get_inames_domain(iname_deps)

        assumption_non_param = isl.BasicSet.from_params(self.kernel.assumptions)
        assumptions, domain = isl.align_two(assumption_non_param, domain)
        domain = domain & assumptions

        from loopy.isl_helpers import is_nonnegative

        num_nonneg = is_nonnegative(expr.numerator, domain)
        den_nonneg = is_nonnegative(expr.denominator, domain)

        def seen_func(name):
            idt = self.kernel.index_dtype
            from loopy.codegen import SeenFunction

            self.codegen_state.seen_functions.add(SeenFunction(name, name, (idt, idt)))

        if den_nonneg:
            if num_nonneg:
                # parenthesize to avoid negative signs being dragged in from the
                # outside by associativity
                return "(%s / %s)" % (
                    self.rec(expr.numerator, PREC_PRODUCT, type_context),
                    # analogous to ^{-1}
                    self.rec(expr.denominator, PREC_POWER, type_context),
                )
            else:
                seen_func("int_floor_div_pos_b")
                return "int_floor_div_pos_b(%s, %s)" % (
                    self.rec(expr.numerator, PREC_NONE, "i"),
                    self.rec(expr.denominator, PREC_NONE, "i"),
                )
        else:
            seen_func("int_floor_div")
            return "int_floor_div(%s, %s)" % (
                self.rec(expr.numerator, PREC_NONE, "i"),
                self.rec(expr.denominator, PREC_NONE, "i"),
            )

예제 #28

0

파일 보기

파일: parameter.py 프로젝트: connorjward/loopy

def assume(kernel, assumptions):
    """Include an assumption about :ref:`domain-parameters` in the kernel, e.g.
    `n mod 4 = 0`.

    :arg assumptions: a :class:`islpy.BasicSet` or a string representation of
        the assumptions in :ref:`isl-syntax`.
    """
    if isinstance(assumptions, str):
        assumptions_set_str = "[%s] -> { : %s}" \
                % (",".join(s for s in kernel.outer_params()),
                    assumptions)
        assumptions = isl.BasicSet.read_from_str(kernel.domains[0].get_ctx(),
                                                 assumptions_set_str)

    if not isinstance(assumptions, isl.BasicSet):
        raise TypeError("'assumptions' must be a BasicSet or a string")

    old_assumptions, new_assumptions = isl.align_two(kernel.assumptions,
                                                     assumptions)

    return kernel.copy(assumptions=old_assumptions.params()
                       & new_assumptions.params())

예제 #29

0

파일 보기

파일: expression.py 프로젝트: yueyedeai/loopy

    def map_integer_div_operator(self, base_func_name, op_func, expr, type_context):
        from loopy.symbolic import get_dependencies
        iname_deps = get_dependencies(expr) & self.kernel.all_inames()
        domain = self.kernel.get_inames_domain(iname_deps)

        assumption_non_param = isl.BasicSet.from_params(self.kernel.assumptions)
        assumptions, domain = isl.align_two(assumption_non_param, domain)
        domain = domain & assumptions

        from loopy.isl_helpers import is_nonnegative
        num_nonneg = is_nonnegative(expr.numerator, domain)
        den_nonneg = is_nonnegative(expr.denominator, domain)

        result_dtype = self.infer_type(expr)
        suffix = result_dtype.numpy_dtype.type.__name__

        def seen_func(name):
            from loopy.codegen import SeenFunction
            self.codegen_state.seen_functions.add(
                    SeenFunction(
                        name, "%s_%s" % (name, suffix),
                        (result_dtype, result_dtype)))

        if den_nonneg:
            if num_nonneg:
                return op_func(
                        self.rec(expr.numerator, type_context),
                        self.rec(expr.denominator, type_context))
            else:
                seen_func("%s_pos_b" % base_func_name)
                return var("%s_pos_b_%s" % (base_func_name, suffix))(
                        self.rec(expr.numerator, 'i'),
                        self.rec(expr.denominator, 'i'))
        else:
            seen_func(base_func_name)
            return var("%s_%s" % (base_func_name, suffix))(
                    self.rec(expr.numerator, 'i'),
                    self.rec(expr.denominator, 'i'))

예제 #30

0

파일 보기

def to_codegen_result(
        codegen_state, insn_id, domain, check_inames, required_preds, ast):
    # {{{ get bounds check

    chk_domain = isl.Set.from_basic_set(domain)
    chk_domain = chk_domain.remove_redundancies()
    chk_domain = codegen_state.kernel.cache_manager.eliminate_except(chk_domain,
            check_inames, (dim_type.set,))

    chk_domain, implemented_domain = isl.align_two(
            chk_domain, codegen_state.implemented_domain)
    chk_domain = chk_domain.gist(implemented_domain)

    # }}}

    new_implemented_domain = implemented_domain & chk_domain

    if chk_domain.is_empty():
        return None

    condition_exprs = []
    if not chk_domain.plain_is_universe():
        from loopy.symbolic import set_to_cond_expr
        condition_exprs.append(set_to_cond_expr(chk_domain))

    condition_exprs.extend(
            required_preds - codegen_state.implemented_predicates)

    if condition_exprs:
        from pymbolic.primitives import LogicalAnd
        from pymbolic.mapper.stringifier import PREC_NONE
        ast = codegen_state.ast_builder.emit_if(
                codegen_state.expression_to_code_mapper(
                    LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                ast)

    return CodeGenerationResult.new(
            codegen_state, insn_id, ast, new_implemented_domain)

예제 #31

0

파일 보기

파일: __init__.py 프로젝트: cmsquared/loopy

    def combine_domains(self, domains):
        """
        :arg domains: domain indices of domains to be combined. More 'dominant'
            domains (those which get most say on the actual dim_type of an iname)
            must be later in the order.
        """
        assert isinstance(domains, tuple)  # for caching

        if not domains:
            return isl.BasicSet.universe(isl.Space.set_alloc(
                self.isl_context, 0, 0))

        result = None
        for dom_index in domains:
            dom = self.domains[dom_index]
            if result is None:
                result = dom
            else:
                aligned_dom, aligned_result = isl.align_two(
                        dom, result, across_dim_types=True)
                result = aligned_result & aligned_dom

        return result

예제 #32

0

파일 보기

파일: __init__.py 프로젝트: arghdos/loopy

    def combine_domains(self, domains):
        """
        :arg domains: domain indices of domains to be combined. More 'dominant'
            domains (those which get most say on the actual dim_type of an iname)
            must be later in the order.
        """
        assert isinstance(domains, tuple)  # for caching

        if not domains:
            return isl.BasicSet.universe(
                isl.Space.set_alloc(self.isl_context, 0, 0))

        result = None
        for dom_index in domains:
            dom = self.domains[dom_index]
            if result is None:
                result = dom
            else:
                aligned_dom, aligned_result = isl.align_two(
                    dom, result, across_dim_types=True)
                result = aligned_result & aligned_dom

        return result

예제 #33

0

파일 보기

파일: instruction.py 프로젝트: inducer/loopy

def to_codegen_result(
        codegen_state, insn_id, domain, check_inames, required_preds, ast):
    # {{{ get bounds check

    chk_domain = isl.Set.from_basic_set(domain)
    chk_domain = chk_domain.remove_redundancies()
    chk_domain = chk_domain.eliminate_except(check_inames, [dim_type.set])

    chk_domain, implemented_domain = isl.align_two(
            chk_domain, codegen_state.implemented_domain)
    chk_domain = chk_domain.gist(implemented_domain)

    # }}}

    new_implemented_domain = implemented_domain & chk_domain

    if chk_domain.is_empty():
        return None

    condition_exprs = []
    if not chk_domain.plain_is_universe():
        from loopy.symbolic import set_to_cond_expr
        condition_exprs.append(set_to_cond_expr(chk_domain))

    condition_exprs.extend(
            required_preds - codegen_state.implemented_predicates)

    if condition_exprs:
        from pymbolic.primitives import LogicalAnd
        from pymbolic.mapper.stringifier import PREC_NONE
        ast = codegen_state.ast_builder.emit_if(
                codegen_state.expression_to_code_mapper(
                    LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                ast)

    return CodeGenerationResult.new(
            codegen_state, insn_id, ast, new_implemented_domain)

예제 #34

0

파일 보기

파일: symbolic.py 프로젝트: cmsquared/loopy

def get_access_range(domain, subscript, assumptions):
    domain, assumptions = isl.align_two(domain,
            assumptions)
    domain = domain & assumptions
    del assumptions

    dims = len(subscript)

    # we build access_map as a set because (idiocy!) Affs
    # cannot live on maps.

    # dims: [domain](dn)[storage]
    access_map = domain

    if isinstance(access_map, isl.BasicSet):
        access_map = isl.Set.from_basic_set(access_map)

    dn = access_map.dim(dim_type.set)
    access_map = access_map.insert_dims(dim_type.set, dn, dims)

    for idim in range(dims):
        idx_aff = aff_from_expr(access_map.get_space(),
                subscript[idim])
        idx_aff = idx_aff.set_coefficient_val(
                dim_type.in_, dn+idim, -1)

        access_map = access_map.add_constraint(
                isl.Constraint.equality_from_aff(idx_aff))

    access_map_as_map = isl.Map.universe(access_map.get_space())
    access_map_as_map = access_map_as_map.intersect_range(access_map)
    access_map = access_map_as_map.move_dims(
            dim_type.in_, 0,
            dim_type.out, 0, dn)
    del access_map_as_map

    return access_map.range()

예제 #35

0

파일 보기

파일: __init__.py 프로젝트: cmsquared/loopy

    def get_iname_bounds(self, iname, constants_only=False):
        domain = self.get_inames_domain(frozenset([iname]))

        assumptions = self.assumptions.project_out_except(
                set(domain.get_var_dict(dim_type.param)), [dim_type.param])

        aligned_assumptions, domain = isl.align_two(assumptions, domain)

        dom_intersect_assumptions = aligned_assumptions & domain

        if constants_only:
            # Kill all variable dependencies
            dom_intersect_assumptions = dom_intersect_assumptions.project_out_except(
                    [iname], [dim_type.param, dim_type.set])

        iname_idx = dom_intersect_assumptions.get_var_dict()[iname][1]

        lower_bound_pw_aff = (
                self.cache_manager.dim_min(
                    dom_intersect_assumptions, iname_idx)
                .coalesce())
        upper_bound_pw_aff = (
                self.cache_manager.dim_max(
                    dom_intersect_assumptions, iname_idx)
                .coalesce())

        class BoundsRecord(Record):
            pass

        size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
        size = size.gist(assumptions)

        return BoundsRecord(
                lower_bound_pw_aff=lower_bound_pw_aff,
                upper_bound_pw_aff=upper_bound_pw_aff,
                size=size)

예제 #36

0

파일 보기

def set_up_hw_parallel_loops(codegen_state,
                             schedule_index,
                             next_func,
                             hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
                                                   schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            iname for iname in all_inames_by_insns
            if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
            and not kernel.iname_tags_of_type(iname, VectorizeTag)
        ]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname and any(
                _tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname) if _tag))
    ]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
                                       constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname, lower_bound,
                     lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                codegen_state.ast_builder.emit_comment("%s slab for '%s'" %
                                                       (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state.copy_and_assign(
            iname, hw_axis_expr).copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(new_codegen_state, schedule_index,
                                         next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)

예제 #37

0

파일 보기

파일: loop.py 프로젝트: inducer/loopy

def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True,
            across_dim_types=True
            ).params()

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .gist(impl_domain)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space,
                loop_iname,
                impl_lbound,
                impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = (
                codegen_state
                .intersect(impl_loop)
                .copy(kernel=intersect_kernel_with_slab(
                    kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            result.append(
                    inner.with_new_ast(
                        codegen_state,
                        astb.ast_block_scope_class(
                            inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)

예제 #38

0

파일 보기

파일: precompute.py 프로젝트: inducer/loopy

 def add_assumptions(d):
     assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
     assumptions, domain = isl.align_two(assumption_non_param, d)
     return assumptions & domain

예제 #39

0

파일 보기

 def add_assumptions(d):
     assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
     assumptions, domain = isl.align_two(assumption_non_param, d)
     return assumptions & domain

예제 #40

0

파일 보기

파일: loop.py 프로젝트: cmsquared/loopy

def fuse_loop_domains(kernel):
    from loopy.kernel.tools import is_domain_dependent_on_inames

    while True:
        lnm = potential_loop_nest_map(kernel)
        parents_per_domain = kernel.parents_per_domain()
        all_parents_per_domain = kernel.all_parents_per_domain()

        new_domains = None

        for inner_iname, outer_inames in six.iteritems(lnm):
            for outer_iname in outer_inames:
                # {{{ check if it's safe to fuse

                inner_domain_idx = kernel.get_home_domain_index(inner_iname)
                outer_domain_idx = kernel.get_home_domain_index(outer_iname)

                if inner_domain_idx == outer_domain_idx:
                    break

                if (
                        outer_domain_idx in all_parents_per_domain[inner_domain_idx]
                        and not
                        outer_domain_idx == parents_per_domain[inner_domain_idx]):
                    # Outer domain is not a direct parent of the inner
                    # domain. Unable to fuse.
                    continue

                outer_dom = kernel.domains[outer_domain_idx]
                inner_dom = kernel.domains[inner_domain_idx]

                outer_inames = set(outer_dom.get_var_names(isl.dim_type.set))
                if is_domain_dependent_on_inames(kernel, inner_domain_idx,
                        outer_inames):
                    # Bounds of inner domain depend on outer domain.
                    # Unable to fuse.
                    continue

                # }}}

                new_domains = kernel.domains[:]
                min_idx = min(inner_domain_idx, outer_domain_idx)
                max_idx = max(inner_domain_idx, outer_domain_idx)

                del new_domains[max_idx]
                del new_domains[min_idx]

                outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom)

                new_domains.insert(min_idx, inner_dom & outer_dom)
                break

            if new_domains:
                break

        if not new_domains:
            # Nothing was accomplished in the last loop trip, time to quit.
            break

        kernel = kernel.copy(domains=new_domains)

    return kernel

예제 #41

0

파일 보기

파일: check.py 프로젝트: navjotk/loopy

def check_implemented_domains(kernel, implemented_domains, code=None):
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_domain.dim(dim_type.param)))

            lines = []
            for kind, diff_set, gist_domain in [
                    ("implemented, but not desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired, but not implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s: %s" % (kind, ", ".join(point_axes)))
                lines.append(
                        "gist of %s: %s" % (kind, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.compiled import get_highlighted_cl_code
                print(get_highlighted_cl_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain, "\n".join(lines)))

    # placate the assert at the call site
    return True

예제 #42

0

파일 보기

파일: control.py 프로젝트: shigh/loopy

    def build_insn_group(sched_index_info_entries,
                         codegen_state,
                         done_group_lengths=set()):
        """
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with every
            recursive call.  It serves to prevent infinite recursion by preventing
            recursive calls from doing anything about groups that are too small.
        """

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive instances
        # to ignore the hoist group it just found by adding that group length
        # to done_group_length. (It'll also chop the set of schedule indices
        # considered down so that a callee cannot find a *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        current_pred_set = (origin_si_entry.required_predicates -
                            codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(kernel,
                                              codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                current_iname_set
                & sched_index_info_entries[candidate_group_length -
                                           1].admissible_cond_inames)
            current_pred_set = (
                current_pred_set
                & sched_index_info_entries[candidate_group_length -
                                           1].required_predicates)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion base
                # case below
                found_hoists.append(
                    (candidate_group_length, bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        group_length, bounds_checks, pred_checks = max(found_hoists)

        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(
                cns.get_space()).add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                implemented_predicates=new_codegen_state.implemented_predicates
                | pred_checks)

        if is_empty:
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            inner_codegen_state, i)

                        if inner is not None:
                            result.append(inner)

                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                        sched_index_info_entries[0:group_length],
                        inner_codegen_state,
                        done_group_lengths=(done_group_lengths
                                            | set([group_length])))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.symbolic import constraint_to_expr

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):
                    condition_exprs = [
                        constraint_to_expr(cns) for cns in bounds_checks
                    ] + [pred_chk for pred_chk in pred_checks]

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [
                        wrap_in_if(
                            inner_codegen_state, condition_exprs,
                            merge_codegen_results(codegen_state, prev_result))
                    ]

                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:

                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.

                        return gen_code(inner_codegen_state)

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        return result + build_insn_group(
            sched_index_info_entries[group_length:], codegen_state)

예제 #43

0

파일 보기

파일: callable.py 프로젝트: connorjward/loopy

def _inline_call_instruction(caller_knl, callee_knl, call_insn):
    """
    Returns a copy of *caller_knl* with the *call_insn* in the *kernel*
    replaced by inlining *callee_knl* into it within it.

    :arg call_insn: An instance of `loopy.CallInstruction` of the call-site.
    """
    import pymbolic.primitives as prim
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.kernel.data import ValueArg

    # {{{ sanity checks

    assert call_insn.expression.function.name == callee_knl.name

    # }}}

    callee_label = callee_knl.name[:4] + "_"
    vng = caller_knl.get_var_name_generator()
    ing = caller_knl.get_instruction_id_generator()

    # {{{ construct callee->caller name mappings

    # name_map: Mapping[str, str]
    # A mapping from variable names in the callee kernel's namespace to
    # the ones they would be referred by in the caller's namespace post inlining.
    name_map = {}

    # only consider temporary variables and inames, arguments would be mapping
    # according to the invocation in call_insn.
    for name in (callee_knl.all_inames()
                 | set(callee_knl.temporary_variables.keys())):
        new_name = vng(callee_label + name)
        name_map[name] = new_name

    # }}}

    # {{{ iname_to_tags

    # new_inames: caller's inames post inlining
    new_inames = caller_knl.inames

    for old_name, callee_iname in callee_knl.inames.items():
        new_name = name_map[old_name]
        new_inames[new_name] = callee_iname.copy(name=new_name)

    # }}}

    # {{{ register callee's temps as caller's

    # new_temps: caller's temps post inlining
    new_temps = caller_knl.temporary_variables.copy()

    for name, tv in callee_knl.temporary_variables.items():
        new_temps[name_map[name]] = tv.copy(name=name_map[name])

    # }}}

    # {{{ get callee args -> parameters passed to the call

    arg_map = {}  # callee arg name -> caller symbols (e.g. SubArrayRef)

    assignees = call_insn.assignees  # writes
    parameters = call_insn.expression.parameters  # reads

    from loopy.kernel.function_interface import get_kw_pos_association
    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)

    for i, par in enumerate(parameters):
        arg_map[pos_to_kw[i]] = par

    for i, assignee in enumerate(assignees):
        arg_map[pos_to_kw[-i - 1]] = assignee

    # }}}

    # {{{ process domains/assumptions

    # rename inames
    new_domains = callee_knl.domains.copy()
    for old_iname in callee_knl.all_inames():
        new_domains = [
            rename_iname(dom, old_iname, name_map[old_iname])
            for dom in new_domains
        ]

    # realize domains' dim params in terms of caller's variables
    new_assumptions = callee_knl.assumptions
    for callee_arg_name, param_expr in arg_map.items():
        if isinstance(callee_knl.arg_dict[callee_arg_name], ValueArg):
            new_domains = [
                substitute_into_domain(
                    dom, callee_arg_name, param_expr,
                    get_valid_domain_param_names(caller_knl))
                for dom in new_domains
            ]

            new_assumptions = substitute_into_domain(
                new_assumptions, callee_arg_name, param_expr,
                get_valid_domain_param_names(caller_knl))

    # }}}

    # {{{ rename inames/temporaries in the program

    rule_mapping_context = SubstitutionRuleMappingContext(
        callee_knl.substitutions, vng)
    subst_func = make_subst_func({
        old_name: prim.Variable(new_name)
        for old_name, new_name in name_map.items()
    })
    inames_temps_renamer = RuleAwareSubstitutionMapper(
        rule_mapping_context, subst_func, within=lambda *args: True)

    callee_knl = rule_mapping_context.finish_kernel(
        inames_temps_renamer.map_kernel(callee_knl))

    # }}}

    # {{{ map callee's expressions to get expressions after inlining

    rule_mapping_context = SubstitutionRuleMappingContext(
        callee_knl.substitutions, vng)
    smap = KernelArgumentSubstitutor(rule_mapping_context, caller_knl,
                                     callee_knl, arg_map)

    callee_knl = rule_mapping_context.finish_kernel(
        smap.map_kernel(callee_knl))

    # }}}

    # {{{ generate new ids for instructions

    insn_id_map = {}
    for insn in callee_knl.instructions:
        insn_id_map[insn.id] = ing(callee_label + insn.id)

    # }}}

    # {{{ use NoOp to mark the start and end of callee kernel

    from loopy.kernel.instruction import NoOpInstruction

    noop_start = NoOpInstruction(id=ing(callee_label + "_start"),
                                 within_inames=call_insn.within_inames,
                                 depends_on=call_insn.depends_on)
    noop_end = NoOpInstruction(id=call_insn.id,
                               within_inames=call_insn.within_inames,
                               depends_on=frozenset(insn_id_map.values()))

    # }}}

    # {{{ map callee's instruction ids

    inlined_insns = [noop_start]

    for insn in callee_knl.instructions:
        new_within_inames = (frozenset(name_map[iname]
                                       for iname in insn.within_inames)
                             | call_insn.within_inames)
        new_depends_on = (frozenset(insn_id_map[dep]
                                    for dep in insn.depends_on)
                          | {noop_start.id})
        new_no_sync_with = frozenset(
            (insn_id_map[id], scope) for id, scope in insn.no_sync_with)
        new_id = insn_id_map[insn.id]

        if isinstance(insn, Assignment):
            new_atomicity = tuple(
                type(atomicity)(name_map[atomicity.var_name])
                for atomicity in insn.atomicity)
            insn = insn.copy(id=insn_id_map[insn.id],
                             within_inames=new_within_inames,
                             depends_on=new_depends_on,
                             tags=insn.tags | call_insn.tags,
                             atomicity=new_atomicity,
                             no_sync_with=new_no_sync_with)
        else:
            insn = insn.copy(id=new_id,
                             within_inames=new_within_inames,
                             depends_on=new_depends_on,
                             tags=insn.tags | call_insn.tags,
                             no_sync_with=new_no_sync_with)
        inlined_insns.append(insn)

    inlined_insns.append(noop_end)

    # }}}

    # {{{ swap out call_insn with inlined_instructions

    idx = caller_knl.instructions.index(call_insn)
    new_insns = (caller_knl.instructions[:idx] + inlined_insns +
                 caller_knl.instructions[idx + 1:])

    # }}}

    old_assumptions, new_assumptions = isl.align_two(caller_knl.assumptions,
                                                     new_assumptions)

    return caller_knl.copy(instructions=new_insns,
                           temporary_variables=new_temps,
                           domains=caller_knl.domains + new_domains,
                           assumptions=(old_assumptions.params()
                                        & new_assumptions.params()),
                           inames=new_inames)

예제 #44

0

파일 보기

파일: loop.py 프로젝트: connorjward/loopy

def merge_loop_domains(kernel):
    # FIXME: This should be moved to loopy.transforms.iname
    from loopy.kernel.tools import is_domain_dependent_on_inames

    while True:
        lnm = potential_loop_nest_map(kernel)
        parents_per_domain = kernel.parents_per_domain()
        all_parents_per_domain = kernel.all_parents_per_domain()

        iname_to_insns = kernel.iname_to_insns()

        new_domains = None

        for inner_iname, outer_inames in lnm.items():
            for outer_iname in outer_inames:
                # {{{ check if it's safe to merge

                inner_domain_idx = kernel.get_home_domain_index(inner_iname)
                outer_domain_idx = kernel.get_home_domain_index(outer_iname)

                if inner_domain_idx == outer_domain_idx:
                    break

                if (not iname_to_insns[inner_iname]
                        or not iname_to_insns[outer_iname]):
                    # Inames without instructions occur when used in
                    # a SubArrayRef. We don't want monster SubArrayRef domains,
                    # so refuse to merge those.
                    continue

                if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]:
                    # The two inames are imperfectly nested. Domain fusion
                    # might be invalid when the inner loop is empty, leading to
                    # the outer loop also being empty.

                    # FIXME: Not fully correct, does not consider reductions
                    # https://gitlab.tiker.net/inducer/loopy/issues/172
                    continue

                if (outer_domain_idx
                        in all_parents_per_domain[inner_domain_idx]
                        and not outer_domain_idx
                        == parents_per_domain[inner_domain_idx]):
                    # Outer domain is not a direct parent of the inner
                    # domain. Unable to merge.
                    continue

                outer_dom = kernel.domains[outer_domain_idx]
                inner_dom = kernel.domains[inner_domain_idx]

                outer_inames = set(outer_dom.get_var_names(isl.dim_type.set))
                if is_domain_dependent_on_inames(kernel, inner_domain_idx,
                                                 outer_inames):
                    # Bounds of inner domain depend on outer domain.
                    # Unable to merge.
                    continue

                # }}}

                new_domains = kernel.domains[:]
                min_idx = min(inner_domain_idx, outer_domain_idx)
                max_idx = max(inner_domain_idx, outer_domain_idx)

                del new_domains[max_idx]
                del new_domains[min_idx]

                outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom)

                new_domains.insert(min_idx, inner_dom & outer_dom)
                break

            if new_domains:
                break

        if not new_domains:
            # Nothing was accomplished in the last loop trip, time to quit.
            break

        kernel = kernel.copy(domains=new_domains)

    return kernel

예제 #45

0

파일 보기

파일: fusion.py 프로젝트: sailfish009/loopy

def _fuse_two_kernels(knla, knlb):
    from loopy.kernel import KernelState
    if knla.state != KernelState.INITIAL or knlb.state != KernelState.INITIAL:
        raise LoopyError("can only fuse kernels in INITIAL state")

    # {{{ fuse domains

    new_domains = knla.domains[:]

    for dom_b in knlb.domains:
        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
        if i_fuse is None:
            new_domains.append(dom_b)
        else:
            dom_a = new_domains[i_fuse]
            dom_a, dom_b = isl.align_two(dom_a, dom_b)

            shared_inames = list(
                    set(dom_a.get_var_dict(dim_type.set))
                    &
                    set(dom_b.get_var_dict(dim_type.set)))

            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
            dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set])

            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
                raise LoopyError("kernels do not agree on domain of "
                        "inames '%s'" % (",".join(shared_inames)))

            new_domain = dom_a & dom_b

            new_domains[i_fuse] = new_domain

    # }}}

    vng = knla.get_var_name_generator()
    b_var_renames = {}

    # {{{ fuse args

    new_args = knla.args[:]
    for b_arg in knlb.args:
        if b_arg.name not in knla.arg_dict:
            new_arg_name = vng(b_arg.name)

            if new_arg_name != b_arg.name:
                b_var_renames[b_arg.name] = var(new_arg_name)

            new_args.append(b_arg.copy(name=new_arg_name))
        else:
            if b_arg != knla.arg_dict[b_arg.name]:
                raise LoopyError(
                        "argument '{arg_name}' has inconsistent definition between "
                        "the two kernels being merged ({arg_a} <-> {arg_b})"
                        .format(
                            arg_name=b_arg.name,
                            arg_a=str(knla.arg_dict[b_arg.name]),
                            arg_b=str(b_arg)))

    # }}}

    # {{{ fuse temporaries

    new_temporaries = knla.temporary_variables.copy()
    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
        assert b_name == b_tv.name

        new_tv_name = vng(b_name)

        if new_tv_name != b_name:
            b_var_renames[b_name] = var(new_tv_name)

        assert new_tv_name not in new_temporaries
        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)

    # }}}

    knlb = _apply_renames_in_exprs(knlb, b_var_renames)

    from pymbolic.imperative.transform import \
            fuse_instruction_streams_with_unique_ids
    new_instructions, old_b_id_to_new_b_id = \
            fuse_instruction_streams_with_unique_ids(
                    knla.instructions, knlb.instructions)

    # {{{ fuse assumptions

    assump_a = knla.assumptions
    assump_b = knlb.assumptions
    assump_a, assump_b = isl.align_two(assump_a, assump_b)

    shared_param_names = list(
            set(assump_a.get_var_dict(dim_type.set))
            &
            set(assump_b.get_var_dict(dim_type.set)))

    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])

    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
        raise LoopyError("assumptions do not agree on kernels to be merged")

    new_assumptions = (assump_a & assump_b).params()

    # }}}

    from loopy.kernel import LoopKernel
    return LoopKernel(
            domains=new_domains,
            instructions=new_instructions,
            args=new_args,
            name="%s_and_%s" % (knla.name, knlb.name),
            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
            preamble_generators=_ordered_merge_lists(
                knla.preamble_generators, knlb.preamble_generators),
            assumptions=new_assumptions,
            local_sizes=_merge_dicts(
                "local size", knla.local_sizes, knlb.local_sizes),
            temporary_variables=new_temporaries,
            iname_to_tags=_merge_dicts(
                "iname-to-tag mapping",
                knla.iname_to_tags,
                knlb.iname_to_tags),
            substitutions=_merge_dicts(
                "substitution",
                knla.substitutions,
                knlb.substitutions),
            function_manglers=_ordered_merge_lists(
                knla.function_manglers,
                knlb.function_manglers),
            symbol_manglers=_ordered_merge_lists(
                knla.symbol_manglers,
                knlb.symbol_manglers),

            iname_slab_increments=_merge_dicts(
                "iname slab increment",
                knla.iname_slab_increments,
                knlb.iname_slab_increments),
            loop_priority=knla.loop_priority.union(knlb.loop_priority),
            silenced_warnings=_ordered_merge_lists(
                knla.silenced_warnings,
                knlb.silenced_warnings),
            applied_iname_rewrites=_ordered_merge_lists(
                knla.applied_iname_rewrites,
                knlb.applied_iname_rewrites),
            index_dtype=_merge_values(
                "index dtype",
                knla.index_dtype,
                knlb.index_dtype),
            target=_merge_values(
                "target",
                knla.target,
                knlb.target),
            options=knla.options), old_b_id_to_new_b_id

예제 #46

0

파일 보기

파일: __init__.py 프로젝트: inducer/loopy

 def intersect(self, other):
     new_impl, new_other = isl.align_two(self.implemented_domain, other)
     return self.copy(implemented_domain=new_impl & new_other)

예제 #47

0

파일 보기

파일: fusion.py 프로젝트: rckirby/loopy

def _fuse_two_kernels(knla, knlb):
    from loopy.kernel import kernel_state
    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
        raise LoopyError("can only fuse kernels in INITIAL state")

    # {{{ fuse domains

    new_domains = knla.domains[:]

    for dom_b in knlb.domains:
        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
        if i_fuse is None:
            new_domains.append(dom_b)
        else:
            dom_a = new_domains[i_fuse]
            dom_a, dom_b = isl.align_two(dom_a, dom_b)

            shared_inames = list(
                    set(dom_a.get_var_dict(dim_type.set))
                    &
                    set(dom_b.get_var_dict(dim_type.set)))

            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
            dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set])

            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
                raise LoopyError("kernels do not agree on domain of "
                        "inames '%s'" % (",".join(shared_inames)))

            new_domain = dom_a & dom_b

            new_domains[i_fuse] = new_domain

    # }}}

    vng = knla.get_var_name_generator()
    b_var_renames = {}

    # {{{ fuse args

    new_args = knla.args[:]
    for b_arg in knlb.args:
        if b_arg.name not in knla.arg_dict:
            new_arg_name = vng(b_arg.name)

            if new_arg_name != b_arg.name:
                b_var_renames[b_arg.name] = var(new_arg_name)

            new_args.append(b_arg.copy(name=new_arg_name))
        else:
            if b_arg != knla.arg_dict[b_arg.name]:
                raise LoopyError(
                        "argument '%s' has inconsistent definition between "
                        "the two kernels being merged" % b_arg.name)

    # }}}

    # {{{ fuse temporaries

    new_temporaries = knla.temporary_variables.copy()
    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
        assert b_name == b_tv.name

        new_tv_name = vng(b_name)

        if new_tv_name != b_name:
            b_var_renames[b_name] = var(new_tv_name)

        assert new_tv_name not in new_temporaries
        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)

    # }}}

    # {{{ apply renames in kernel b

    from loopy.symbolic import (
            SubstitutionRuleMappingContext,
            RuleAwareSubstitutionMapper)
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.context_matching import parse_stack_match

    srmc = SubstitutionRuleMappingContext(
            knlb.substitutions, knlb.get_var_name_generator())
    subst_map = RuleAwareSubstitutionMapper(
            srmc, make_subst_func(b_var_renames),
            within=parse_stack_match(None))
    knlb = subst_map.map_kernel(knlb)

    # }}}

    # {{{ fuse instructions

    new_instructions = knla.instructions[:]
    from pytools import UniqueNameGenerator
    insn_id_gen = UniqueNameGenerator(
            set([insna.id for insna in new_instructions]))

    knl_b_instructions = []
    old_b_id_to_new_b_id = {}
    for insnb in knlb.instructions:
        old_id = insnb.id
        new_id = insn_id_gen(old_id)
        old_b_id_to_new_b_id[old_id] = new_id

        knl_b_instructions.append(
                insnb.copy(id=new_id))

    for insnb in knl_b_instructions:
        new_instructions.append(
                insnb.copy(
                    insn_deps=frozenset(
                        old_b_id_to_new_b_id[dep_id]
                        for dep_id in insnb.insn_deps)))

    # }}}

    # {{{ fuse assumptions

    assump_a = knla.assumptions
    assump_b = knlb.assumptions
    assump_a, assump_b = isl.align_two(assump_a, assump_b)

    shared_param_names = list(
            set(dom_a.get_var_dict(dim_type.set))
            &
            set(dom_b.get_var_dict(dim_type.set)))

    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])

    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
        raise LoopyError("assumptions do not agree on kernels to be merged")

    new_assumptions = (assump_a & assump_b).params()

    # }}}

    from loopy.kernel import LoopKernel
    return LoopKernel(
            domains=new_domains,
            instructions=new_instructions,
            args=new_args,
            name="%s_and_%s" % (knla.name, knlb.name),
            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
            preamble_generators=_ordered_merge_lists(
                knla.preamble_generators, knlb.preamble_generators),
            assumptions=new_assumptions,
            local_sizes=_merge_dicts(
                "local size", knla.local_sizes, knlb.local_sizes),
            temporary_variables=new_temporaries,
            iname_to_tag=_merge_dicts(
                "iname-to-tag mapping",
                knla.iname_to_tag,
                knlb.iname_to_tag),
            substitutions=_merge_dicts(
                "substitution",
                knla.substitutions,
                knlb.substitutions),
            function_manglers=_ordered_merge_lists(
                knla.function_manglers,
                knlb.function_manglers),
            symbol_manglers=_ordered_merge_lists(
                knla.symbol_manglers,
                knlb.symbol_manglers),

            iname_slab_increments=_merge_dicts(
                "iname slab increment",
                knla.iname_slab_increments,
                knlb.iname_slab_increments),
            loop_priority=_ordered_merge_lists(
                knla.loop_priority,
                knlb.loop_priority),
            silenced_warnings=_ordered_merge_lists(
                knla.silenced_warnings,
                knlb.silenced_warnings),
            applied_iname_rewrites=_ordered_merge_lists(
                knla.applied_iname_rewrites,
                knlb.applied_iname_rewrites),
            index_dtype=_merge_values(
                "index dtype",
                knla.index_dtype,
                knlb.index_dtype),
            target=_merge_values(
                "target",
                knla.target,
                knlb.target),
            options=knla.options)

예제 #48

0

파일 보기

파일: loop.py 프로젝트: navjotk/loopy

def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                       codegen_state, loop_iname, kernel.index_dtype,
                       static_lbound, static_ubound, inner))

    return gen_code_block(result)

예제 #49

0

파일 보기

파일: loop.py 프로젝트: navjotk/loopy

def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [iname
                for iname in kernel.all_inames()
                if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
            and kernel.iname_to_tag.get(other_iname).key == tag.key
            and other_iname != iname]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    from loopy.codegen import add_comment

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, iname)
        if len(slabs) == 1:
            cmt = None

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, hw_inames_left)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)

예제 #50

0

파일 보기

def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(codegen_state.implemented_domain,
                                       dom_and_slab,
                                       obj_bigger_ok=True).params()

        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(dom_and_slab.space,
                                                 loop_iname, impl_lbound,
                                                 impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(dim_type.set,
                                            impl_loop.dim(dim_type.set), dt,
                                            idx, 1)

        new_codegen_state = (codegen_state.intersect(impl_loop).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(codegen_state,
                                      kernel.index_dtype,
                                      loop_iname,
                                      ecm(pw_aff_to_expr(lbound), PREC_NONE,
                                          "i"),
                                      is_const=True),
                astb.emit_blank_line(),
                inner,
            ])
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.ast_block_scope_class(
                        inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(
                            simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(
                            simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)

예제 #51

0

파일 보기

파일: check.py 프로젝트: yueyedeai/loopy

def check_implemented_domains(kernel, implemented_domains, code=None):
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(iname for iname in insn_inames
                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
            insn_impl_domain = insn_impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_impl_domain.dim(dim_type.param)))

            lines = []
            for bigger, smaller, diff_set, gist_domain in [
                    ("implemented", "desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired", "implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.target.execution import get_highlighted_code
                print(get_highlighted_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain, "\n".join(lines)))

    # placate the assert at the call site
    return True

예제 #52

0

파일 보기

 def intersect(self, other):
     new_impl, new_other = isl.align_two(self.implemented_domain, other)
     return self.copy(implemented_domain=new_impl & new_other)

예제 #53

0

파일 보기

파일: loop.py 프로젝트: inducer/loopy

def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [iname for iname in all_inames_by_insns
                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname
            and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                    codegen_state.ast_builder.emit_comment(
                        "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
                .copy_and_assign(iname, hw_axis_expr)
                .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
                new_codegen_state, schedule_index, next_func,
                hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)

예제 #54

0

파일 보기

def precompute(
        kernel,
        subst_use,
        sweep_inames=[],
        within=None,
        storage_axes=None,
        temporary_name=None,
        precompute_inames=None,
        precompute_outer_inames=None,
        storage_axis_to_tag={},

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,
        dtype=None,
        fetch_bounding_box=False,
        temporary_address_space=None,
        compute_insn_id=None,
        **kwargs):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

        The following objects may be given for *subst_use*:

        * The name of the substitution rule.

        * The tagged name ("name$tag") of the substitution rule.

        * A list of invocations of the substitution rule.
          This list of invocations, when swept across *sweep_inames*, then serves
          to define the footprint of the precomputation.

          Invocations may be tagged ("name$tag") to filter out a subset of the
          usage sites of the substitution rule. (Namely those usage sites that
          use the same tagged name.)

          Invocations may be given as a string or as a
          :class:`pymbolic.primitives.Expression` object.

          If only one invocation is to be given, then the only entry of the list
          may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames to be swept.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
        the compute instruction is nested. If *None*, make an educated guess.
        May also be specified as a comma-separated string.

    :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
        inames created to perform the precomputation. The current default will
        make them local axes and automatically split them to fit the work
        group size, but this default will disappear in favor of simply leaving them
        untagged in 2019. For 2018, a warning will be issued if no *default_tag* is
        specified.

    :arg compute_insn_id: The ID of the instruction generated to perform the
        precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # {{{ unify temporary_address_space / temporary_scope

    temporary_scope = kwargs.pop("temporary_scope", None)

    from loopy.kernel.data import AddressSpace
    if temporary_scope is not None:
        from warnings import warn
        warn(
            "temporary_scope is deprecated. Use temporary_address_space instead",
            DeprecationWarning,
            stacklevel=2)

        if temporary_address_space is not None:
            raise LoopyError(
                "may not specify both temporary_address_space and "
                "temporary_scope")

        temporary_address_space = temporary_scope

    del temporary_scope

    # }}}

    if kwargs:
        raise TypeError("unrecognized keyword arguments: %s" %
                        ", ".join(kwargs.keys()))

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname" % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [
            iname.strip() for iname in precompute_inames.split(",")
        ]

    if isinstance(precompute_outer_inames, str):
        precompute_outer_inames = frozenset(
            iname.strip() for iname in precompute_outer_inames.split(","))

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                                 "on rule name and tag")

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    try:
        subst = kernel.substitutions[subst_name]
    except KeyError:
        raise LoopyError("substitution rule '%s' not found" % subst_name)

    c_subst_name = subst_name.replace(".", "_")

    # {{{ handle default_tag

    from loopy.transform.data import _not_provided \
            as transform_data_not_provided

    if default_tag is _not_provided or default_tag is transform_data_not_provided:
        # no need to warn for scalar precomputes
        if sweep_inames:
            from warnings import warn
            warn(
                "Not specifying default_tag is deprecated, and default_tag "
                "will become mandatory in 2019.x. "
                "Pass 'default_tag=\"l.auto\" to match the current default, "
                "or Pass 'default_tag=None to leave the loops untagged, which "
                "is the recommended behavior.",
                DeprecationWarning,
                stacklevel=(

                    # In this case, we came here through add_prefetch. Increase
                    # the stacklevel.
                    3 if default_tag is transform_data_not_provided else 2))

        default_tag = "l.auto"

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    # }}}

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                                 "be substitution rule invocation")

            access_descriptors.append(
                RuleAccessDescriptor(identifier=access_descriptor_id(
                    args, None),
                                     args=args))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(rule_mapping_context, kernel, subst_name,
                                      subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.MultiAssignmentBase):
                for assignee in insn.assignees:
                    invg(assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = (get_dependencies(submap(subst.expression)) -
                    frozenset(subst.arguments)) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: " +
                           ", ".join(extra_storage_axes - value_inames -
                                     expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError(
                    "must specify a sufficient number of "
                    "storage_axes to uniquely determine the meaning "
                    "of the given precompute_inames. (%d storage_axes "
                    "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (set(precompute_inames)
                                         & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                                   "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
        accdesc.copy(storage_axis_exprs=storage_axis_exprs(
            storage_axis_sources, accdesc.args))
        for accdesc in access_descriptors
    ]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(
                        iname) != domch.leaf_domain_index:
                    raise RuntimeError(
                        "sweep iname '%s' is not 'at home' in the "
                        "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                               access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError(
                        "precompute axis %d (1-based) was "
                        "eliminated as "
                        "having length 1 but also mapped to existing "
                        "iname '%s'" % (i + 1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
            iname + "'" for iname in non1_storage_axis_names
        ]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain,
            primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                  dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt,
                                                    dim_idx)

                mod_domain = mod_domain.add_constraint(
                    isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        def add_assumptions(d):
            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
            assumptions, domain = isl.align_two(assumption_non_param, d)
            return assumptions & domain

        # {{{ check that we got the desired domain

        check_domain = add_assumptions(
            check_domain.project_out_except(primed_non1_saxis_names,
                                            [isl.dim_type.set]))

        mod_check_domain = add_assumptions(mod_domain)

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(
                dt, dim_idx, saxis + "'")

        mod_check_domain = mod_check_domain.project_out_except(
            primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                             "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = add_assumptions(
            mod_domain.project_out_except(orig_domain_inames,
                                          [isl.dim_type.set]))

        check_domain = add_assumptions(domch.domain)

        mod_check_domain, check_domain = isl.align_two(mod_check_domain,
                                                       check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError(
                "original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee[tuple(
            var(iname) for iname in non1_storage_axis_names)]

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[prior_storage_axis_name_dict.get(
            arg_name, arg_name)] = arg + bi

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())

    from loopy.match import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
        rule_mapping_context,
        make_subst_func(storage_axis_subst_dict),
        within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name)

    compute_insn = Assignment(
        id=compute_insn_id,
        assignee=assignee,
        expression=compute_expression,
        # within_inames determined below
    )
    compute_dep_id = compute_insn_id
    added_compute_insns = [compute_insn]

    if temporary_address_space == AddressSpace.GLOBAL:
        barrier_insn_id = kernel.make_unique_instruction_id(
            based_on=c_subst_name + "_barrier")
        from loopy.kernel.instruction import BarrierInstruction
        barrier_insn = BarrierInstruction(id=barrier_insn_id,
                                          depends_on=frozenset(
                                              [compute_insn_id]),
                                          synchronization_kind="global",
                                          mem_kind="global")
        compute_dep_id = barrier_insn_id

        added_compute_insns.append(barrier_insn)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    from loopy.symbolic import SubstitutionRuleExpander
    expander = SubstitutionRuleExpander(kernel.substitutions)

    invr = RuleInvocationReplacer(rule_mapping_context,
                                  subst_name,
                                  subst_tag,
                                  within,
                                  access_descriptors,
                                  abm,
                                  storage_axis_names,
                                  storage_axis_sources,
                                  non1_storage_axis_names,
                                  temporary_name,
                                  compute_insn_id,
                                  compute_dep_id,
                                  compute_read_variables=get_dependencies(
                                      expander(compute_expression)))

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(instructions=added_compute_insns +
                         kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ add dependencies to compute insn

    kernel = kernel.copy(instructions=[
        insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) if insn.
        id == compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ propagate storage iname subst to dependencies of compute instructions

    from loopy.kernel.tools import find_recursive_dependencies
    compute_deps = find_recursive_dependencies(kernel,
                                               frozenset([compute_insn_id]))

    # FIXME: Need to verify that there are no outside dependencies
    # on compute_deps

    prior_storage_axis_names = frozenset(storage_axis_subst_dict)

    new_insns = []
    for insn in kernel.instructions:
        if (insn.id in compute_deps
                and insn.within_inames & prior_storage_axis_names):
            insn = (insn.with_transformed_expressions(
                lambda expr: expr_subst_map(expr, kernel, insn)).copy(
                    within_inames=frozenset(
                        storage_axis_subst_dict.get(iname, var(iname)).name
                        for iname in insn.within_inames)))

            new_insns.append(insn)
        else:
            new_insns.append(insn)

    kernel = kernel.copy(instructions=new_insns)

    # }}}

    # {{{ determine inames for compute insn

    if precompute_outer_inames is None:
        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
        precompute_outer_inames = (
            frozenset(non1_storage_axis_names)
            | frozenset((expanding_usage_arg_deps | value_inames) -
                        sweep_inames_set)
            | guess_iname_deps_based_on_var_use(kernel, compute_insn))
    else:
        if not isinstance(precompute_outer_inames, frozenset):
            raise TypeError("precompute_outer_inames must be a frozenset")

        precompute_outer_inames = precompute_outer_inames \
                | frozenset(non1_storage_axis_names)

    kernel = kernel.copy(instructions=[
        insn.copy(within_inames=precompute_outer_inames) if insn.id ==
        compute_insn_id else insn for insn in kernel.instructions
    ])

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is not None:
        dtype = np.dtype(dtype)

    if temporary_address_space is None:
        temporary_address_space = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
            name=temporary_name,
            dtype=dtype,
            base_indices=(0, ) * len(new_temp_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_address_space,
            dim_names=tuple(non1_storage_axis_names))

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                                 "do not match (existing: %s, new: %s)" %
                                 (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0, ) * len(new_temp_shape):
            raise LoopyError(
                "Existing and new temporary '%s' do not "
                "have matching number of dimensions ('%d' vs. '%d') " %
                (temporary_name, len(temp_var.shape), len(new_temp_shape)))

        new_temp_shape = tuple(
            max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_address_space == temp_var.address_space:
            pass
        elif temporary_address_space is lp.auto:
            temporary_address_space = temp_var.address_space
        elif temp_var.address_space is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                             "have matching scopes (existing: %s, new: %s)" %
                             (temporary_name,
                              AddressSpace.stringify(temp_var.address_space),
                              AddressSpace.stringify(temporary_address_space)))

        temp_var = temp_var.copy(address_space=temporary_address_space)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type

    if filter_iname_tags_by_type(new_iname_to_tag.values(),
                                 AutoFitLocalIndexTag):
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel

예제 #55

0

파일 보기

파일: precompute.py 프로젝트: navjotk/loopy

def precompute(kernel, subst_use, sweep_inames=[], within=None,
        storage_axes=None, temporary_name=None, precompute_inames=None,
        storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
        fetch_bounding_box=False, temporary_is_local=None,
        compute_insn_id=None):
    """Precompute the expression described in the substitution rule determined by
    *subst_use* and store it in a temporary array. A precomputation needs two
    things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis ordering
    of the temporary array).

    :arg subst_use: Describes what to prefetch.

    The following objects may be given for *subst_use*:

    * The name of the substitution rule.

    * The tagged name ("name$tag") of the substitution rule.

    * A list of invocations of the substitution rule.
      This list of invocations, when swept across *sweep_inames*, then serves
      to define the footprint of the precomputation.

      Invocations may be tagged ("name$tag") to filter out a subset of the
      usage sites of the substitution rule. (Namely those usage sites that
      use the same tagged name.)

      Invocations may be given as a string or as a
      :class:`pymbolic.primitives.Expression` object.

      If only one invocation is to be given, then the only entry of the list
      may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but not
      within its arguments. A new, dedicated storage axis is allocated for
      such an axis.

    :arg sweep_inames: A :class:`list` of inames and/or rule argument
        names to be swept.
        May also equivalently be a comma-separated string.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its properties
        (such as size, type) are checked (and updated, if possible) to match
        its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.

    :arg compute_insn_id: The ID of the instruction performing the precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
    eliminated.
    """

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [iname.strip() for iname in precompute_inames.split(",")]

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                        "on rule name and tag")

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    subst = kernel.substitutions[subst_name]
    c_subst_name = subst_name.replace(".", "_")

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                        "be substitution rule invocation")

            access_descriptors.append(
                    RuleAccessDescriptor(
                        identifier=access_descriptor_id(args, None),
                        args=args
                        ))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
                kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(
                rule_mapping_context, kernel, subst_name, subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.Assignment):
                invg(insn.assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                    get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = get_dependencies(
            submap(subst.expression)
            ) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: "
                + ", ".join(extra_storage_axes
                    - value_inames - expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError("must specify a sufficient number of "
                        "storage_axes to uniquely determine the meaning "
                        "of the given precompute_inames. (%d storage_axes "
                        "needed)" % len(precompute_inames))
            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (
                set(precompute_inames) & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None
                and i < len(precompute_inames)
                and precompute_inames[i]):
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                        "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                    tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
            accdesc.copy(
                storage_axis_exprs=storage_axis_exprs(
                    storage_axis_sources, accdesc.args))
            for accdesc in access_descriptors]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("sweep iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError("precompute axis %d (1-based) was "
                            "eliminated as "
                            "having length 1 but also mapped to existing "
                            "iname '%s'" % (i+1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
                iname+"'" for iname in non1_storage_axis_names]

        mod_domain = abm.augment_domain_with_sweep(
            domch.domain, primed_non1_saxis_names,
            boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)

                mod_domain = mod_domain.add_constraint(
                        isl.Constraint.equality_from_aff(new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        # {{{ check that we got the desired domain

        check_domain = check_domain.project_out_except(
                primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain = mod_domain

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(dt, dim_idx, saxis+"'")

        mod_check_domain = mod_check_domain.project_out_except(
                primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                    "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = mod_domain.project_out_except(
                orig_domain_inames, [isl.dim_type.set])

        check_domain = domch.domain

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee.index(
                tuple(var(iname) for iname in non1_storage_axis_names))

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            arg = 0

        storage_axis_subst_dict[
                prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())

    from loopy.context_matching import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
            rule_mapping_context,
            make_subst_func(storage_axis_subst_dict),
            within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name)

    compute_insn = Assignment(
            id=compute_insn_id,
            assignee=assignee,
            expression=compute_expression)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    invr = RuleInvocationReplacer(rule_mapping_context,
            subst_name, subst_tag, within,
            access_descriptors, abm,
            storage_axis_names, storage_axis_sources,
            non1_storage_axis_names,
            temporary_name, compute_insn_id)

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(
            instructions=[compute_insn] + kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is None:
        dtype = lp.auto
    else:
        dtype = np.dtype(dtype)

    import loopy as lp

    if temporary_is_local is None:
        temporary_is_local = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
                name=temporary_name,
                dtype=dtype,
                base_indices=(0,)*len(new_temp_shape),
                shape=tuple(abm.non1_storage_shape),
                is_local=temporary_is_local)

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        if temp_var.dtype is lp.auto:
            pass
        elif temp_var.dtype is not lp.auto and dtype is lp.auto:
            dtype = temp_var.dtype
        elif temp_var.dtype is not lp.auto and dtype is not lp.auto:
            if temp_var.dtype != dtype:
                raise LoopyError("Existing and new dtype of temporary '%s' "
                        "do not match (existing: %s, new: %s)"
                        % (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching number of dimensions "
                    % (temporary_name,
                        len(temp_var.shape), len(new_temp_shape)))

        if temp_var.base_indices != (0,) * len(new_temp_shape):
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching number of dimensions "
                    % (temporary_name,
                        len(temp_var.shape), len(new_temp_shape)))

        new_temp_shape = tuple(
                max(i, ex_i)
                for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_is_local == temp_var.is_local:
            pass
        elif temporary_is_local is lp.auto:
            temporary_is_local = temp_var.is_local
        elif temp_var.is_local is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching values of 'is_local'"
                    % (temporary_name,
                        temp_var.is_local, temporary_is_local))

        temp_var = temp_var.copy(is_local=temporary_is_local)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(
            temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag
    has_automatic_axes = any(
            isinstance(tag, AutoFitLocalIndexTag)
            for tag in new_iname_to_tag.values())

    if has_automatic_axes:
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel

예제 #56

0

파일 보기

파일: control.py 프로젝트: navjotk/loopy

    def build_insn_group(sched_index_info_entries, codegen_state,
            done_group_lengths=set()):
        """
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with every
            recursive call.  It serves to prevent infinite recursion by preventing
            recursive calls from doing anything about groups that are too small.
        """

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive instances
        # to ignore the hoist group it just found by adding that group length
        # to done_group_length. (It'll also chop the set of schedule indices
        # considered down so that a callee cannot find a *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        current_pred_set = (origin_si_entry.required_predicates
                - codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(
                kernel, codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                    current_iname_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .admissible_cond_inames)
            current_pred_set = (
                    current_pred_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .required_predicates)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                    current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set
                    or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion base
                # case below
                found_hoists.append((candidate_group_length,
                    bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        group_length, bounds_checks, pred_checks = max(found_hoists)

        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(cns.get_space())
                    .add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                    implemented_predicates=new_codegen_state.implemented_predicates
                    | pred_checks)

        if is_empty:
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            kernel, i, inner_codegen_state)

                        if inner is not None:
                            result.append(inner)

                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                            sched_index_info_entries[0:group_length],
                            inner_codegen_state,
                            done_group_lengths=(
                                done_group_lengths | set([group_length])))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.codegen import wrap_in_if
                from loopy.codegen.bounds import constraint_to_code

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):
                    conditionals = [
                            constraint_to_code(
                                inner_codegen_state.expression_to_code_mapper, cns)
                            for cns in bounds_checks] + list(pred_checks)

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [wrap_in_if(
                             conditionals,
                             gen_code_block(prev_result))]

                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.

                        return gen_code_block(gen_code(inner_codegen_state))

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        return result + build_insn_group(
                sched_index_info_entries[group_length:], codegen_state)