def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        static_lbound, static_ubound, inner):
    """Build a :class:`cgen.For` loop over *iname* with inclusive bounds.

    The loop variable is declared with the ``uniform`` qualifier and the
    type name that ``self.target.dtype_to_typename`` reports for
    *iname_dtype*.

    :arg codegen_state: supplies ``expression_to_code_mapper``, used to
        turn the bound expressions into code strings.
    :arg iname: name of the loop variable.
    :arg static_lbound: affine lower bound, converted via
        :func:`loopy.symbolic.aff_to_expr`.
    :arg static_ubound: affine upper bound (inclusive; the loop condition
        is ``iname <= ubound``).
    :arg inner: loop body handed through to :class:`cgen.For`.
    :returns: the :class:`cgen.For` node.
    """
    to_code = codegen_state.expression_to_code_mapper

    from cgen import For
    from loopy.symbolic import aff_to_expr
    from pymbolic.mapper.stringifier import PREC_NONE

    # NOTE(review): "i" is presumably the integer type context - confirm.
    type_name = self.target.dtype_to_typename(iname_dtype)
    lower_code = to_code(aff_to_expr(static_lbound), PREC_NONE, "i")
    init_clause = "uniform %s %s = %s" % (type_name, iname, lower_code)

    upper_code = to_code(aff_to_expr(static_ubound), PREC_NONE, "i")
    cond_clause = "%s <= %s" % (iname, upper_code)

    step_clause = "++%s" % iname

    return For(init_clause, cond_clause, step_clause, inner)
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        static_lbound, static_ubound, inner):
    """Build a :class:`cgen.For` loop over *iname* with inclusive bounds.

    The loop variable is declared through a ``POD`` declarator wrapped in
    a :class:`cgen.InlineInitializer`; the loop condition is rendered
    from a pymbolic ``Comparison`` node (``iname <= ubound``).

    :arg codegen_state: supplies ``expression_to_code_mapper``.
    :arg iname: name of the loop variable.
    :arg iname_dtype: dtype passed to ``POD`` for the declaration.
    :arg static_lbound: affine lower bound.
    :arg static_ubound: affine upper bound (inclusive).
    :arg inner: loop body handed through to :class:`cgen.For`.
    :returns: the :class:`cgen.For` node.
    """
    to_code = codegen_state.expression_to_code_mapper

    from cgen import For, InlineInitializer
    from loopy.symbolic import aff_to_expr
    from pymbolic import var
    from pymbolic.mapper.stringifier import PREC_NONE
    from pymbolic.primitives import Comparison

    # Initialization clause: "<type> <iname> = <lower bound>".
    loop_var_decl = POD(self, iname_dtype, iname)
    lower_code = to_code(aff_to_expr(static_lbound), PREC_NONE, "i")
    init_clause = InlineInitializer(loop_var_decl, lower_code)

    # Continuation condition, built symbolically and then rendered.
    in_range = Comparison(var(iname), "<=", aff_to_expr(static_ubound))
    cond_clause = to_code(in_range, PREC_NONE, "i")

    step_clause = "++%s" % iname

    return For(init_clause, cond_clause, step_clause, inner)
def simplify_via_aff(expr):
    """Simplify *expr* by converting it to an isl affine form and back.

    The expression's dependencies become the dimensions of a fresh isl
    space; *expr* is turned into an affine expression on that space with
    :func:`loopy.symbolic.aff_from_expr` and converted back with
    :func:`loopy.symbolic.aff_to_expr`.

    :arg expr: a pymbolic expression that is representable as an isl
        affine expression.
    :returns: the expression obtained from the round trip.
    """
    from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies

    # Sort the dependency names: iterating a set directly yields an
    # order that can vary between interpreter runs, which would make
    # the dimension order of the space (and thus the result) unstable.
    dep_names = sorted(get_dependencies(expr))

    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, dep_names)
    return aff_to_expr(aff_from_expr(space, expr))
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        static_lbound, static_ubound, inner):
    """Build a :class:`cgen.For` loop over *iname* with inclusive bounds.

    All three loop clauses are assembled as plain strings. The loop
    variable is declared with the type name that
    ``self.target.dtype_to_typename`` reports for *iname_dtype*.

    :arg codegen_state: supplies ``expression_to_code_mapper``.
    :arg iname: name of the loop variable.
    :arg static_lbound: affine lower bound.
    :arg static_ubound: affine upper bound (inclusive).
    :arg inner: loop body handed through to :class:`cgen.For`.
    :returns: the :class:`cgen.For` node.
    """
    to_code = codegen_state.expression_to_code_mapper

    from cgen import For
    from loopy.symbolic import aff_to_expr
    from pymbolic.mapper.stringifier import PREC_NONE

    def render_bound(bound_aff):
        # Affine bound -> pymbolic expression -> target code string.
        return to_code(aff_to_expr(bound_aff), PREC_NONE, "i")

    type_name = self.target.dtype_to_typename(iname_dtype)
    init_clause = "%s %s = %s" % (
        type_name, iname, render_bound(static_lbound))
    cond_clause = "%s <= %s" % (iname, render_bound(static_ubound))
    step_clause = "++%s" % iname

    return For(init_clause, cond_clause, step_clause, inner)
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        static_lbound, static_ubound, inner):
    """Build a :class:`cgen.For` loop over *iname* with inclusive bounds.

    The loop variable declaration is a ``POD`` declarator wrapped in
    :class:`cgen.ispc.ISPCUniform` and initialized via
    :class:`cgen.Initializer`. The loop condition is rendered from a
    pymbolic comparison node (``iname <= ubound``).

    :arg codegen_state: supplies ``expression_to_code_mapper``.
    :arg iname: name of the loop variable.
    :arg iname_dtype: dtype passed to ``POD`` for the declaration.
    :arg static_lbound: affine lower bound.
    :arg static_ubound: affine upper bound (inclusive).
    :arg inner: loop body handed through to :class:`cgen.For`.
    :returns: the :class:`cgen.For` node.
    """
    to_code = codegen_state.expression_to_code_mapper

    from cgen import For, Initializer
    from cgen.ispc import ISPCUniform
    from loopy.symbolic import aff_to_expr
    from loopy.target.c import POD
    from pymbolic.mapper.stringifier import PREC_NONE

    # Initialization clause: uniform-qualified declaration of the iname.
    uniform_decl = ISPCUniform(POD(self, iname_dtype, iname))
    lower_code = to_code(aff_to_expr(static_lbound), PREC_NONE, "i")
    init_clause = Initializer(uniform_decl, lower_code)

    # Continuation condition, built symbolically and then rendered.
    in_range = p.Comparison(var(iname), "<=", aff_to_expr(static_ubound))
    cond_clause = to_code(in_range, PREC_NONE, "i")

    step_clause = "++%s" % iname

    return For(init_clause, cond_clause, step_clause, inner)
def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
        static_lbound, static_ubound, inner):
    """Build a :class:`genpy.For` loop over *iname* with inclusive bounds.

    The iteration source is the string ``range(<lower>, <upper> + 1)``,
    so the upper bound is included. *iname_dtype* is not used here.

    :arg codegen_state: supplies ``expression_to_code_mapper``.
    :arg iname: name of the loop variable.
    :arg static_lbound: affine lower bound.
    :arg static_ubound: affine upper bound (inclusive).
    :arg inner: loop body handed through to :class:`genpy.For`.
    :returns: the :class:`genpy.For` node.
    """
    to_code = codegen_state.expression_to_code_mapper

    from genpy import For
    from loopy.symbolic import aff_to_expr
    from pymbolic.mapper.stringifier import PREC_NONE

    lower_code = to_code(aff_to_expr(static_lbound), PREC_NONE, "i")
    upper_code = to_code(aff_to_expr(static_ubound), PREC_NONE, "i")

    loop_targets = (iname,)
    iterable_code = "range(%s, %s + 1)" % (lower_code, upper_code)

    return For(loop_targets, iterable_code, inner)
def get_constant_iname_length(self, iname):
    """Return the static maximum of the size of *iname* as an :class:`int`.

    The bounds are requested with ``constants_only=True`` and the static
    maximum of their ``size`` is computed with ``constants_only=True``
    as well.

    :arg iname: name of the iname whose length is wanted.
    :returns: the length as a Python :class:`int`.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import aff_to_expr

    bounds = self.get_iname_bounds(iname, constants_only=True)
    max_size = static_max_of_pw_aff(bounds.size, constants_only=True)

    return int(aff_to_expr(max_size))
def test_aff_to_expr():
    """Print an isl affine expression with nested modulo and its
    :func:`loopy.symbolic.aff_to_expr` conversion.

    This only checks that the conversion runs; no value is asserted.
    """
    space = isl.Space.create_from_names(isl.Context(), ["a", "b"])
    local_space = isl.LocalSpace.from_space(space)
    zero = isl.Aff.zero_on_domain(local_space)
    one = zero.set_constant_val(1)  # noqa

    # Affine expressions that each select one of the two named dims.
    aff_a = zero.set_coefficient_val(isl.dim_type.in_, 0, 1)
    aff_b = zero.set_coefficient_val(isl.dim_type.in_, 1, 1)

    nested_mod = (5 * aff_a + 3 * aff_b) % 17 % 5
    print(nested_mod)

    from loopy.symbolic import aff_to_expr
    print(aff_to_expr(nested_mod))
def test_aff_to_expr():
    """Smoke test: convert ``(5*a + 3*b) % 17 % 5`` from isl to pymbolic.

    Both the isl object and the converted expression are printed; no
    value is asserted.
    """
    ctx = isl.Context()
    space = isl.Space.create_from_names(ctx, ["a", "b"])
    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
    one = zero.set_constant_val(1)  # noqa

    in_dim = isl.dim_type.in_
    first = zero.set_coefficient_val(in_dim, 0, 1)
    second = zero.set_coefficient_val(in_dim, 1, 1)

    aff = (5*first + 3*second) % 17 % 5
    print(aff)

    from loopy.symbolic import aff_to_expr
    converted = aff_to_expr(aff)
    print(converted)
def simplify_via_aff(expr):
    """Simplify *expr* by a round trip through an isl affine expression.

    The sorted dependencies of *expr* become the dimensions of a fresh
    isl space. If the conversion to affine form raises
    :class:`loopy.diagnostic.ExpressionToAffineConversionError`, *expr*
    is returned unchanged.

    :arg expr: a pymbolic expression.
    :returns: the round-tripped expression, or *expr* itself when it
        cannot be converted.
    """
    from loopy.diagnostic import ExpressionToAffineConversionError
    from loopy.symbolic import (
        aff_to_expr, get_dependencies, guarded_aff_from_expr)

    # Sorting fixes the dimension order of the space.
    dep_names = sorted(get_dependencies(expr))

    try:
        space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, dep_names)
        simplified = aff_to_expr(guarded_aff_from_expr(space, expr))
    except ExpressionToAffineConversionError:
        return expr

    return simplified
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """Describe how each non-global temporary is promoted.

    Temporaries whose scope is ``temp_var_scope.GLOBAL`` are skipped. For
    the others, the hardware inames common to all instructions defining
    or using the temporary determine the leading shape of the promoted
    storage; for ``LOCAL``-scope temporaries, inames tagged with
    :class:`loopy.kernel.data.LocalIndexTag` are left out.

    :arg kernel: the kernel holding ``temporary_variables`` and
        ``iname_to_tag``.
    :arg temporaries: iterable of temporary names to consider.
    :arg name_gen: callable producing the name of the promoted temporary
        from the original name.
    :returns: A :class:`dict` mapping temporary names from `temporaries`
        to :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    promoted = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    def tag_sort_key(iname):
        # This takes advantage of the fact that g < l in the alphabet :)
        return str(kernel.iname_to_tag[iname])

    for temp_name in temporaries:
        temp_var = kernel.temporary_variables[temp_name]

        if temp_var.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temp_var.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        accessing_insns = def_lists[temp_var.name] + use_lists[temp_var.name]
        ordered_hw_inames = sorted(
            get_common_hw_inames(kernel, accessing_insns),
            key=tag_sort_key)

        is_local_temp = temp_var.scope == temp_var_scope.LOCAL

        shape_prefix = []
        backing_hw_inames = []
        for hw_iname in ordered_hw_inames:
            has_local_tag = isinstance(
                kernel.iname_to_tag[hw_iname], LocalIndexTag)

            if has_local_tag and is_local_temp:
                # Restrict shape to that of group inames for locals.
                continue

            backing_hw_inames.append(hw_iname)
            iname_size = kernel.get_iname_bounds(hw_iname).size
            shape_prefix.append(
                aff_to_expr(static_max_of_pw_aff(iname_size, False)))

        promoted[temp_var.name] = PromotedTemporary(
            name=name_gen(temp_var.name),
            orig_temporary=temp_var,
            shape_prefix=tuple(shape_prefix),
            hw_inames=backing_hw_inames)

    return promoted
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """Build :class:`PromotedTemporary` descriptions for *temporaries*.

    Global-scope temporaries are ignored. For every other temporary, the
    hardware inames shared by all of its defining and using instructions
    are sorted by the string form of their tag, and the static maximum
    of each retained iname's size becomes one entry of the shape prefix.
    Locally tagged inames are dropped for ``LOCAL``-scope temporaries.

    :arg kernel: the kernel to inspect.
    :arg temporaries: iterable of temporary names.
    :arg name_gen: callable mapping an original temporary name to the
        name used for the promoted temporary.
    :returns: A :class:`dict` mapping temporary names from `temporaries`
        to :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    result = {}
    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    for name in temporaries:
        tv = kernel.temporary_variables[name]

        if tv.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert tv.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        common_hw_inames = get_common_hw_inames(
            kernel, def_lists[tv.name] + use_lists[tv.name])

        # This takes advantage of the fact that g < l in the alphabet :)
        common_hw_inames = sorted(
            common_hw_inames,
            key=lambda iname: str(kernel.iname_to_tag[iname]))

        # Keep every hw iname, except local-tagged ones when the
        # temporary itself lives in local scope.
        backing_hw_inames = []
        for iname in common_hw_inames:
            tagged_local = isinstance(
                kernel.iname_to_tag[iname], LocalIndexTag)
            if not (tagged_local and tv.scope == temp_var_scope.LOCAL):
                backing_hw_inames.append(iname)

        shape_prefix = tuple(
            aff_to_expr(
                static_max_of_pw_aff(
                    kernel.get_iname_bounds(iname).size, False))
            for iname in backing_hw_inames)

        result[tv.name] = PromotedTemporary(
            name=name_gen(tv.name),
            orig_temporary=tv,
            shape_prefix=shape_prefix,
            hw_inames=backing_hw_inames)

    return result
def _get_val_in_bset(bset: isl.BasicSet, idim: int) -> ScalarExpression:
    """
    Return the value of the *idim*-th set dimension of *bset*, expressed
    in terms of its parameter dimensions.

    :raises NotImplementedError: if the value is piecewise, i.e. made up
        of more than one piece.

    .. note::

        Assumes all constraints in *bset* are equality constraints, so
        that the dimension's maximum and minimum coincide.
    """
    from loopy.symbolic import aff_to_expr

    dim_value = bset.dim_max(idim)
    assert dim_value.is_equal(bset.dim_min(idim))

    n_pieces = dim_value.n_piece()
    if n_pieces != 1:
        raise NotImplementedError("Shape inference resulted in a piecewise"
                                  " result.")

    # Exactly one (domain, aff) piece; only the aff is of interest.
    [(_, value_aff)] = dim_value.get_pieces()
    return aff_to_expr(value_aff)
def subst_into_pwaff(new_space, pwaff, subst_dict):
    """
    Returns an instance of :class:`islpy.PwAff` with substitutions from
    *subst_dict* substituted into *pwaff*.

    :arg pwaff: an instance of :class:`islpy.PwAff`
    :arg subst_dict: a mapping from parameters of *pwaff* to
        :class:`pymbolic.primitives.Expression` made up of terms comprising
        the parameters of *new_space*. The expression must be affine in the
        param dims of *new_space*.
    :raises ValueError: if every piece of *pwaff* has an empty domain
        after intersecting with the substitution domain.
    """
    from loopy.symbolic import aff_from_expr, aff_to_expr
    from pymbolic.mapper.substitutor import (
        SubstitutionMapper, make_subst_func)

    # Number of parameters *pwaff* has before the substitution domain is
    # set up; these leading parameters are projected out of each piece.
    n_orig_params = pwaff.dim(dim_type.param)

    pwaff, subst_domain, subst_dict = get_param_subst_domain(
        new_space, pwaff, subst_dict)
    substitute = SubstitutionMapper(make_subst_func(subst_dict))

    combined = None
    for piece_domain, piece_aff in pwaff.get_pieces():
        piece_domain = piece_domain & subst_domain
        if piece_domain.plain_is_empty():
            continue

        piece_domain = piece_domain.project_out(
            dim_type.param, 0, n_orig_params)
        substituted_aff = aff_from_expr(
            piece_domain.space, substitute(aff_to_expr(piece_aff)))
        new_piece = isl.PwAff.alloc(piece_domain, substituted_aff)

        if combined is None:
            combined = new_piece
        else:
            combined = combined.union_add(new_piece)

    if combined is None:
        raise ValueError("no pieces of PwAff survived the substitution")

    return combined.coalesce()
def get_constant_iname_length(self, iname):
    """Compute the length of *iname* as a plain :class:`int`.

    Uses the ``size`` of the bounds obtained with
    ``constants_only=True``, takes its static maximum (again with
    ``constants_only=True``) and converts the result to :class:`int`.

    :arg iname: name of the iname.
    :returns: the length of the iname.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import aff_to_expr

    size_pw_aff = self.get_iname_bounds(iname, constants_only=True).size
    length_expr = aff_to_expr(
        static_max_of_pw_aff(size_pw_aff, constants_only=True))

    return int(length_expr)
def test_aff_to_expr_2():
    """Check :func:`loopy.symbolic.aff_to_expr` on an affine expression
    containing a floor division."""
    from loopy.symbolic import aff_to_expr
    from pymbolic import var

    aff = isl.Aff("[n] -> { [i0] -> [(-i0 + 2*floor((i0)/2))] }")
    converted = aff_to_expr(aff)

    i0 = var("i0")
    expected = (-1) * i0 + 2 * (i0 // 2)

    assert converted == expected
def test_aff_to_expr_2():
    """Converting ``-i0 + 2*floor(i0/2)`` must give the matching
    pymbolic expression using ``//``."""
    from pymbolic import var
    from loopy.symbolic import aff_to_expr

    source_aff = isl.Aff("[n] -> { [i0] -> [(-i0 + 2*floor((i0)/2))] }")

    idx = var("i0")
    result = aff_to_expr(source_aff)

    assert result == (-1)*idx + 2*(idx // 2)
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition`` the loop bounds
    are derived from the iname's domain intersected with the slab and
    the kernel assumptions. The inner loop nest is then built, and
    either a full loop or, when the bounds coincide, a single constant
    initializer is emitted.

    :arg kernel: the kernel being generated; ``kernel.schedule[sched_index]``
        must carry the loop's ``iname``.
    :arg sched_index: index of the loop entry in the schedule.
    :arg codegen_state: current code generation state; supplies
        ``expression_to_code_mapper`` and ``intersect``.
    :returns: the result of ``gen_code_block`` over all slab pieces.
    """
    from cgen import Comment, Const, Initializer, Line, POD
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
        make_slab_from_bound_pwaffs,
        static_max_of_pw_aff,
        static_min_of_pw_aff)
    from loopy.symbolic import aff_to_expr

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
        kernel, loop_iname, sched_index, codegen_state)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label the pieces when there is more than one slab.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(
            domain, slab, across_dim_types=True, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for set_iname in dom_and_slab.get_var_names(dim_type.set):
            if set_iname in usable_inames:
                moved_inames.append(set_iname)
                dt, idx = dom_and_slab.get_var_dict()[set_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
            kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        # Undo the move into parameters performed above.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set),
                dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab restricts the loop being generated, so intersect the
        # kernel using loop_iname rather than whichever name a preceding
        # loop variable happened to be left bound to.
        inner = build_loop_nest(
            intersect_kernel_with_slab(kernel, slab, loop_iname),
            sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            result.append(Comment(cmt))

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(
                    Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))
        else:
            result.append(
                kernel.target.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    static_lbound, static_ubound, inner))

    return gen_code_block(result)
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    For each temporary in the passed list of temporaries, construct a
    :class:`PromotedTemporary` which describes how the temporary should
    get promoted into global storage.

    Temporaries that already have global scope are not included in the
    result.

    :arg kernel: the kernel holding the temporaries.
    :arg temporaries: iterable of temporary names.
    :arg name_gen: callable mapping an original temporary name to the
        name of its promoted counterpart.
    :returns: A :class:`dict` mapping temporary names from `temporaries`
        to :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    promoted_by_name = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    for temp_name in temporaries:
        temp_var = kernel.temporary_variables[temp_name]

        if temp_var.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temp_var.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        # `hw_inames`: The set of hw-parallel tagged inames that this
        # temporary is associated with. This is used for determining the
        # shape of the global storage needed for saving and restoring the
        # temporary across kernel calls.
        #
        # TODO: Make a policy decision about which dimensions to use.
        # Currently, the code looks at each instruction that defines or
        # uses the temporary, and takes the common set of hw-parallel
        # tagged inames associated with these instructions.
        #
        # Furthermore, in the case of local temporaries, inames that are
        # tagged hw-local do not contribute to the global storage shape.
        touching_insns = def_lists[temp_var.name] + use_lists[temp_var.name]
        hw_inames = get_common_hw_inames(kernel, touching_insns)

        # This takes advantage of the fact that g < l in the alphabet :)
        hw_inames = sorted(
            hw_inames,
            key=lambda iname: str(kernel.iname_to_tag[iname]))

        # Calculate the sizes of the dimensions that get added in front
        # for the global storage of the temporary.
        prefix_sizes = []
        backing_hw_inames = []

        for hw_iname in hw_inames:
            iname_tag = kernel.iname_to_tag[hw_iname]

            if (isinstance(iname_tag, LocalIndexTag)
                    and temp_var.scope == temp_var_scope.LOCAL):
                # Restrict shape to that of group inames for locals.
                continue

            backing_hw_inames.append(hw_iname)

            max_size = static_max_of_pw_aff(
                kernel.get_iname_bounds(hw_iname).size, False)
            prefix_sizes.append(aff_to_expr(max_size))

        promoted_by_name[temp_var.name] = PromotedTemporary(
            name=name_gen(temp_var.name),
            orig_temporary=temp_var,
            shape_prefix=tuple(prefix_sizes),
            hw_inames=backing_hw_inames)

    return promoted_by_name
def auto_promote_temporary(self, temporary_name):
    """Describe the global save slot for the temporary *temporary_name*.

    :arg temporary_name: key into ``self.kernel.temporary_variables``.
    :returns: a ``self.PromotedTemporary`` instance, or *None* when the
        temporary has global scope or carries an initializer.
    :raises ValueError: if the temporary has ``base_storage`` set.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    kernel = self.kernel
    temp_var = kernel.temporary_variables[temporary_name]

    if temp_var.scope == temp_var_scope.GLOBAL:
        # Nothing to be done for global temporaries (I hope)
        return None

    if temp_var.initializer is not None:
        # Temporaries with initializers do not need saving/reloading - the
        # code generation takes care of emitting the initializers.
        assert temp_var.read_only
        return None

    if temp_var.base_storage is not None:
        raise ValueError(
            "Cannot promote temporaries with base_storage to global")

    # `hw_inames`: The set of hw-parallel tagged inames that this temporary
    # is associated with. This is used for determining the shape of the
    # global storage needed for saving and restoring the temporary across
    # kernel calls.
    #
    # TODO: Make a policy decision about which dimensions to use. Currently,
    # the code looks at each instruction that defines or uses the temporary,
    # and takes the common set of hw-parallel tagged inames associated with
    # these instructions.
    #
    # Furthermore, in the case of local temporaries, inames that are tagged
    # hw-local do not contribute to the global storage shape.
    accessing_insns = self.insn_query.insns_reading_or_writing(temp_var.name)
    hw_inames = self.insn_query.common_hw_inames(accessing_insns)

    # We want hw_inames to be arranged according to the order:
    #    g.0 < g.1 < ... < l.0 < l.1 < ...
    # Sorting lexicographically accomplishes this.
    hw_inames = sorted(
        hw_inames,
        key=lambda iname: str(kernel.iname_to_tag[iname]))

    # Calculate the sizes of the dimensions that get added in front for
    # the global storage of the temporary.
    hw_dims = []
    backing_hw_inames = []

    for hw_iname in hw_inames:
        iname_tag = kernel.iname_to_tag[hw_iname]

        if (isinstance(iname_tag, LocalIndexTag)
                and temp_var.scope == temp_var_scope.LOCAL):
            # Restrict shape to that of group inames for locals.
            continue

        backing_hw_inames.append(hw_iname)

        max_size = static_max_of_pw_aff(
            kernel.get_iname_bounds(hw_iname).size, False)
        hw_dims.append(aff_to_expr(max_size))

    non_hw_dims = temp_var.shape

    if len(non_hw_dims) == 0 and len(hw_dims) == 0:
        # Scalar not in hardware: ensure at least one dimension.
        non_hw_dims = (1, )

    return self.PromotedTemporary(
        name=self.var_name_gen(temp_var.name + "_save_slot"),
        orig_temporary=temp_var,
        hw_dims=tuple(hw_dims),
        non_hw_dims=non_hw_dims,
        hw_inames=backing_hw_inames)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition`` the loop bounds
    are derived from the iname's domain intersected with the slab and
    the kernel assumptions. The inner loop nest is then built, and
    either a full loop or, when the bounds coincide, a single constant
    initializer is emitted through ``codegen_state.ast_builder``.

    :arg codegen_state: current code generation state; supplies
        ``kernel``, ``expression_to_code_mapper`` and ``ast_builder``.
    :arg sched_index: index of the loop entry in ``kernel.schedule``.
    :returns: the result of ``merge_codegen_results`` over all slab
        pieces.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
        make_slab_from_bound_pwaffs,
        static_max_of_pw_aff,
        static_min_of_pw_aff)
    from loopy.symbolic import aff_to_expr

    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label the pieces when there is more than one slab.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(
            domain, slab, across_dim_types=True, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for set_iname in dom_and_slab.get_var_names(dim_type.set):
            if set_iname in usable_inames:
                moved_inames.append(set_iname)
                dt, idx = dom_and_slab.get_var_dict()[set_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
            kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        # Undo the move into parameters performed above.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set),
                dt, idx, 1)

        # The slab restricts the loop being generated, so intersect the
        # kernel using loop_iname rather than whichever name a preceding
        # loop variable happened to be left bound to.
        new_codegen_state = (
            codegen_state
            .intersect(impl_slab)
            .copy(kernel=intersect_kernel_with_slab(
                kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(
                merge_codegen_results(codegen_state, [
                    astb.emit_initializer(
                        codegen_state, kernel.index_dtype, loop_iname,
                        ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
                        is_const=True),
                    astb.emit_blank_line(),
                    inner,
                    ]))
        else:
            # Wrap the already generated inner AST in the loop construct.
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        static_lbound, static_ubound, inner_ast)))

    return merge_codegen_results(codegen_state, result)
def simplify_via_aff(expr):
    """Simplify *expr* by a round trip through an isl affine expression.

    A fresh isl space is created whose dimensions are the dependencies
    of *expr*; the expression is converted to an affine expression on
    that space and then back to a pymbolic expression.

    :arg expr: a pymbolic expression that
        :func:`loopy.symbolic.aff_from_expr` can convert.
    :returns: the expression resulting from the round trip.
    """
    from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies

    # Use a sorted list rather than raw set iteration order, so the
    # space's dimension order does not change from one run to the next.
    ordered_deps = sorted(get_dependencies(expr))

    aff = aff_from_expr(
        isl.Space.create_from_names(isl.DEFAULT_CONTEXT, ordered_deps),
        expr)
    return aff_to_expr(aff)