def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* by fully unrolling it.

    The loop's iname is taken from ``kernel.schedule[sched_index]``. Its
    trip count must be a constant. For every iteration, the iname is fixed
    to ``lower_bound + i`` in a fresh code generation state and the
    remaining loop nest (starting at ``sched_index + 1``) is generated.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule and iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :returns: the result of :func:`merge_codegen_results` applied to the
        per-iteration results.
    :raises LoopyError: if the length of the loop is not a constant.
    """
    kernel = codegen_state.kernel
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message;
        # otherwise the user sees a literal '%s'.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, adding the iname as
        # context while keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for the loop at *sched_index* by fully unrolling it.

    The loop's iname is taken from ``kernel.schedule[sched_index]``. Its
    trip count must be a constant. For every iteration, the iname is fixed
    to ``lower_bound + i`` in a fresh code generation state and the
    remaining loop nest (starting at ``sched_index + 1``) is generated.

    :arg kernel: the kernel supplying the schedule and iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :arg codegen_state: the current code generation state.
    :returns: the result of :func:`gen_code_block` applied to the list of
        per-iteration results.
    :raises LoopyError: if the length of the loop is not a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message;
        # otherwise the user sees a literal '%s'.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, adding the iname as
        # context while keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index + 1, new_codegen_state))

    return gen_code_block(result)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Vectorization is only performed if the loop length is a constant and
    the loop's lower bound is zero. If either condition fails, a warning
    is issued and the loop is unrolled via :func:`generate_unroll_loop`
    instead.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule, iname bounds and domains.
    :arg sched_index: index into ``kernel.schedule`` of the loop to
        vectorize.
    :returns: the result of :func:`build_loop_nest` for the remaining
        schedule, generated with vectorization info attached to the code
        generation state (or the unrolled result in the fallback cases).
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # Interpolate the iname; the message previously contained a
        # literal '%s'.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        # NOTE(review): argument order here is
        # (kernel, sched_index, codegen_state) -- confirm it matches the
        # generate_unroll_loop definition in use.
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, adding the iname as
        # context while keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        # NOTE(review): see the note on the call above regarding the
        # argument order.
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Vectorization is only performed if the loop length is a constant and
    the loop's lower bound is zero. If either condition fails, a warning
    is issued and the loop is unrolled via :func:`generate_unroll_loop`
    instead.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule, iname bounds and domains.
    :arg sched_index: index into ``kernel.schedule`` of the loop to
        vectorize.
    :returns: the result of :func:`build_loop_nest` for the remaining
        schedule, generated with vectorization info attached to the code
        generation state (or the unrolled result in the fallback cases).
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # Interpolate the iname; the message previously contained a
        # literal '%s'.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, adding the iname as
        # context while keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def base_index_and_length(self, set, iname, context=None):
    """Return a ``(base_index, size)`` pair of expressions for one
    dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: either an integer dimension index, or a name that is
        looked up in ``set.space.get_var_dict()``.
    :arg context: passed through to the static value/min/max helpers.

    The base index is the static value of the dimension's minimum if one
    can be found, and a static minimum of it otherwise. The size is a
    static maximum of ``upper_bound - base_index + 1``.
    """
    if isinstance(iname, int):
        dim_idx = iname
    else:
        dim_idx = set.space.get_var_dict()[iname][1]

    lbound_pw_aff = self.dim_min(set, dim_idx)
    ubound_pw_aff = self.dim_max(set, dim_idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff,
            static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    # First choice: an exact static value for the lower bound.
    try:
        base_aff = static_value_of_pw_aff(
                lbound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    # Second choice: settle for a static lower bound on it.
    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lbound_pw_aff, constants_only=False, context=context)

    base_index = pw_aff_to_expr(base_aff)

    extent_aff = static_max_of_pw_aff(
            ubound_pw_aff - base_aff + 1,
            constants_only=False, context=context)
    size = pw_aff_to_expr(extent_aff)

    return base_index, size
def base_index_and_length(self, set, iname, context=None):
    """Compute the base index and the length of one dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an integer dimension index, or a name resolved through
        ``set.space.get_var_dict()``.
    :arg context: forwarded to the static value/min/max helpers.
    :returns: a tuple ``(base_index, size)`` of expressions. *base_index*
        comes from the static value of the dimension minimum when that
        can be found, and from a static minimum of it otherwise. *size*
        is a static maximum of ``upper_bound - base_index + 1``.
    """
    idx = iname
    if not isinstance(iname, int):
        var_dict = set.space.get_var_dict()
        idx = var_dict[iname][1]

    lower_pw_aff = self.dim_min(set, idx)
    upper_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    static_kwargs = dict(constants_only=False, context=context)

    def extent_relative_to(origin_aff):
        # number of entries between origin_aff and the upper bound,
        # inclusive
        return pw_aff_to_expr(static_max_of_pw_aff(
            upper_pw_aff - origin_aff + 1, **static_kwargs))

    # {{{ first: try to find static lower bound value

    try:
        origin_aff = static_value_of_pw_aff(lower_pw_aff, **static_kwargs)
    except StaticValueFindingError:
        origin_aff = None

    # }}}

    # {{{ if that didn't work, try finding a lower bound

    if origin_aff is None:
        origin_aff = static_min_of_pw_aff(lower_pw_aff, **static_kwargs)

    # }}}

    return pw_aff_to_expr(origin_aff), extent_relative_to(origin_aff)
def base_index_and_length(self, set, iname, context=None):
    """Return ``(base_index, size)`` expressions for one dimension of
    *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an integer dimension index, or a name resolved through
        ``set.space.get_var_dict()``.
    :arg context: forwarded to the static value/max helpers.

    *size* is a static maximum of ``upper_bound - lower_bound + 1``.
    *base_index* is the static value of the lower bound. Any exception
    raised while finding it is re-raised as the same exception type with
    the iname prepended to the message.
    """
    if isinstance(iname, int):
        dim_idx = iname
    else:
        dim_idx = set.space.get_var_dict()[iname][1]

    lbound_pw_aff = self.dim_min(set, dim_idx)
    ubound_pw_aff = self.dim_max(set, dim_idx)

    from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    # The size is computed before the base index, so a failure here
    # surfaces without the "while finding lower bound" wrapper.
    extent_aff = static_max_of_pw_aff(
            ubound_pw_aff - lbound_pw_aff + 1,
            constants_only=False, context=context)
    size = pw_aff_to_expr(extent_aff)

    try:
        base_aff = static_value_of_pw_aff(
                lbound_pw_aff, constants_only=False, context=context)
        base_index = pw_aff_to_expr(base_aff)
    except Exception as e:
        message = "while finding lower bound of '%s': %s" % (iname, str(e))
        raise type(e)(message)

    return base_index, size
def base_index_and_length(self, set, iname, context=None,
        n_allowed_params_in_length=None):
    """Return ``(base_index, length)`` expressions for one dimension of
    *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an integer dimension index, or a name resolved through
        ``set.space.get_var_dict()``.
    :arg context: forwarded to the static value/min/max helpers.
    :arg n_allowed_params_in_length: Simplifies the 'length' result so
        that only the first that many params (in the domain of *set*)
        occur.

    The base index is the static value of the dimension's minimum if one
    can be found, and a static minimum of it otherwise.
    """
    if isinstance(iname, int):
        dim_idx = iname
    else:
        dim_idx = set.space.get_var_dict()[iname][1]

    lbound_pw_aff = self.dim_min(set, dim_idx)
    ubound_pw_aff = self.dim_max(set, dim_idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff,
            static_min_of_pw_aff,
            static_value_of_pw_aff,
            find_max_of_pwaff_with_params)
    from loopy.symbolic import pw_aff_to_expr

    # First choice: an exact static value for the lower bound.
    try:
        base_aff = static_value_of_pw_aff(
                lbound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    # Second choice: settle for a static lower bound on it.
    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lbound_pw_aff, constants_only=False, context=context)

    base_index = pw_aff_to_expr(base_aff)

    # Restrict the params occurring in the length before taking its
    # static maximum.
    length_pw_aff = find_max_of_pwaff_with_params(
            ubound_pw_aff - base_aff + 1,
            n_allowed_params_in_length)
    length = pw_aff_to_expr(
            static_max_of_pw_aff(
                length_pw_aff, constants_only=False, context=context))

    return base_index, length
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join *inames* into a single new iname and return the new kernel.

    :arg kernel: the kernel to transform.
    :arg inames: fastest varying last
    :arg new_iname: name of the joined iname. If *None*, a name is
        generated from the names of *inames* joined by ``"_and_"``.
    :arg tag: if not *None*, the new iname is tagged with it via
        :func:`tag_inames`.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :returns: a copy of *kernel* with the join applied.
    :raises LoopyError: if one of *inames* is not 'at home' in the leaf
        domain of the join.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # Append a fresh set dimension named new_iname to the domain.
    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var
    # Loop-invariant imports, hoisted out of the loop below.
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        # joint_aff accumulates sum(stride_k * iname_k), where stride_k is
        # the product of the lengths of all faster-varying inames.
        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise with the same exception type, adding the iname as
            # context while keeping the original error text.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, str(e)))

        # Expression recovering this iname from the joined iname.
        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            # No modulo for the last (slowest varying) iname.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(),
                    new_iname, "==", joint_aff)))

    for iname in inames:
        # Look the dimension up afresh each time: projecting out a
        # dimension changes the space.
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        # The original inames are only removed from the domain if no
        # *within* restriction was given.
        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        # Replace every joined iname in *fid* by new_iname.
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(
                    insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]
                ))

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join *inames* into a single new iname and return the new kernel.

    :arg kernel: the kernel to transform.
    :arg inames: fastest varying last
    :arg new_iname: name of the joined iname. If *None*, a name is
        generated from the names of *inames* joined by ``"_and_"``.
    :arg tag: if not *None*, the new iname is tagged with it via
        :func:`tag_inames`.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :returns: a copy of *kernel* with the join applied.
    :raises LoopyError: if one of *inames* is not 'at home' in the leaf
        domain of the join.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # Append a fresh set dimension named new_iname to the domain.
    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var
    # Loop-invariant imports, hoisted out of the loop below.
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        # joint_aff accumulates sum(stride_k * iname_k), where stride_k is
        # the product of the lengths of all faster-varying inames.
        joint_aff = joint_aff + base_divisor * iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise with the same exception type, adding the iname as
            # context while keeping the original error text.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, str(e)))

        # Expression recovering this iname from the joined iname.
        my_val = var(new_iname) // base_divisor
        if i + 1 < len(inames):
            # No modulo for the last (slowest varying) iname.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(),
                    new_iname, "==", joint_aff)))

    for iname in inames:
        # Look the dimension up afresh each time: projecting out a
        # dimension changes the space.
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        # The original inames are only removed from the domain if no
        # *within* restriction was given.
        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        # Replace every joined iname in *fid* by new_iname.
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                within_inames=subst_within_inames(insn.within_inames))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]
                ))

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel