def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """Rename the kernel argument *old_name* to *new_name*, updating
    instruction expressions, the argument list, and any domain dimensions
    that carry the old name.

    .. versionadded:: 2016.2
    """
    var_name_gen = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    if var_name_gen.is_name_conflicting(new_name) and not existing_ok:
        raise LoopyError(
                "argument name '%s' conflicts with an existing identifier"
                "--cannot rename" % new_name)

    # {{{ rewrite instruction expressions: old_name -> new_name

    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.symbolic import (RuleAwareSubstitutionMapper,
            SubstitutionRuleMappingContext)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    subst_mapper = RuleAwareSubstitutionMapper(
            rule_mapping_context,
            make_subst_func({old_name: var(new_name)}),
            within=lambda kernel, insn, stack: True)
    kernel = rule_mapping_context.finish_kernel(subst_mapper.map_kernel(kernel))

    # }}}

    # {{{ rename the argument itself

    updated_args = [
            arg.copy(name=new_name) if arg.name == old_name else arg
            for arg in kernel.args]

    # }}}

    # {{{ rename matching dimensions in the domains

    updated_domains = []
    for dom in kernel.domains:
        var_dict = dom.get_var_dict()
        if old_name in var_dict:
            dt, pos = var_dict[old_name]
            dom = dom.set_dim_name(dt, pos, new_name)
        updated_domains.append(dom)

    # }}}

    return kernel.copy(domains=updated_domains, args=updated_args)
def simplify_indices(kernel):
    """Return a copy of *kernel* whose index expressions have been
    simplified via :func:`loopy.symbolic.simplify_using_aff`.
    """
    from loopy.symbolic import SubstitutionRuleMappingContext as SRMC

    rule_mapping_context = SRMC(kernel.substitutions,
                                kernel.get_var_name_generator())
    simplifier = IndexSimplifier(rule_mapping_context, kernel)
    simplified_kernel = simplifier.map_kernel(kernel)
    return rule_mapping_context.finish_kernel(simplified_kernel)
def _unresolve_callables(kernel, callables_table):
    """Turn resolved callables in *kernel* back into unresolved ones and
    reset the kernel to :attr:`KernelState.INITIAL`.
    """
    from loopy.kernel import KernelState
    from loopy.symbolic import SubstitutionRuleMappingContext

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    unresolver = _CallablesUnresolver(
            rule_mapping_context, callables_table, kernel.target)
    unresolved_kernel = rule_mapping_context.finish_kernel(
            unresolver.map_kernel(kernel))
    return unresolved_kernel.copy(state=KernelState.INITIAL)
def rename_resolved_functions_in_a_single_kernel(kernel, renaming_dict):
    """Return a copy of *kernel* in which every instance of
    :class:`ResolvedFunction` is renamed according to *renaming_dict*.
    """
    from loopy.symbolic import SubstitutionRuleMappingContext

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    renamer = ResolvedFunctionRenamer(rule_mapping_context, renaming_dict)
    return rule_mapping_context.finish_kernel(renamer.map_kernel(kernel))
def rename_callable(program, old_name, new_name=None, existing_ok=False):
    """
    :arg program: An instance of :class:`loopy.TranslationUnit`
    :arg old_name: The callable to be renamed
    :arg new_name: New name for the callable to be renamed
    :arg existing_ok: An instance of :class:`bool`
    """
    from pymbolic import var
    from loopy.symbolic import (RuleAwareSubstitutionMapper,
            SubstitutionRuleMappingContext)

    assert isinstance(program, TranslationUnit)
    assert isinstance(old_name, str)

    if (new_name in program.callables_table) and not existing_ok:
        raise LoopyError(f"callables named '{new_name}' already exists")

    if new_name is None:
        # pick a fresh name based on the old one
        namegen = UniqueNameGenerator(program.callables_table.keys())
        new_name = namegen(old_name)

    assert isinstance(new_name, str)

    new_callables_table = {}

    for clbl_name, clbl in program.callables_table.items():
        if clbl_name == old_name:
            clbl_name = new_name

        if isinstance(clbl, CallableKernel):
            # rewrite references to the old callable inside the subkernel
            subknl = clbl.subkernel
            rule_mapping_context = SubstitutionRuleMappingContext(
                    subknl.substitutions, subknl.get_var_name_generator())
            subst_mapper = RuleAwareSubstitutionMapper(
                    rule_mapping_context,
                    {var(old_name): var(new_name)}.get,
                    within=lambda *args: True)
            subknl = rule_mapping_context.finish_kernel(
                    subst_mapper.map_kernel(subknl))
            clbl = clbl.copy(subkernel=subknl.copy(name=clbl_name))
        elif isinstance(clbl, ScalarCallable):
            pass
        else:
            raise NotImplementedError(f"{type(clbl)}")

        new_callables_table[clbl_name] = clbl

    new_entrypoints = program.entrypoints.copy()
    if old_name in new_entrypoints:
        # keep the entrypoint set consistent with the rename
        new_entrypoints = ((new_entrypoints | frozenset([new_name]))
                - frozenset([old_name]))

    return program.copy(callables_table=new_callables_table,
                        entrypoints=new_entrypoints)
def expand_subst(kernel, within=None):
    """Expand substitution-rule invocations in *kernel* matched by *within*."""
    logger.debug("%s: expand subst" % kernel.name)

    from loopy.context_matching import parse_stack_match
    from loopy.symbolic import RuleAwareSubstitutionRuleExpander

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    expander = RuleAwareSubstitutionRuleExpander(
            rule_mapping_context, kernel.substitutions,
            parse_stack_match(within))

    return rule_mapping_context.finish_kernel(expander.map_kernel(kernel))
def expand_subst(kernel, within=None):
    """Expand substitution-rule invocations in *kernel* matched by *within*."""
    # Nothing to do for a kernel without substitution rules.
    if not kernel.substitutions:
        return kernel

    logger.debug("%s: expand subst" % kernel.name)

    from loopy.match import parse_stack_match
    from loopy.symbolic import RuleAwareSubstitutionRuleExpander

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    expander = RuleAwareSubstitutionRuleExpander(
            rule_mapping_context, kernel.substitutions,
            parse_stack_match(within))

    return rule_mapping_context.finish_kernel(expander.map_kernel(kernel))
def _split_reduction(kernel, inames, direction, within=None):
    """Split reductions over *inames* into nested reductions, with *inames*
    moved to the inner ("in") or outer ("out") reduction per *direction*.
    """
    if direction not in ["in", "out"]:
        raise ValueError("invalid value for 'direction': %s" % direction)

    if isinstance(inames, str):
        inames = inames.split(",")
    inames = set(inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    splitter = _ReductionSplitter(rule_mapping_context,
            within, inames, direction)
    return rule_mapping_context.finish_kernel(splitter.map_kernel(kernel))
def _split_reduction(kernel, inames, direction, within=None):
    """Split reductions over *inames* into nested reductions, with *inames*
    moved to the inner ("in") or outer ("out") reduction per *direction*.
    """
    if direction not in ["in", "out"]:
        raise ValueError("invalid value for 'direction': %s" % direction)

    if isinstance(inames, str):
        inames = inames.split(",")
    inames = set(inames)

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    splitter = _ReductionSplitter(rule_mapping_context,
            within, inames, direction)
    return rule_mapping_context.finish_kernel(splitter.map_kernel(kernel))
def make_reduction_inames_unique(kernel, inames=None, within=None):
    """
    :arg inames: if not *None*, only apply to these inames
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.

    .. versionadded:: 2016.2
    """
    name_gen = kernel.get_var_name_generator()

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # {{{ rewrite reductions so each reduction iname occurs only once

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, name_gen)
    uniquifier = _ReductionInameUniquifier(rule_mapping_context,
            inames, within=within)
    kernel = rule_mapping_context.finish_kernel(
            uniquifier.map_kernel(kernel))

    # }}}

    # {{{ duplicate the domain axes for each renamed iname

    from loopy.kernel.tools import DomainChanger
    from loopy.isl_helpers import duplicate_axes

    for old_iname, new_iname in uniquifier.old_to_new:
        domch = DomainChanger(kernel, frozenset([old_iname]))
        kernel = kernel.copy(domains=domch.get_domains_with(
            duplicate_axes(domch.domain, [old_iname], [new_iname])))

    # }}}

    return kernel
def expand_subst(kernel, within=None):
    """
    Returns an instance of :class:`loopy.LoopKernel` with the substitutions
    referenced in instructions of *kernel* matched by *within* expanded.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    """
    # Fast path: no substitution rules means nothing to expand.
    if not kernel.substitutions:
        return kernel

    logger.debug("%s: expand subst" % kernel.name)

    from loopy.match import parse_stack_match
    from loopy.symbolic import RuleAwareSubstitutionRuleExpander

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    expander = RuleAwareSubstitutionRuleExpander(
            rule_mapping_context, kernel.substitutions,
            parse_stack_match(within))

    return rule_mapping_context.finish_kernel(expander.map_kernel(kernel))
def expand_subst(kernel, within=None):
    """
    Returns an instance of :class:`loopy.LoopKernel` with the substitutions
    referenced in instructions of *kernel* matched by *within* expanded.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    """
    if not kernel.substitutions:
        # No rules to expand -- return the kernel unchanged.
        return kernel

    logger.debug("%s: expand subst" % kernel.name)

    from loopy.match import parse_stack_match
    from loopy.symbolic import RuleAwareSubstitutionRuleExpander

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    expander = RuleAwareSubstitutionRuleExpander(
            rule_mapping_context, kernel.substitutions,
            parse_stack_match(within))

    return rule_mapping_context.finish_kernel(expander.map_kernel(kernel))
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Fuse the loops over *inames* into a single loop over *new_iname*.

    :arg inames: fastest varying last
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    """
    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # Add a fresh set dimension named new_iname to the leaf domain.
    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    # Build joint_aff = sum_i base_divisor_i * iname_i (mixed-radix encoding)
    # and, inversely, a substitution expressing each old iname in terms of
    # new_iname via floor-division/modulo by the accumulated divisors.
    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (
                static_max_of_pw_aff, static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # NOTE(review): re-raising type(e) with a fresh message drops the
            # original error text and chain -- consider `raise ... from e`.
            raise type(e)("while finding lower bound of '%s': " % iname)

        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            # All but the slowest-varying iname wrap around at their length.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    # Constrain new_iname == joint_aff in the domain.
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==",
                    joint_aff)))

    # Project out the old inames (only when applying globally).
    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        # Replace any joined iname in an instruction's forced iname deps
        # with the new joint iname.
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(
                    insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]
                ))

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    # Rewrite all matched uses of the old inames in expressions.
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
        force_retain_argument=False):
    """Extract an assignment (to a temporary variable or an argument)
    as a :ref:`substitution-rule`. The temporary may be an array, in
    which case the array indices will become arguments to the substitution
    rule.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg force_retain_argument: If True and if *lhs_name* is an argument, it
        is kept even if it is no longer referenced.

    This operation will change all usage sites of *lhs_name* matched by
    *within*. If there are further usage sites of *lhs_name*, then the
    original assignment to *lhs_name* as well as the temporary variable
    is left in place.
    """
    if isinstance(extra_arguments, str):
        extra_arguments = tuple(s.strip() for s in extra_arguments.split(","))

    # {{{ establish the relevant definition of lhs_name for each usage site

    dep_kernel = expand_subst(kernel)
    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    dep_kernel = apply_single_writer_depencency_heuristic(dep_kernel)

    id_to_insn = dep_kernel.id_to_insn

    def get_relevant_definition_insn_id(usage_insn_id):
        # Walk the dependency graph backwards from *usage_insn_id* to find
        # the single instruction writing *lhs_name*; returns None if no
        # writer is reachable.
        insn = id_to_insn[usage_insn_id]

        def_id = set()
        for dep_id in insn.depends_on:
            dep_insn = id_to_insn[dep_id]
            if lhs_name in dep_insn.write_dependency_names():
                if lhs_name in dep_insn.read_dependency_names():
                    raise LoopyError(
                            "instruction '%s' both reads *and* "
                            "writes '%s'--cannot transcribe to substitution "
                            "rule" % (dep_id, lhs_name))

                def_id.add(dep_id)
            else:
                rec_result = get_relevant_definition_insn_id(dep_id)
                if rec_result is not None:
                    def_id.add(rec_result)

        if len(def_id) > 1:
            raise LoopyError(
                    "more than one write to '%s' found in "
                    "depdendencies of '%s'--definition cannot be resolved "
                    "(writer instructions ids: %s)"
                    % (lhs_name, usage_insn_id, ", ".join(def_id)))

        if not def_id:
            return None
        else:
            def_id, = def_id

        return def_id

    usage_to_definition = {}

    for insn in dep_kernel.instructions:
        if lhs_name not in insn.read_dependency_names():
            continue

        def_id = get_relevant_definition_insn_id(insn.id)
        if def_id is None:
            raise LoopyError("no write to '%s' found in dependency tree "
                    "of '%s'--definition cannot be resolved"
                    % (lhs_name, insn.id))

        usage_to_definition[insn.id] = def_id

    definition_insn_ids = set()
    for insn in kernel.instructions:
        if lhs_name in insn.write_dependency_names():
            definition_insn_ids.add(insn.id)

    # }}}

    if not definition_insn_ids:
        raise LoopyError("no assignments to variable '%s' found"
                % lhs_name)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # Replace matched reads of lhs_name with calls to the new rules.
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    tts = AssignmentToSubstChanger(rule_mapping_context,
            lhs_name, definition_insn_ids,
            usage_to_definition, extra_arguments, within)

    kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel))

    from loopy.kernel.data import SubstitutionRule

    # {{{ create new substitution rules

    new_substs = kernel.substitutions.copy()
    for def_id, subst_name in six.iteritems(
            tts.definition_insn_id_to_subst_name):
        def_insn = kernel.id_to_insn[def_id]

        from loopy.kernel.data import Assignment
        assert isinstance(def_insn, Assignment)

        from pymbolic.primitives import Variable, Subscript
        if isinstance(def_insn.assignee, Subscript):
            # Array write: the subscript indices become rule arguments.
            indices = def_insn.assignee.index_tuple
        elif isinstance(def_insn.assignee, Variable):
            indices = ()
        else:
            raise LoopyError("Unrecognized LHS type: %s"
                    % type(def_insn.assignee).__name__)

        arguments = []

        for i in indices:
            if not isinstance(i, Variable):
                raise LoopyError("In defining instruction '%s': "
                        "asignee index '%s' is not a plain variable. "
                        "Perhaps use loopy.affine_map_inames() "
                        "to perform substitution." % (def_id, i))

            arguments.append(i.name)

        new_substs[subst_name] = SubstitutionRule(
                name=subst_name,
                arguments=tuple(arguments) + extra_arguments,
                expression=def_insn.expression)

    # }}}

    # {{{ delete temporary variable if possible

    # (copied below if modified)
    new_temp_vars = kernel.temporary_variables
    new_args = kernel.args

    if lhs_name in kernel.temporary_variables:
        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
            # All usage sites matched--they're now substitution rules.
            # We can get rid of the variable.
            new_temp_vars = new_temp_vars.copy()
            del new_temp_vars[lhs_name]

    if lhs_name in kernel.arg_dict and not force_retain_argument:
        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
            # All usage sites matched--they're now substitution rules.
            # We can get rid of the argument
            new_args = new_args[:]
            for i in range(len(new_args)):
                if new_args[i].name == lhs_name:
                    del new_args[i]
                    break

    # }}}

    # Remove the original defining instructions whose usage sites were all
    # converted to rule invocations.
    import loopy as lp
    kernel = lp.remove_instructions(
            kernel,
            set(insn_id
                for insn_id, still_used in six.iteritems(
                    tts.saw_unmatched_usage_sites)
                if not still_used))

    return kernel.copy(
            substitutions=new_substs,
            temporary_variables=new_temp_vars,
            args=new_args,
            )
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"):
    """Split axis *axis_nr* of array *array_name* into an inner axis of
    length *count* and an outer axis, adjusting the array's shape,
    dim tags, dim names, and every access expression accordingly.

    *order* controls where the outer axis is inserted: after the inner
    axis for "F", before it for "C".
    """
    if count == 1:
        # Splitting by 1 is a no-op.
        return kernel

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    achng = ArrayChanger(kernel, array_name)
    ary = achng.get()

    from pytools import div_ceil

    # {{{ adjust shape

    new_shape = ary.shape
    if new_shape is not None:
        new_shape = list(new_shape)
        axis_len = new_shape[axis_nr]
        new_shape[axis_nr] = count
        # Outer axis covers the remaining extent, rounded up.
        outer_len = div_ceil(axis_len, count)

        if order == "F":
            new_shape.insert(axis_nr + 1, outer_len)
        elif order == "C":
            new_shape.insert(axis_nr, outer_len)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_shape = tuple(new_shape)

    # }}}

    # {{{ adjust dim tags

    if ary.dim_tags is None:
        raise RuntimeError("dim_tags of '%s' are not known" % array_name)
    new_dim_tags = list(ary.dim_tags)

    old_dim_tag = ary.dim_tags[axis_nr]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                % (axis_nr, array_name))

    old_stride = old_dim_tag.stride
    # Outer axis steps over *count* inner elements at a time.
    outer_stride = count * old_stride

    if order == "F":
        new_dim_tags.insert(axis_nr + 1, FixedStrideArrayDimTag(outer_stride))
    elif order == "C":
        new_dim_tags.insert(axis_nr, FixedStrideArrayDimTag(outer_stride))
    else:
        raise RuntimeError("order '%s' not understood" % order)

    new_dim_tags = tuple(new_dim_tags)

    # }}}

    # {{{ adjust dim_names

    new_dim_names = ary.dim_names
    if new_dim_names is not None:
        new_dim_names = list(new_dim_names)
        existing_name = new_dim_names[axis_nr]
        new_dim_names[axis_nr] = existing_name + "_inner"
        outer_name = existing_name + "_outer"

        if order == "F":
            new_dim_names.insert(axis_nr + 1, outer_name)
        elif order == "C":
            new_dim_names.insert(axis_nr, outer_name)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_dim_names = tuple(new_dim_names)

    # }}}

    kernel = achng.with_changed_array(ary.copy(shape=new_shape,
            dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # Rewrite a subscript of the array: the original index along
        # axis_nr is decomposed into (index % count, index // count).
        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        from loopy.symbolic import simplify_using_aff
        inner_index = simplify_using_aff(kernel, axis_idx % count)
        outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis_nr + 1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            {array_name}, split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    return kernel
def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
        force_retain_argument=False):
    """Extract an assignment (to a temporary variable or an argument)
    as a :ref:`substitution-rule`. The temporary may be an array, in
    which case the array indices will become arguments to the substitution
    rule.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg force_retain_argument: If True and if *lhs_name* is an argument, it
        is kept even if it is no longer referenced.

    This operation will change all usage sites of *lhs_name* matched by
    *within*. If there are further usage sites of *lhs_name*, then the
    original assignment to *lhs_name* as well as the temporary variable
    is left in place.
    """
    if isinstance(extra_arguments, str):
        extra_arguments = tuple(s.strip() for s in extra_arguments.split(","))

    # {{{ establish the relevant definition of lhs_name for each usage site

    dep_kernel = expand_subst(kernel)
    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    dep_kernel = apply_single_writer_depencency_heuristic(dep_kernel)

    id_to_insn = dep_kernel.id_to_insn

    def get_relevant_definition_insn_id(usage_insn_id):
        # Recursively search the dependencies of *usage_insn_id* for the
        # unique writer of *lhs_name*; returns None if none is found.
        insn = id_to_insn[usage_insn_id]

        def_id = set()
        for dep_id in insn.depends_on:
            dep_insn = id_to_insn[dep_id]
            if lhs_name in dep_insn.write_dependency_names():
                if lhs_name in dep_insn.read_dependency_names():
                    raise LoopyError("instruction '%s' both reads *and* "
                            "writes '%s'--cannot transcribe to substitution "
                            "rule" % (dep_id, lhs_name))

                def_id.add(dep_id)
            else:
                rec_result = get_relevant_definition_insn_id(dep_id)
                if rec_result is not None:
                    def_id.add(rec_result)

        if len(def_id) > 1:
            raise LoopyError("more than one write to '%s' found in "
                    "depdendencies of '%s'--definition cannot be resolved "
                    "(writer instructions ids: %s)"
                    % (lhs_name, usage_insn_id, ", ".join(def_id)))

        if not def_id:
            return None
        else:
            def_id, = def_id

        return def_id

    usage_to_definition = {}

    for insn in dep_kernel.instructions:
        if lhs_name not in insn.read_dependency_names():
            continue

        def_id = get_relevant_definition_insn_id(insn.id)
        if def_id is None:
            raise LoopyError("no write to '%s' found in dependency tree "
                    "of '%s'--definition cannot be resolved"
                    % (lhs_name, insn.id))

        usage_to_definition[insn.id] = def_id

    definition_insn_ids = set()
    for insn in kernel.instructions:
        if lhs_name in insn.write_dependency_names():
            definition_insn_ids.add(insn.id)

    # }}}

    if not definition_insn_ids:
        raise LoopyError("no assignments to variable '%s' found"
                % lhs_name)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # Convert matched reads of lhs_name into substitution-rule invocations.
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    tts = AssignmentToSubstChanger(rule_mapping_context,
            lhs_name, definition_insn_ids,
            usage_to_definition, extra_arguments, within)

    kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel))

    from loopy.kernel.data import SubstitutionRule

    # {{{ create new substitution rules

    new_substs = kernel.substitutions.copy()
    for def_id, subst_name in six.iteritems(
            tts.definition_insn_id_to_subst_name):
        def_insn = kernel.id_to_insn[def_id]

        from loopy.kernel.data import Assignment
        assert isinstance(def_insn, Assignment)

        from pymbolic.primitives import Variable, Subscript
        if isinstance(def_insn.assignee, Subscript):
            # Array write: subscript indices become the rule's arguments.
            indices = def_insn.assignee.index_tuple
        elif isinstance(def_insn.assignee, Variable):
            indices = ()
        else:
            raise LoopyError(
                    "Unrecognized LHS type: %s"
                    % type(def_insn.assignee).__name__)

        arguments = []

        for i in indices:
            if not isinstance(i, Variable):
                raise LoopyError("In defining instruction '%s': "
                        "asignee index '%s' is not a plain variable. "
                        "Perhaps use loopy.affine_map_inames() "
                        "to perform substitution." % (def_id, i))

            arguments.append(i.name)

        new_substs[subst_name] = SubstitutionRule(
                name=subst_name,
                arguments=tuple(arguments) + extra_arguments,
                expression=def_insn.expression)

    # }}}

    # {{{ delete temporary variable if possible

    # (copied below if modified)
    new_temp_vars = kernel.temporary_variables
    new_args = kernel.args

    if lhs_name in kernel.temporary_variables:
        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
            # All usage sites matched--they're now substitution rules.
            # We can get rid of the variable.
            new_temp_vars = new_temp_vars.copy()
            del new_temp_vars[lhs_name]

    if lhs_name in kernel.arg_dict and not force_retain_argument:
        if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
            # All usage sites matched--they're now substitution rules.
            # We can get rid of the argument
            new_args = new_args[:]
            for i in range(len(new_args)):
                if new_args[i].name == lhs_name:
                    del new_args[i]
                    break

    # }}}

    # Drop defining instructions whose every usage site was converted.
    import loopy as lp
    kernel = lp.remove_instructions(
            kernel,
            set(
                insn_id
                for insn_id, still_used in six.iteritems(
                    tts.saw_unmatched_usage_sites)
                if not still_used))

    return kernel.copy(
            substitutions=new_substs,
            temporary_variables=new_temp_vars,
            args=new_args,
            )
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Fuse the loops over *inames* into a single loop over *new_iname*.

    :arg inames: fastest varying last
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    """
    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # Add a fresh set dimension named new_iname to the leaf domain.
    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    # Build joint_aff = sum_i base_divisor_i * iname_i (mixed-radix encoding)
    # and, inversely, a substitution expressing each old iname in terms of
    # new_iname via floor-division/modulo by the accumulated divisors.
    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor * iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (static_max_of_pw_aff,
                static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(
            pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # NOTE(review): re-raising type(e) with a fresh message drops the
            # original error text and chain -- consider `raise ... from e`.
            raise type(e)("while finding lower bound of '%s': " % iname)

        my_val = var(new_iname) // base_divisor
        if i + 1 < len(inames):
            # All but the slowest-varying iname wrap around at their length.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    # Constrain new_iname == joint_aff in the domain.
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==",
                    joint_aff)))

    # Project out the old inames (only when applying globally).
    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        # Replace any joined iname in an instruction's within_inames set
        # with the new joint iname.
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
        insn.copy(within_inames=subst_within_inames(insn.within_inames))
        for insn in kernel.instructions
    ]

    kernel = (kernel.copy(
        instructions=new_insns,
        domains=domch.get_domains_with(new_domain),
        applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]))

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # Rewrite all matched uses of the old inames in expressions.
    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
class DifferentiationContext(object):
    """Accumulates the state needed to differentiate a kernel with respect
    to *by_name*: newly generated arguments, temporaries, instructions,
    and domains, plus a rule-mapping context for expression rewriting.
    """

    def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix,
            additional_shape):
        self.kernel = kernel
        self.by_name = by_name
        self.diff_iname_prefix = diff_iname_prefix
        # Extra axes appended to differentiated variables.
        self.additional_shape = additional_shape

        self.imported_outputs = set()
        self.output_to_diff_output = {}

        self.generate_instruction_id = \
                self.kernel.get_instruction_id_generator()

        self.new_args = []
        self.new_temporary_variables = {}
        self.new_instructions = []
        self.imported_instructions = set()
        self.new_domains = []

        self.rule_mapping_context = SubstitutionRuleMappingContext(
                kernel.substitutions, var_name_gen)

    def get_new_kernel(self):
        """Assemble and return the differentiated kernel from the
        accumulated args, temporaries, instructions, and domains.
        """
        knl = self.kernel

        new_args = knl.args + self.new_args
        new_temp_vars = knl.temporary_variables.copy()
        new_temp_vars.update(self.new_temporary_variables)

        knl = knl.copy(
                args=new_args,
                temporary_variables=new_temp_vars,
                instructions=self.new_instructions,
                domains=knl.domains + self.new_domains)

        del new_args
        del new_temp_vars

        knl = self.rule_mapping_context.finish_kernel(knl)

        return knl

    # {{{ kernel gen entrypoints

    def add_diff_inames(self):
        # Create one fresh iname per additional axis and a domain
        # bounding each by the corresponding entry of additional_shape.
        diff_inames = tuple(
                self.rule_mapping_context.make_unique_var_name(
                    self.diff_iname_prefix+str(i))
                for i in range(len(self.additional_shape)))

        diff_parameters = set()
        from loopy.symbolic import get_dependencies
        for s in self.additional_shape:
            diff_parameters.update(get_dependencies(s))

        diff_domain = isl.BasicSet(
                "[%s] -> {[%s]}"
                % (", ".join(diff_parameters), ", ".join(diff_inames)))

        for i, diff_iname in enumerate(diff_inames):
            diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, self.additional_shape[i])

        self.new_domains.append(diff_domain)

        return diff_inames

    # }}}

    def import_instruction_and_deps(self, insn_id):
        """Copy instruction *insn_id* and (recursively) its dependencies
        into the new kernel, at most once each.
        """
        if insn_id in self.imported_instructions:
            return

        insn = self.kernel.id_to_insn[insn_id]
        self.new_instructions.append(insn)
        self.imported_instructions.add(insn_id)

        id_map = RuleAwareIdentityMapper(self.rule_mapping_context)

        if isinstance(insn, lp.Assignment):
            # Traverse the expression so any substitution rules it uses
            # are registered with the rule mapping context.
            id_map(insn.expression, self.kernel, insn)
        else:
            raise RuntimeError("do not know how to deal with "
                    "instruction of type %s" % type(insn))

        for dep in insn.insn_deps:
            self.import_instruction_and_deps(dep)

    def import_output_var(self, var_name):
        """Import the (single) instruction writing *var_name*, if any."""
        writers = self.kernel.writer_map().get(var_name, [])

        if len(writers) > 1:
            raise LoopyError("%s is written in more than one place"
                    % var_name)

        if not writers:
            return

        insn_id, = writers
        self.import_instruction_and_deps(insn_id)

    def get_diff_var(self, var_name):
        """
        :return: a string containing the name of a new variable
            holding the derivative of *var_name* by the desired
            *diff_context.by_name*, or *None* if no dependency exists.
        """
        new_var_name = self.rule_mapping_context.make_unique_var_name(
                var_name + "_d" + self.by_name)

        writers = self.kernel.writer_map().get(var_name, [])

        if not writers:
            # FIXME: There should be hooks to supply earlier dvar_dby
            # This would be the spot to think about them.
            return None

        if len(writers) > 1:
            raise LoopyError("%s is written in more than one place"
                    % var_name)

        orig_writer_id, = writers
        orig_writer_insn = self.kernel.id_to_insn[orig_writer_id]

        diff_inames = self.add_diff_inames()
        diff_iname_exprs = tuple(var(diname) for diname in diff_inames)

        # {{{ write code

        diff_mapper = LoopyDiffMapper(self.rule_mapping_context, self,
                diff_inames)

        diff_expr = diff_mapper(orig_writer_insn.expression,
                self.kernel, orig_writer_insn)

        if not diff_expr:
            # Derivative is identically zero -- no new variable needed.
            return None

        (_, lhs_ind), = orig_writer_insn.assignees_and_indices()
        new_insn_id = self.generate_instruction_id()
        insn = lp.Assignment(
                id=new_insn_id,
                assignee=var(new_var_name)[
                    lhs_ind + diff_iname_exprs],
                expression=diff_expr)

        self.new_instructions.append(insn)

        # }}}

        # {{{ manage variable declaration

        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            orig_shape = arg.shape

        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            orig_shape = tv.shape

        else:
            raise ValueError("%s: variable not found" % var_name)

        # Derivative variable gets the original shape plus the diff axes.
        shape = orig_shape + self.additional_shape
        dim_tags = ("c",) * len(shape)

        if var_name in self.kernel.arg_dict:
            self.new_args.append(
                lp.GlobalArg(
                    new_var_name,
                    arg.dtype,
                    shape=shape,
                    dim_tags=dim_tags,
                    ))

        elif var_name in self.kernel.temporary_variables:
            self.new_temporary_variables[new_var_name] = \
                    lp.TemporaryVariable(
                            new_var_name,
                            tv.dtype,
                            shape=shape,
                            dim_tags=dim_tags)

        # }}}

        return new_var_name
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_is_local=None, fetch_bounding_box=False):
    """Create a temporary buffer for *var_name*, read it at entry, apply all
    matched writes to the buffer, and write the buffer back afterwards.

    :arg init_expression: Either *None* (indicating the prior value of the
        buffered array should be read) or an expression optionally involving
        the variable 'base' (which references the associated location in the
        array being buffered).
    :arg store_expression: Either *None* or an expression involving
        variables 'base' and 'buffer' (without array indices).
    """

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [
                s.strip() for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_is_local is None:
        import loopy as lp
        temporary_is_local = lp.auto

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    # Gather one access descriptor per matched write to var_name; inames
    # used in the subscripts (but not swept over) become "within" inames.
    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        for assignee, index in insn.assignees_and_indices():
            if assignee == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for axis_nr in range(len(var_shape)):
        init_iname = var_name_gen("%s_init_%d" % (var_name, axis_nr))
        store_iname = var_name_gen("%s_store_%d" % (var_name, axis_nr))

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will
            # end up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if (kernel.get_home_domain_index(iname)
                        != domch.leaf_domain_index):
                    raise RuntimeError(
                            "buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Only keep init/store inames for storage axes of length > 1.
        for axis_nr in range(len(var_shape)):
            if abm.non1_storage_axis_flags[axis_nr]:
                non1_init_inames.append(init_inames[axis_nr])
                non1_store_inames.append(store_inames[axis_nr])
            else:
                del new_iname_to_tag[init_inames[axis_nr]]
                del new_iname_to_tag[store_inames[axis_nr]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                new_domain, non1_init_inames,
                boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                new_domain, non1_store_inames,
                boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name + "_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    new_temporary_variables[buf_var_name] = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,) * len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            is_local=temporary_is_local)

    # }}}

    buf_var = var(buf_var_name)

    def buffered_subscript(non1_inames):
        # Subscript into the original array for each storage axis:
        # the storage base index, offset by the matching new iname for
        # axes of length > 1.
        subscript = []
        iname_idx = 0
        if var_shape:
            for axis_nr in range(len(var_shape)):
                ax_subscript = abm.storage_base_indices[axis_nr]
                if abm.non1_storage_axis_flags[axis_nr]:
                    ax_subscript += var(non1_inames[iname_idx])
                    iname_idx += 1
                subscript.append(ax_subscript)
        return subscript

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = buffered_subscript(non1_init_inames)
    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        init_expression = SubstitutionMapper(
                make_subst_func({"base": init_base}))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(
            based_on="init_" + var_name)
    from loopy.kernel.data import ExpressionInstruction
    init_instruction = ExpressionInstruction(
            id=init_insn_id,
            assignee=buf_var_init,
            expression=init_expression,
            forced_iname_deps=frozenset(within_inames),
            insn_deps=frozenset(),
            insn_deps_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    # A store-back is only needed if some rewritten instruction writes
    # the buffer.
    did_write = any(
            assignee_name == buf_var_name
            for insn_id in aar.modified_insn_ids
            for assignee_name, _ in
                kernel.id_to_insn[insn_id].assignees_and_indices())

    # {{{ add init_insn_id to insn_deps

    new_insns = []

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        insn_deps=(
                            (insn.insn_deps or frozenset())
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = buffered_subscript(non1_store_inames)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    store_instruction = ExpressionInstruction(
            id=kernel.make_unique_instruction_id(
                based_on="store_" + var_name),
            insn_deps=frozenset(aar.modified_insn_ids),
            assignee=store_target,
            expression=store_expression,
            forced_iname_deps=frozenset(within_inames))

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    return kernel
def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
        tags=None):
    """Duplicate each iname in *inames*, rewriting its uses inside the
    instructions matched by *within* to refer to the duplicate.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg new_inames: a list of names for the duplicates, parallel to
        *inames*. Entries may be *None*, in which case a name is derived
        from the old iname (and *suffix*, if given).
    :arg suffix: appended to an old iname to form the duplicate's name
        when no explicit new name is supplied.
    :arg tags: a mapping from old iname names to tags to apply to the
        corresponding duplicated inames.
    """

    # Fix: the previous version used a mutable default argument (tags={}).
    if tags is None:
        tags = {}

    # {{{ normalize arguments, find unique new_inames

    if isinstance(inames, str):
        inames = [iname.strip() for iname in inames.split(",")]

    if isinstance(new_inames, str):
        new_inames = [iname.strip() for iname in new_inames.split(",")]

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if new_inames is None:
        new_inames = [None] * len(inames)

    if len(new_inames) != len(inames):
        raise ValueError(
                "new_inames must have the same number of entries as inames")

    name_gen = knl.get_var_name_generator()

    for i, iname in enumerate(inames):
        new_iname = new_inames[i]

        if new_iname is None:
            new_iname = iname

            if suffix is not None:
                new_iname += suffix

            new_iname = name_gen(new_iname)

        else:
            if name_gen.is_name_conflicting(new_iname):
                raise ValueError("new iname '%s' conflicts with existing names"
                        % new_iname)

            name_gen.add_name(new_iname)

        new_inames[i] = new_iname

    # }}}

    # {{{ duplicate the inames

    for old_iname, new_iname in zip(inames, new_inames):
        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(knl, frozenset([old_iname]))

        from loopy.isl_helpers import duplicate_axes
        knl = knl.copy(domains=domch.get_domains_with(
            duplicate_axes(domch.domain, [old_iname], [new_iname])))

    # }}}

    # {{{ change the inames in the code

    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, name_gen)
    indup = _InameDuplicator(rule_mapping_context,
            old_to_new=dict(list(zip(inames, new_inames))),
            within=within)

    knl = rule_mapping_context.finish_kernel(indup.map_kernel(knl))

    # }}}

    # {{{ realize tags

    for old_iname, new_iname in zip(inames, new_inames):
        new_tag = tags.get(old_iname)
        if new_tag is not None:
            knl = tag_inames(knl, {new_iname: new_tag})

    # }}}

    return knl
def affine_map_inames(kernel, old_inames, new_inames, equations):
    """Return a new *kernel* where the affine transform
    specified by *equations* has been applied to the inames.

    :arg old_inames: A list of inames to be replaced by affine transforms
        of their values.
        May also be a string of comma-separated inames.

    :arg new_inames: A list of new inames that are not yet used in *kernel*,
        but have their values established in terms of *old_inames* by
        *equations*.
        May also be a string of comma-separated inames.
    :arg equations: A list of equations establishing a relationship
        between *old_inames* and *new_inames*. Each equation may be
        a tuple ``(lhs, rhs)`` of expressions or a string, with left and
        right hand side of the equation separated by ``=``.
    """

    # {{{ check and parse arguments

    if isinstance(new_inames, str):
        new_inames = new_inames.split(",")
        new_inames = [iname.strip() for iname in new_inames]
    if isinstance(old_inames, str):
        old_inames = old_inames.split(",")
        old_inames = [iname.strip() for iname in old_inames]
    if isinstance(equations, str):
        equations = [equations]

    import re
    eqn_re = re.compile(r"^([^=]+)=([^=]+)$")

    def parse_equation(eqn):
        """Normalize one equation (string or 2-tuple) to an (lhs, rhs)
        tuple of expressions."""
        if isinstance(eqn, str):
            eqn_match = eqn_re.match(eqn)
            if not eqn_match:
                raise ValueError("invalid equation: %s" % eqn)

            from loopy.symbolic import parse
            lhs = parse(eqn_match.group(1))
            rhs = parse(eqn_match.group(2))
            return (lhs, rhs)
        elif isinstance(eqn, tuple):
            if len(eqn) != 2:
                raise ValueError("unexpected length of equation tuple, "
                        "got %d, should be 2" % len(eqn))
            return eqn
        else:
            # Fix: this previously used "%d" with type(eqn).__name__ (a
            # string), so raising the intended ValueError crashed with a
            # TypeError instead; a separating space was also missing.
            raise ValueError("unexpected type of equation, "
                    "got %s, should be string or tuple"
                    % type(eqn).__name__)

    equations = [parse_equation(eqn) for eqn in equations]

    all_vars = kernel.all_variable_names()
    for iname in new_inames:
        if iname in all_vars:
            raise LoopyError("new iname '%s' is already used in kernel"
                    % iname)

    for iname in old_inames:
        if iname not in kernel.all_inames():
            raise LoopyError("old iname '%s' not known" % iname)

    # }}}

    # {{{ substitute iname use

    from pymbolic.algorithm import solve_affine_equations_for
    old_inames_to_expr = solve_affine_equations_for(old_inames, equations)

    subst_dict = dict(
            (v.name, expr)
            for v, expr in old_inames_to_expr.items())

    var_name_gen = kernel.get_var_name_generator()

    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.context_matching import parse_stack_match

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context,
            make_subst_func(subst_dict), within=parse_stack_match(None))

    kernel = (
            rule_mapping_context.finish_kernel(
                old_to_new.map_kernel(kernel))
            .copy(
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]))

    # }}}

    # {{{ change domains

    new_inames_set = set(new_inames)
    old_inames_set = set(old_inames)

    new_domains = []
    for idom, dom in enumerate(kernel.domains):
        dom_var_dict = dom.get_var_dict()
        old_iname_overlap = [
                iname
                for iname in old_inames
                if iname in dom_var_dict]

        if not old_iname_overlap:
            new_domains.append(dom)
            continue

        from loopy.symbolic import get_dependencies
        dom_new_inames = set()
        dom_old_inames = set()

        # mapping for new inames to dim_types
        new_iname_dim_types = {}

        dom_equations = []
        for iname in old_iname_overlap:
            for ieqn, (lhs, rhs) in enumerate(equations):
                eqn_deps = get_dependencies(lhs) | get_dependencies(rhs)
                if iname in eqn_deps:
                    dom_new_inames.update(eqn_deps & new_inames_set)
                    dom_old_inames.update(eqn_deps & old_inames_set)

                if dom_old_inames:
                    dom_equations.append((lhs, rhs))

                this_eqn_old_iname_dim_types = set(
                        dom_var_dict[old_iname][0]
                        for old_iname in eqn_deps & old_inames_set)

                if this_eqn_old_iname_dim_types:
                    if len(this_eqn_old_iname_dim_types) > 1:
                        raise ValueError("inames '%s' (from equation %d "
                                "(0-based)) in domain %d (0-based) are not "
                                "of a uniform dim_type"
                                % (", ".join(eqn_deps & old_inames_set),
                                    ieqn, idom))

                    this_eqn_new_iname_dim_type, = \
                            this_eqn_old_iname_dim_types

                    for new_iname in eqn_deps & new_inames_set:
                        if new_iname in new_iname_dim_types:
                            if (this_eqn_new_iname_dim_type
                                    != new_iname_dim_types[new_iname]):
                                raise ValueError(
                                        "dim_type disagreement for "
                                        "iname '%s' (from equation %d "
                                        "(0-based)) in domain %d (0-based)"
                                        % (new_iname, ieqn, idom))
                        else:
                            new_iname_dim_types[new_iname] = \
                                    this_eqn_new_iname_dim_type

        if not dom_old_inames <= set(dom_var_dict):
            raise ValueError("domain %d (0-based) does not know about "
                    "all old inames (specifically '%s') needed to define new "
                    "inames"
                    % (idom, ", ".join(dom_old_inames - set(dom_var_dict))))

        # add inames to domain with correct dim_types
        dom_new_inames = list(dom_new_inames)
        for iname in dom_new_inames:
            dt = new_iname_dim_types[iname]
            iname_idx = dom.dim(dt)
            dom = dom.add_dims(dt, 1)
            dom = dom.set_dim_name(dt, iname_idx, iname)

        # add equations
        from loopy.symbolic import aff_from_expr
        for lhs, rhs in dom_equations:
            dom = dom.add_constraint(
                    isl.Constraint.equality_from_aff(
                        aff_from_expr(dom.space, rhs - lhs)))

        # project out old inames
        for iname in dom_old_inames:
            dt, idx = dom.get_var_dict()[iname]
            dom = dom.project_out(dt, idx, 1)

        new_domains.append(dom)

    # }}}

    return kernel.copy(domains=new_domains)
def fix_parameters(kernel, within=None, **value_dict):
    """Fix the values of the arguments to specific constants.

    *value_dict* consists of *name*/*value* pairs, where *name* will be fixed
    to be *value*. *name* may refer to :ref:`domain-parameters` or
    :ref:`arguments`.
    """

    if not value_dict:
        return kernel

    def fix_name_in_set(s, name, value):
        # Constrain "name == value" in the set *s*, then project out the
        # (now fully determined) dimension. Sets that do not know about
        # *name* are returned unchanged.
        try:
            dt, idx = s.get_var_dict()[name]
        except KeyError:
            return s

        value_aff = isl.Aff.zero_on_domain(s.space) + value

        from loopy.isl_helpers import iname_rel_aff
        name_equal_value_aff = iname_rel_aff(s.space, name, "==", value_aff)

        return (s
                .add_constraint(
                    isl.Constraint.equality_from_aff(name_equal_value_aff))
                .project_out(dt, idx, 1))

    def fix_all_names(s):
        for name, value in value_dict.items():
            s = fix_name_in_set(s, name, value)
        return s

    new_domains = [fix_all_names(dom) for dom in kernel.domains]

    from pymbolic.mapper.substitutor import make_subst_func
    subst_func = make_subst_func(value_dict)

    from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper
    substitute = SubstitutionMapper(subst_func)
    partial_eval = PartialEvaluationMapper()

    def map_expr(expr):
        return partial_eval(substitute(expr))

    from loopy.kernel.array import ArrayBase
    new_args = []
    for arg in kernel.args:
        if arg.name in value_dict:
            # the argument's value is now a fixed constant--drop it
            continue

        if isinstance(arg, ArrayBase):
            new_args.append(arg.map_exprs(map_expr))
        else:
            new_args.append(arg)

    new_temp_vars = {
            tv.name: tv.map_exprs(map_expr)
            for tv in kernel.temporary_variables.values()}

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    esubst_map = RuleAwareSubstitutionMapper(rule_mapping_context,
            subst_func, within=within)
    return (rule_mapping_context.finish_kernel(
                esubst_map.map_kernel(kernel, within=within))
            .copy(
                domains=new_domains,
                args=new_args,
                temporary_variables=new_temp_vars,
                assumptions=fix_all_names(kernel.assumptions)))
def link_inames(knl, inames, new_iname, within=None, tag=None):
    """Fuse the (identically-constrained) *inames* into a single
    *new_iname*, substituting *new_iname* for each of them in the code."""

    # {{{ normalize arguments

    if isinstance(inames, str):
        inames = inames.split(",")

    var_name_gen = knl.get_var_name_generator()
    new_iname = var_name_gen(new_iname)

    # }}}

    # {{{ ensure that each iname is used at most once in each instruction

    inames_set = set(inames)

    if 0:
        # FIXME!
        for insn in knl.instructions:
            insn_inames = knl.insn_inames(insn.id) | insn.reduction_inames()

            if len(insn_inames & inames_set) > 1:
                raise LoopyError("To-be-linked inames '%s' are used in "
                        "instruction '%s'. No more than one such iname can "
                        "be used in one instruction."
                        % (", ".join(insn_inames & inames_set), insn.id))

    # }}}

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(knl, tuple(inames))

    # {{{ ensure that projections are identical

    unrelated_dom_inames = list(
            set(domch.domain.get_var_names(dim_type.set)) - inames_set)

    domain = domch.domain

    # move all inames to be linked to end to prevent shuffly confusion
    for iname in inames:
        dt, index = domain.get_var_dict()[iname]
        assert dt == dim_type.set

        # move to tail of param dim_type
        domain = domain.move_dims(
                dim_type.param, domain.dim(dim_type.param),
                dt, index, 1)
        # move to tail of set dim_type
        domain = domain.move_dims(
                dim_type.set, domain.dim(dim_type.set),
                dim_type.param, domain.dim(dim_type.param) - 1, 1)

    projections = [
            domch.domain.project_out_except(
                unrelated_dom_inames + [iname], [dim_type.set])
            for iname in inames]

    # All projections must agree (i.e. be mutually contained) for the
    # linking to be sound.
    first_proj = projections[0]
    all_equal = all(
            proj <= first_proj and first_proj <= proj
            for proj in projections[1:])

    if not all_equal:
        raise LoopyError("Inames cannot be linked because their domain "
                "constraints are not the same.")

    del domain  # messed up for testing, do not use

    # }}}

    # change the domain
    from loopy.isl_helpers import duplicate_axes
    knl = knl.copy(
            domains=domch.get_domains_with(
                duplicate_axes(domch.domain, [inames[0]], [new_iname])))

    # {{{ change the code

    from pymbolic import var
    subst_dict = {iname: var(new_iname) for iname in inames}

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, var_name_gen)
    ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
            make_subst_func(subst_dict), within)

    knl = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(knl))

    # }}}

    knl = remove_unused_inames(knl, inames)

    if tag is not None:
        knl = tag_inames(knl, {new_iname: tag})

    return knl
def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
    """Rename *old_iname* to *new_iname* within the instructions matched by
    *within*.

    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :arg existing_ok: execute even if *new_iname* already exists
    """

    var_name_gen = knl.get_var_name_generator()

    # Fix: validate old_iname up front (as the sibling version of this
    # function does) so a typo fails with a clear error instead of a
    # confusing failure deep inside duplicate_inames.
    if old_iname not in knl.all_inames():
        raise LoopyError("old iname '%s' does not exist" % old_iname)

    does_exist = var_name_gen.is_name_conflicting(new_iname)

    if does_exist and not existing_ok:
        raise ValueError("iname '%s' conflicts with an existing identifier"
                "--cannot rename" % new_iname)

    if does_exist:
        # {{{ check that the domains match up

        dom = knl.get_inames_domain(frozenset((old_iname, new_iname)))
        var_dict = dom.get_var_dict()
        _, old_idx = var_dict[old_iname]
        _, new_idx = var_dict[new_iname]

        par_idx = dom.dim(dim_type.param)
        dom_old = dom.move_dims(
                dim_type.param, par_idx,
                dim_type.set, old_idx, 1)
        dom_old = dom_old.move_dims(
                dim_type.set, dom_old.dim(dim_type.set),
                dim_type.param, par_idx, 1)
        dom_old = dom_old.project_out(
                dim_type.set,
                new_idx if new_idx < old_idx else new_idx - 1, 1)

        par_idx = dom.dim(dim_type.param)
        dom_new = dom.move_dims(
                dim_type.param, par_idx,
                dim_type.set, new_idx, 1)
        dom_new = dom_new.move_dims(
                dim_type.set, dom_new.dim(dim_type.set),
                dim_type.param, par_idx, 1)
        dom_new = dom_new.project_out(
                dim_type.set,
                old_idx if old_idx < new_idx else old_idx - 1, 1)

        if not (dom_old <= dom_new and dom_new <= dom_old):
            raise LoopyError(
                    "inames {old} and {new} do not iterate over the same "
                    "domain"
                    .format(old=old_iname, new=new_iname))

        # }}}

        from pymbolic import var
        subst_dict = {old_iname: var(new_iname)}

        from loopy.context_matching import parse_stack_match
        within = parse_stack_match(within)

        from pymbolic.mapper.substitutor import make_subst_func
        rule_mapping_context = SubstitutionRuleMappingContext(
                knl.substitutions, var_name_gen)
        ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
                make_subst_func(subst_dict), within)

        knl = rule_mapping_context.finish_kernel(
                ijoin.map_kernel(knl))

        new_instructions = []
        for insn in knl.instructions:
            if (old_iname in insn.forced_iname_deps
                    and within(knl, insn, ())):
                insn = insn.copy(
                        forced_iname_deps=(
                            (insn.forced_iname_deps
                                - frozenset([old_iname]))
                            | frozenset([new_iname])))

            new_instructions.append(insn)

        knl = knl.copy(instructions=new_instructions)

    else:
        knl = duplicate_inames(
                knl, [old_iname], within=within, new_inames=[new_iname])

    knl = remove_unused_inames(knl, [old_iname])

    return knl
def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
        tags=None):
    """Duplicate each iname in *inames*, rewriting its uses inside the
    instructions matched by *within* to refer to the duplicate.

    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :arg new_inames: a list of names for the duplicates, parallel to
        *inames*. Entries may be *None*, in which case a name is derived
        from the old iname (and *suffix*, if given).
    :arg suffix: appended to an old iname to form the duplicate's name
        when no explicit new name is supplied.
    :arg tags: a mapping from old iname names to tags to apply to the
        corresponding duplicated inames.
    """

    # Fix: the previous version used a mutable default argument (tags={}).
    if tags is None:
        tags = {}

    # {{{ normalize arguments, find unique new_inames

    if isinstance(inames, str):
        inames = [iname.strip() for iname in inames.split(",")]

    if isinstance(new_inames, str):
        new_inames = [iname.strip() for iname in new_inames.split(",")]

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    if new_inames is None:
        new_inames = [None] * len(inames)

    if len(new_inames) != len(inames):
        raise ValueError(
                "new_inames must have the same number of entries as inames")

    name_gen = knl.get_var_name_generator()

    for i, iname in enumerate(inames):
        new_iname = new_inames[i]

        if new_iname is None:
            new_iname = iname

            if suffix is not None:
                new_iname += suffix

            new_iname = name_gen(new_iname)

        else:
            if name_gen.is_name_conflicting(new_iname):
                raise ValueError("new iname '%s' conflicts with existing names"
                        % new_iname)

            name_gen.add_name(new_iname)

        new_inames[i] = new_iname

    # }}}

    # {{{ duplicate the inames

    for old_iname, new_iname in zip(inames, new_inames):
        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(knl, frozenset([old_iname]))

        from loopy.isl_helpers import duplicate_axes
        knl = knl.copy(
                domains=domch.get_domains_with(
                    duplicate_axes(domch.domain, [old_iname], [new_iname])))

    # }}}

    # {{{ change the inames in the code

    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, name_gen)
    indup = _InameDuplicator(rule_mapping_context,
            old_to_new=dict(list(zip(inames, new_inames))),
            within=within)

    knl = rule_mapping_context.finish_kernel(
            indup.map_kernel(knl))

    # }}}

    # {{{ realize tags

    for old_iname, new_iname in zip(inames, new_inames):
        new_tag = tags.get(old_iname)
        if new_tag is not None:
            knl = tag_inames(knl, {new_iname: new_tag})

    # }}}

    return knl
def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
    """Rename *old_iname* to *new_iname* within the instructions matched by
    *within*.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg existing_ok: execute even if *new_iname* already exists
    """

    name_gen = knl.get_var_name_generator()
    does_exist = name_gen.is_name_conflicting(new_iname)

    if old_iname not in knl.all_inames():
        raise LoopyError("old iname '%s' does not exist" % old_iname)

    if does_exist and not existing_ok:
        raise LoopyError("iname '%s' conflicts with an existing identifier"
                "--cannot rename" % new_iname)

    if not does_exist:
        # *new_iname* is fresh: renaming is just duplication followed by
        # removal of the (now unused) old iname.
        knl = duplicate_inames(
                knl, [old_iname], within=within, new_inames=[new_iname])
        knl = remove_unused_inames(knl, [old_iname])
        return knl

    # {{{ check that the domains match up

    dom = knl.get_inames_domain(frozenset((old_iname, new_iname)))
    var_dict = dom.get_var_dict()
    _, old_idx = var_dict[old_iname]
    _, new_idx = var_dict[new_iname]

    def isolate_and_drop(keep_idx, drop_idx):
        # Parametrize the kept iname (via a round trip through the param
        # dim_type, which re-appends it at the end of the set dims), then
        # project out the other one.
        par_idx = dom.dim(dim_type.param)
        result = dom.move_dims(
                dim_type.param, par_idx, dim_type.set, keep_idx, 1)
        result = result.move_dims(
                dim_type.set, result.dim(dim_type.set),
                dim_type.param, par_idx, 1)
        return result.project_out(
                dim_type.set,
                drop_idx if drop_idx < keep_idx else drop_idx - 1, 1)

    dom_old = isolate_and_drop(old_idx, new_idx)
    dom_new = isolate_and_drop(new_idx, old_idx)

    if not (dom_old <= dom_new and dom_new <= dom_old):
        raise LoopyError(
                "inames {old} and {new} do not iterate over the same domain".
                format(old=old_iname, new=new_iname))

    # }}}

    from pymbolic import var
    substitutions = {old_iname: var(new_iname)}

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, name_gen)
    subst_mapper = RuleAwareSubstitutionMapper(rule_mapping_context,
            make_subst_func(substitutions), within)
    knl = rule_mapping_context.finish_kernel(subst_mapper.map_kernel(knl))

    # Also rewrite within_inames on matched instructions.
    knl = knl.copy(instructions=[
            insn.copy(within_inames=(
                (insn.within_inames - frozenset([old_iname]))
                | frozenset([new_iname])))
            if (old_iname in insn.within_inames and within(knl, insn, ()))
            else insn
            for insn in knl.instructions])

    knl = remove_unused_inames(knl, [old_iname])

    return knl
def to_batched(kernel, nbatches, batch_varying_args,
        batch_iname_prefix="ibatch", sequential=False):
    """Return a version of *kernel* that carries out a batch of the
    operations of the input kernel, with each array named in
    *batch_varying_args* gaining a leading batch axis.

    .. note::

        Temporaries in a kernel that are private or read-only globals are
        not batched when ``sequential=True``, unless they are explicitly
        mentioned in *batch_varying_args*.

    :arg nbatches: the number of batches. May be a constant non-negative
        integer or a string, which will be added as an integer argument.
    :arg batch_varying_args: a list of argument names that vary per-batch.
        Each such variable will have a batch index added.
    :arg sequential: A :class:`bool`. If *True*, do not duplicate
        temporary variables for each batch. This automatically tags the
        batch iname for sequential execution.
    """

    from pymbolic import var

    vng = kernel.get_var_name_generator()
    batch_iname = vng(batch_iname_prefix)
    batch_iname_expr = var(batch_iname)

    new_args = []

    batch_dom_str = "{{[{iname}]: 0 <= {iname} < {nbatches}}}".format(
            iname=batch_iname, nbatches=nbatches)

    if isinstance(nbatches, int):
        nbatches_expr = nbatches
    else:
        # symbolic batch count: make it a kernel parameter
        batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
        new_args.append(ValueArg(nbatches, dtype=kernel.index_dtype))
        nbatches_expr = var(nbatches)

    batch_domain = isl.BasicSet(batch_dom_str)
    new_domains = [batch_domain] + kernel.domains

    for arg in kernel.args:
        if arg.name not in batch_varying_args:
            new_args.append(arg)
            continue

        if isinstance(arg, ValueArg):
            # a batch-varying scalar becomes a 1D array over batches
            new_args.append(ArrayArg(arg.name, arg.dtype,
                    shape=(nbatches_expr,), dim_tags="c"))
        else:
            new_args.append(arg.copy(
                    shape=(nbatches_expr,) + arg.shape,
                    dim_tags=("c",) * (len(arg.shape) + 1),
                    dim_names=_add_unique_dim_name("ibatch", arg.dim_names)))

    kernel = kernel.copy(domains=new_domains, args=new_args)

    if sequential:
        import loopy as lp
        from loopy.kernel.data import ForceSequentialTag
        kernel = lp.tag_inames(kernel, [(batch_iname, ForceSequentialTag())])
    else:
        new_temps = {
                temp.name: (
                    temp.copy(
                        shape=(nbatches_expr,) + temp.shape,
                        dim_tags=("c",) * (len(temp.shape) + 1),
                        dim_names=_add_unique_dim_name(
                            "ibatch", temp.dim_names))
                    if temp_needs_batching_if_not_sequential(
                        temp, batch_varying_args)
                    else temp)
                for temp in kernel.temporary_variables.values()}

        kernel = kernel.copy(temporary_variables=new_temps)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, vng)
    bvc = _BatchVariableChanger(rule_mapping_context,
            kernel, batch_varying_args, batch_iname_expr,
            sequential=sequential)
    kernel = rule_mapping_context.finish_kernel(
            bvc.map_kernel(kernel))

    batch_iname_set = frozenset([batch_iname])
    return kernel.copy(
            instructions=[
                insn.copy(within_inames=insn.within_inames | batch_iname_set)
                for insn in kernel.instructions])
def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None):
    """Extract an assignment to a temporary variable
    as a :ref:`substitution-rule`. The temporary may be an array, in
    which case the array indices will become arguments to the substitution
    rule.

    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.

    This operation will change all usage sites of *temp_name* matched by
    *within*. If there are further usage sites of *temp_name*, then the
    original assignment to *temp_name* as well as the temporary variable
    is left in place.
    """

    # Fixes vs. previous revision: corrected typos in the docstring
    # cross-reference ("substituiton-rule") and in two error messages
    # ("depdendencies", "asignee").

    if isinstance(extra_arguments, str):
        extra_arguments = tuple(s.strip() for s in extra_arguments.split(","))

    # {{{ establish the relevant definition of temp_name for each usage site

    dep_kernel = expand_subst(kernel)
    from loopy.preprocess import add_default_dependencies
    dep_kernel = add_default_dependencies(dep_kernel)

    id_to_insn = dep_kernel.id_to_insn

    def get_relevant_definition_insn_id(usage_insn_id):
        # Walk the dependency tree of *usage_insn_id* to find the single
        # instruction that writes temp_name; None if there is none.
        insn = id_to_insn[usage_insn_id]

        def_id = set()
        for dep_id in insn.insn_deps:
            dep_insn = id_to_insn[dep_id]
            if temp_name in dep_insn.write_dependency_names():
                if temp_name in dep_insn.read_dependency_names():
                    raise LoopyError("instruction '%s' both reads *and* "
                            "writes '%s'--cannot transcribe to substitution "
                            "rule" % (dep_id, temp_name))

                def_id.add(dep_id)
            else:
                rec_result = get_relevant_definition_insn_id(dep_id)
                if rec_result is not None:
                    def_id.add(rec_result)

        if len(def_id) > 1:
            raise LoopyError("more than one write to '%s' found in "
                    "dependencies of '%s'--definition cannot be resolved "
                    "(writer instructions ids: %s)"
                    % (temp_name, usage_insn_id, ", ".join(def_id)))

        if not def_id:
            return None
        else:
            def_id, = def_id

        return def_id

    usage_to_definition = {}

    for insn in kernel.instructions:
        if temp_name not in insn.read_dependency_names():
            continue

        def_id = get_relevant_definition_insn_id(insn.id)
        if def_id is None:
            raise LoopyError("no write to '%s' found in dependency tree "
                    "of '%s'--definition cannot be resolved"
                    % (temp_name, insn.id))

        usage_to_definition[insn.id] = def_id

    definition_insn_ids = set()
    for insn in kernel.instructions:
        if temp_name in insn.write_dependency_names():
            definition_insn_ids.add(insn.id)

    # }}}

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    tts = TemporaryToSubstChanger(rule_mapping_context,
            temp_name, definition_insn_ids,
            usage_to_definition, extra_arguments, within)

    kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel))

    from loopy.kernel.data import SubstitutionRule

    # {{{ create new substitution rules

    new_substs = kernel.substitutions.copy()
    for def_id, subst_name in six.iteritems(
            tts.definition_insn_id_to_subst_name):
        def_insn = kernel.id_to_insn[def_id]

        (_, indices), = def_insn.assignees_and_indices()

        arguments = []

        from pymbolic.primitives import Variable
        for i in indices:
            if not isinstance(i, Variable):
                raise LoopyError("In defining instruction '%s': "
                        "assignee index '%s' is not a plain variable. "
                        "Perhaps use loopy.affine_map_inames() "
                        "to perform substitution." % (def_id, i))

            arguments.append(i.name)

        new_substs[subst_name] = SubstitutionRule(
                name=subst_name,
                arguments=tuple(arguments) + extra_arguments,
                expression=def_insn.expression)

    # }}}

    # {{{ delete temporary variable if possible

    new_temp_vars = kernel.temporary_variables
    if not any(six.itervalues(tts.saw_unmatched_usage_sites)):
        # All usage sites matched--they're now substitution rules.
        # We can get rid of the variable.

        new_temp_vars = new_temp_vars.copy()
        del new_temp_vars[temp_name]

    # }}}

    import loopy as lp
    kernel = lp.remove_instructions(
            kernel,
            set(
                insn_id
                for insn_id, still_used in six.iteritems(
                    tts.saw_unmatched_usage_sites)
                if not still_used))

    return kernel.copy(
            substitutions=new_substs,
            temporary_variables=new_temp_vars)
def _fix_parameter(kernel, name, value, within=None):
    # Pin the parameter *name* to the constant *value* everywhere in the
    # kernel: in the domains (as an equality constraint, then projected out),
    # in argument/temporary shape expressions, and in instruction
    # expressions. The argument named *name* is dropped from the arg list.

    def process_set(s):
        # Add the constraint name == value to the isl set *s* and project
        # the dimension out; sets that do not know *name* pass unchanged.
        var_dict = s.get_var_dict()
        try:
            dt, idx = var_dict[name]
        except KeyError:
            return s

        value_aff = isl.Aff.zero_on_domain(s.space) + value

        from loopy.isl_helpers import iname_rel_aff
        name_equal_value_aff = iname_rel_aff(s.space, name, "==", value_aff)

        s = (s
                .add_constraint(
                    isl.Constraint.equality_from_aff(name_equal_value_aff))
                .project_out(dt, idx, 1))

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic.mapper.substitutor import make_subst_func
    subst_func = make_subst_func({name: value})

    from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper
    subst_map = SubstitutionMapper(subst_func)
    ev_map = PartialEvaluationMapper()

    def map_expr(expr):
        # Substitute the value, then fold any now-constant subexpressions.
        return ev_map(subst_map(expr))

    from loopy.kernel.array import ArrayBase
    new_args = []
    for arg in kernel.args:
        if arg.name == name:
            # remove from argument list
            continue

        if not isinstance(arg, ArrayBase):
            new_args.append(arg)
        else:
            # Arrays may use *name* in shape/stride expressions.
            new_args.append(arg.map_exprs(map_expr))

    new_temp_vars = {}
    for tv in kernel.temporary_variables.values():
        new_temp_vars[tv.name] = tv.map_exprs(map_expr)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    esubst_map = RuleAwareSubstitutionMapper(rule_mapping_context,
            subst_func, within=within)
    return (rule_mapping_context.finish_kernel(
        esubst_map.map_kernel(kernel, within=within,
                              # overwritten below, no need to map
                              map_tvs=False, map_args=False))
        .copy(
            domains=new_domains,
            args=new_args,
            temporary_variables=new_temp_vars,
            assumptions=process_set(kernel.assumptions),
            ))
def _inline_call_instruction(caller_knl, callee_knl, call_insn):
    """
    Returns a copy of *caller_knl* with the *call_insn* in the *kernel*
    replaced by inlining *callee_knl* into it within it.

    :arg call_insn: An instance of `loopy.CallInstruction` of the call-site.
    """
    import pymbolic.primitives as prim
    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.kernel.data import ValueArg

    # {{{ sanity checks

    assert call_insn.expression.function.name == callee_knl.name

    # }}}

    callee_label = callee_knl.name[:4] + "_"
    vng = caller_knl.get_var_name_generator()
    ing = caller_knl.get_instruction_id_generator()

    # {{{ construct callee->caller name mappings

    # name_map: Mapping[str, str]
    # A mapping from variable names in the callee kernel's namespace to
    # the ones they would be referred by in the caller's namespace post
    # inlining.
    name_map = {}

    # only consider temporary variables and inames, arguments would be mapped
    # according to the invocation in call_insn.
    for name in (callee_knl.all_inames()
                 | set(callee_knl.temporary_variables.keys())):
        new_name = vng(callee_label + name)
        name_map[name] = new_name

    # }}}

    # {{{ iname_to_tags

    # new_inames: caller's inames post inlining
    # BUGFIX: take a copy--the original assigned caller_knl.inames itself and
    # then mutated it, altering the input kernel in place.
    new_inames = dict(caller_knl.inames)

    for old_name, callee_iname in callee_knl.inames.items():
        new_name = name_map[old_name]
        new_inames[new_name] = callee_iname.copy(name=new_name)

    # }}}

    # {{{ register callee's temps as caller's

    # new_temps: caller's temps post inlining
    new_temps = caller_knl.temporary_variables.copy()
    for name, tv in callee_knl.temporary_variables.items():
        new_temps[name_map[name]] = tv.copy(name=name_map[name])

    # }}}

    # {{{ get callee args -> parameters passed to the call

    arg_map = {}  # callee arg name -> caller symbols (e.g. SubArrayRef)

    assignees = call_insn.assignees  # writes
    parameters = call_insn.expression.parameters  # reads

    from loopy.kernel.function_interface import get_kw_pos_association
    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)

    for i, par in enumerate(parameters):
        arg_map[pos_to_kw[i]] = par

    # Output (assignee) arguments carry negative positions.
    for i, assignee in enumerate(assignees):
        arg_map[pos_to_kw[-i - 1]] = assignee

    # }}}

    # {{{ process domains/assumptions

    # rename inames
    new_domains = callee_knl.domains.copy()
    for old_iname in callee_knl.all_inames():
        new_domains = [rename_iname(dom, old_iname, name_map[old_iname])
                       for dom in new_domains]

    # realize domains' dim params in terms of caller's variables
    new_assumptions = callee_knl.assumptions
    for callee_arg_name, param_expr in arg_map.items():
        if isinstance(callee_knl.arg_dict[callee_arg_name], ValueArg):
            new_domains = [
                    substitute_into_domain(
                        dom,
                        callee_arg_name,
                        param_expr, get_valid_domain_param_names(caller_knl))
                    for dom in new_domains]

            new_assumptions = substitute_into_domain(
                    new_assumptions,
                    callee_arg_name,
                    param_expr, get_valid_domain_param_names(caller_knl))

    # }}}

    # {{{ rename inames/temporaries in the program

    rule_mapping_context = SubstitutionRuleMappingContext(
            callee_knl.substitutions, vng)
    subst_func = make_subst_func({old_name: prim.Variable(new_name)
                                  for old_name, new_name in name_map.items()})
    inames_temps_renamer = RuleAwareSubstitutionMapper(
            rule_mapping_context, subst_func, within=lambda *args: True)

    callee_knl = rule_mapping_context.finish_kernel(
            inames_temps_renamer.map_kernel(callee_knl))

    # }}}

    # {{{ map callee's expressions to get expressions after inlining

    rule_mapping_context = SubstitutionRuleMappingContext(
            callee_knl.substitutions, vng)
    smap = KernelArgumentSubstitutor(
            rule_mapping_context, caller_knl, callee_knl, arg_map)

    callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel(callee_knl))

    # }}}

    # {{{ generate new ids for instructions

    insn_id_map = {}
    for insn in callee_knl.instructions:
        insn_id_map[insn.id] = ing(callee_label + insn.id)

    # }}}

    # {{{ use NoOp to mark the start and end of callee kernel

    from loopy.kernel.instruction import NoOpInstruction

    noop_start = NoOpInstruction(
            id=ing(callee_label + "_start"),
            within_inames=call_insn.within_inames,
            depends_on=call_insn.depends_on)
    # The end marker reuses call_insn's id so that dependencies on the call
    # site remain valid.
    noop_end = NoOpInstruction(
            id=call_insn.id,
            within_inames=call_insn.within_inames,
            depends_on=frozenset(insn_id_map.values()))

    # }}}

    # {{{ map callee's instruction ids

    inlined_insns = [noop_start]

    for insn in callee_knl.instructions:
        new_within_inames = (frozenset(name_map[iname]
                                       for iname in insn.within_inames)
                             | call_insn.within_inames)
        new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on)
                          | {noop_start.id})
        new_no_sync_with = frozenset((insn_id_map[id], scope)
                                     for id, scope in insn.no_sync_with)
        new_id = insn_id_map[insn.id]

        if isinstance(insn, Assignment):
            new_atomicity = tuple(
                    type(atomicity)(name_map[atomicity.var_name])
                    for atomicity in insn.atomicity)
            insn = insn.copy(
                    id=insn_id_map[insn.id],
                    within_inames=new_within_inames,
                    depends_on=new_depends_on,
                    tags=insn.tags | call_insn.tags,
                    atomicity=new_atomicity,
                    no_sync_with=new_no_sync_with)
        else:
            insn = insn.copy(
                    id=new_id,
                    within_inames=new_within_inames,
                    depends_on=new_depends_on,
                    tags=insn.tags | call_insn.tags,
                    no_sync_with=new_no_sync_with)
        inlined_insns.append(insn)

    inlined_insns.append(noop_end)

    # }}}

    # {{{ swap out call_insn with inlined_instructions

    idx = caller_knl.instructions.index(call_insn)
    new_insns = (caller_knl.instructions[:idx]
                 + inlined_insns
                 + caller_knl.instructions[idx + 1:])

    # }}}

    old_assumptions, new_assumptions = isl.align_two(
            caller_knl.assumptions, new_assumptions)

    return caller_knl.copy(
            instructions=new_insns,
            temporary_variables=new_temps,
            domains=caller_knl.domains + new_domains,
            assumptions=(old_assumptions.params()
                         & new_assumptions.params()),
            inames=new_inames)
def affine_map_inames(kernel, old_inames, new_inames, equations):
    """Return a new *kernel* where the affine transform
    specified by *equations* has been applied to the inames.

    :arg old_inames: A list of inames to be replaced by affine transforms
        of their values.
        May also be a string of comma-separated inames.

    :arg new_inames: A list of new inames that are not yet used in *kernel*,
        but have their values established in terms of *old_inames* by
        *equations*.
        May also be a string of comma-separated inames.

    :arg equations: A list of equations establishing a relationship
        between *old_inames* and *new_inames*. Each equation may
        be a tuple ``(lhs, rhs)`` of expressions or a string, with left and
        right hand side of the equation separated by ``=``.
    """

    # {{{ check and parse arguments

    if isinstance(new_inames, str):
        new_inames = new_inames.split(",")
        new_inames = [iname.strip() for iname in new_inames]
    if isinstance(old_inames, str):
        old_inames = old_inames.split(",")
        old_inames = [iname.strip() for iname in old_inames]
    if isinstance(equations, str):
        equations = [equations]

    import re
    eqn_re = re.compile(r"^([^=]+)=([^=]+)$")

    def parse_equation(eqn):
        # Accept either "lhs = rhs" strings or (lhs, rhs) expression tuples.
        if isinstance(eqn, str):
            eqn_match = eqn_re.match(eqn)
            if not eqn_match:
                raise ValueError("invalid equation: %s" % eqn)

            from loopy.symbolic import parse
            lhs = parse(eqn_match.group(1))
            rhs = parse(eqn_match.group(2))
            return (lhs, rhs)
        elif isinstance(eqn, tuple):
            if len(eqn) != 2:
                raise ValueError("unexpected length of equation tuple, "
                        "got %d, should be 2" % len(eqn))
            return eqn
        else:
            # BUGFIX: was "%d" applied to a type name (a str), which would
            # itself raise TypeError, and the two literals concatenated
            # without a separator ("equationgot").
            raise ValueError("unexpected type of equation, "
                    "got %s, should be string or tuple"
                    % type(eqn).__name__)

    equations = [parse_equation(eqn) for eqn in equations]

    all_vars = kernel.all_variable_names()
    for iname in new_inames:
        if iname in all_vars:
            raise LoopyError("new iname '%s' is already used in kernel"
                    % iname)

    for iname in old_inames:
        if iname not in kernel.all_inames():
            raise LoopyError("old iname '%s' not known" % iname)

    # }}}

    # {{{ substitute iname use

    from pymbolic.algorithm import solve_affine_equations_for
    old_inames_to_expr = solve_affine_equations_for(old_inames, equations)

    subst_dict = dict(
            (v.name, expr)
            for v, expr in old_inames_to_expr.items())

    var_name_gen = kernel.get_var_name_generator()

    from pymbolic.mapper.substitutor import make_subst_func
    from loopy.match import parse_stack_match

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context,
            make_subst_func(subst_dict), within=parse_stack_match(None))

    kernel = (
            rule_mapping_context.finish_kernel(
                old_to_new.map_kernel(kernel))
            .copy(
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]))

    # }}}

    # {{{ change domains

    new_inames_set = frozenset(new_inames)
    old_inames_set = frozenset(old_inames)

    new_domains = []
    for idom, dom in enumerate(kernel.domains):
        dom_var_dict = dom.get_var_dict()
        old_iname_overlap = [
                iname
                for iname in old_inames
                if iname in dom_var_dict]

        if not old_iname_overlap:
            # domain does not involve any of the old inames: keep as is
            new_domains.append(dom)
            continue

        from loopy.symbolic import get_dependencies
        dom_new_inames = set()
        dom_old_inames = set()

        # mapping for new inames to dim_types
        new_iname_dim_types = {}

        dom_equations = []
        for iname in old_iname_overlap:
            for ieqn, (lhs, rhs) in enumerate(equations):
                eqn_deps = get_dependencies(lhs) | get_dependencies(rhs)
                if iname in eqn_deps:
                    dom_new_inames.update(eqn_deps & new_inames_set)
                    dom_old_inames.update(eqn_deps & old_inames_set)

                    if dom_old_inames:
                        # (duplicate appends are harmless: adding the same
                        # equality constraint to an isl set is idempotent)
                        dom_equations.append((lhs, rhs))

                    this_eqn_old_iname_dim_types = set(
                            dom_var_dict[old_iname][0]
                            for old_iname in eqn_deps & old_inames_set)

                    if this_eqn_old_iname_dim_types:
                        if len(this_eqn_old_iname_dim_types) > 1:
                            raise ValueError(
                                    "inames '%s' (from equation %d (0-based)) "
                                    "in domain %d (0-based) are not "
                                    "of a uniform dim_type"
                                    % (", ".join(eqn_deps & old_inames_set),
                                        ieqn, idom))

                        this_eqn_new_iname_dim_type, = \
                                this_eqn_old_iname_dim_types

                        # New inames inherit the dim_type of the old inames
                        # they are defined from; disagreement is an error.
                        for new_iname in eqn_deps & new_inames_set:
                            if new_iname in new_iname_dim_types:
                                if (this_eqn_new_iname_dim_type
                                        != new_iname_dim_types[new_iname]):
                                    raise ValueError(
                                            "dim_type disagreement for "
                                            "iname '%s' (from equation %d "
                                            "(0-based)) "
                                            "in domain %d (0-based)"
                                            % (new_iname, ieqn, idom))
                            else:
                                new_iname_dim_types[new_iname] = \
                                        this_eqn_new_iname_dim_type

        if not dom_old_inames <= set(dom_var_dict):
            raise ValueError(
                    "domain %d (0-based) does not know about "
                    "all old inames (specifically '%s') needed "
                    "to define new inames"
                    % (idom, ", ".join(dom_old_inames - set(dom_var_dict))))

        # add inames to domain with correct dim_types
        dom_new_inames = list(dom_new_inames)
        for iname in dom_new_inames:
            dt = new_iname_dim_types[iname]
            iname_idx = dom.dim(dt)

            dom = dom.add_dims(dt, 1)
            dom = dom.set_dim_name(dt, iname_idx, iname)

        # add equations
        from loopy.symbolic import aff_from_expr
        for lhs, rhs in dom_equations:
            dom = dom.add_constraint(
                    isl.Constraint.equality_from_aff(
                        aff_from_expr(dom.space, rhs - lhs)))

        # project out old inames
        for iname in dom_old_inames:
            dt, idx = dom.get_var_dict()[iname]
            dom = dom.project_out(dt, idx, 1)

        new_domains.append(dom)

    # }}}

    # {{{ switch iname refs in instructions

    def fix_iname_set(insn_id, inames):
        # Instructions must use either all or none of the old inames.
        if old_inames_set <= inames:
            return (inames - old_inames_set) | new_inames_set
        elif old_inames_set & inames:
            raise LoopyError("instruction '%s' uses only a part (%s), not all, "
                    "of the old inames"
                    % (insn_id, ", ".join(old_inames_set & inames)))
        else:
            return inames

    new_instructions = [
            insn.copy(within_inames=fix_iname_set(insn.id, insn.within_inames))
            for insn in kernel.instructions]

    # }}}

    return kernel.copy(domains=new_domains, instructions=new_instructions)
def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
        sequential=False):
    """Takes in a kernel that carries out an operation and returns a kernel
    that carries out a batch of these operations.

    :arg nbatches: the number of batches. May be a constant non-negative
        integer or a string, which will be added as an integer argument.
    :arg batch_varying_args: a list of argument names that vary per-batch.
        Each such variable will have a batch index added.
    :arg sequential: A :class:`bool`. If *True*, do not duplicate
        temporary variables for each batch. This automatically tags the batch
        iname for sequential execution.
    """

    from pymbolic import var

    vng = knl.get_var_name_generator()
    batch_iname = vng(batch_iname_prefix)
    batch_iname_expr = var(batch_iname)

    new_args = []

    # Loop domain for the batch iname: 0 <= batch_iname < nbatches.
    batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % {
            "iname": batch_iname,
            "nbatches": nbatches,
            }

    if not isinstance(nbatches, int):
        # nbatches is a symbolic name: declare it as a domain parameter and
        # add it as an integer kernel argument.
        batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
        new_args.append(ValueArg(nbatches, dtype=knl.index_dtype))

        nbatches_expr = var(nbatches)
    else:
        nbatches_expr = nbatches

    batch_domain = isl.BasicSet(batch_dom_str)
    new_domains = [batch_domain] + knl.domains

    for arg in knl.args:
        if arg.name in batch_varying_args:
            if isinstance(arg, ValueArg):
                # Scalars become one-dimensional arrays indexed by the batch.
                arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,),
                        dim_tags="c")
            else:
                # Arrays get a new leading (row-major) batch axis.
                arg = arg.copy(
                        shape=(nbatches_expr,) + arg.shape,
                        dim_tags=("c",) * (len(arg.shape) + 1),
                        dim_names=_add_unique_dim_name("ibatch", arg.dim_names))

        new_args.append(arg)

    knl = knl.copy(
            domains=new_domains,
            args=new_args)

    if not sequential:
        new_temps = {}

        for temp in six.itervalues(knl.temporary_variables):
            if temp.initializer is not None and temp.read_only:
                # Read-only initialized temporaries are identical across
                # batches--no need to duplicate them.
                new_temps[temp.name] = temp
            else:
                new_temps[temp.name] = temp.copy(
                        shape=(nbatches_expr,) + temp.shape,
                        dim_tags=("c",) * (len(temp.shape) + 1),
                        dim_names=_add_unique_dim_name("ibatch",
                            temp.dim_names))

        knl = knl.copy(temporary_variables=new_temps)
    else:
        import loopy as lp
        from loopy.kernel.data import ForceSequentialTag
        # Shared temporaries require the batches to run in order.
        knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())])

    # Rewrite all accesses to the batch-varying variables to include the
    # batch index.
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, vng)
    bvc = _BatchVariableChanger(rule_mapping_context,
            knl, batch_varying_args, batch_iname_expr,
            sequential=sequential)
    kernel = rule_mapping_context.finish_kernel(
            bvc.map_kernel(knl))

    # Nest every instruction inside the batch loop.
    batch_iname_set = frozenset([batch_iname])
    kernel = kernel.copy(
            instructions=[
                insn.copy(forced_iname_deps=insn.forced_iname_deps
                    | batch_iname_set)
                for insn in kernel.instructions])

    return kernel
def resolve_callables(program):
    """
    Returns a :class:`TranslationUnit` with known
    :class:`pymbolic.primitives.Call` expression nodes converted to
    :class:`loopy.symbolic.ResolvedFunction`.
    """
    from loopy.library.function import get_loopy_callables
    from loopy.check import validate_kernel_call_sites
    from loopy.kernel import KernelState

    if program.state >= KernelState.CALLS_RESOLVED:
        # program's callables have been resolved
        return program

    # Universe of resolvable names, in increasing priority order.
    # get registered callables
    known_callables = dict(program.callables_table)
    # get target specific callables
    known_callables.update(program.target.get_device_ast_builder()
            .known_callables)
    # get loopy specific callables
    known_callables.update(get_loopy_callables())

    # Output table, built up by the worklist loop below.
    callables_table = {}

    # callables: name of the calls seen in the program
    # (worklist, seeded with the program's kernel callables; newly resolved
    # call targets are appended as they are discovered)
    callables = {name for name, clbl in program.callables_table.items()
                 if isinstance(clbl, CallableKernel)}

    while callables:
        clbl_name = callables.pop()
        clbl = known_callables[clbl_name]

        if isinstance(clbl, CallableKernel):
            knl = clbl.subkernel

            # Rewrite calls to known names inside this subkernel into
            # ResolvedFunction nodes.
            rule_mapping_context = SubstitutionRuleMappingContext(
                    knl.substitutions, knl.get_var_name_generator())
            clbl_resolver = CallableResolver(rule_mapping_context,
                                             frozenset(known_callables))
            knl = rule_mapping_context.finish_kernel(
                    clbl_resolver.map_kernel(knl))
            knl = knl.copy(state=KernelState.CALLS_RESOLVED)

            # add the updated callable kernel to the table
            callables_table[clbl_name] = clbl.copy(subkernel=knl)

            # note the resolved callable for traversal
            # (skip names already placed in the output table)
            callables.update(clbl_resolver.calls_resolved
                             - set(callables_table))
        elif isinstance(clbl, ScalarCallable):
            # nothing to resolve within a scalar callable
            callables_table[clbl_name] = clbl
        else:
            raise NotImplementedError(f"{type(clbl)}")

    program = program.copy(callables_table=callables_table)

    validate_kernel_call_sites(program)

    return program
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* with ones to a temporary, which is
    created and acts as a buffer. To perform this transformation, the access
    footprint to *var_name* is determined and a temporary of a suitable
    :class:`loopy.AddressSpace` and shape is created.

    By default, the value of the buffered cells in *var_name* are read prior
    to any (read/write) use, and the modified values are written out after
    use has concluded, but for special use cases (e.g. additive
    accumulation), the behavior can be modified using *init_expression* and
    *store_expression*.

    :arg buffer_inames: The inames across which the buffer should be usable
        --i.e. all possible values of these inames will be covered by the
        buffer footprint. A tuple of inames or a comma-separated string.
    :arg init_expression: Either *None* (indicating the prior value of the
        buffered array should be read) or an expression optionally involving
        the variable 'base' (which references the associated location in the
        array being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur at
        all.)
    :arg within: If not None, limit the action of the transformation to
        matching contexts.  See :func:`loopy.match.parse_stack_match`
        for syntax.
    :arg temporary_scope: If given, override the choice of
        :class:`AddressSpace` for the created temporary.
    :arg default_tag: The default :ref:`iname-tags` to be assigned to the
        inames used for fetching and storing
    :arg fetch_bounding_box: If the access footprint is non-convex
        (resulting in an error), setting this argument to *True* will force a
        rectangular (and hence convex) superset of the footprint to be
        fetched.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import AddressSpace
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = AddressSpace.LOCAL
        else:
            temporary_scope = AddressSpace.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    # Collect one AccessDescriptor per write to var_name, and gather the
    # inames (besides the sweep inames) that the accesses depend on.
    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'"
                            % var_name)
                # BUGFIX: a linear subscript into some *other* array
                # previously fell through with assignee_name/index unset
                # (UnboundLocalError or stale values from the previous
                # assignee). It is simply irrelevant here.
                continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) \
                and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen(f"{var_name}_init_{dim_name}")
        store_iname = var_name_gen(f"{var_name}_store_{dim_name}")

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll
            # add fetches with loops over copies of these parent inames that
            # will end up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if (kernel.get_home_domain_index(iname)
                        != domch.leaf_domain_index):
                    raise RuntimeError("buffer iname '%s' is not 'at home' "
                            "in the sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Length-1 storage axes need no loop; drop their inames and tags.
        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()

    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    new_insns = []

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        # User-given init expression: realize 'base' as the buffered
        # array location.
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(
            based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                within_inames=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if buf_var_name in insn.assignee_var_names():
            did_write = True

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        # User-given store expression: realize 'base' and 'buffer'.
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    if store_expression is not False:
        from loopy.kernel.data import Assignment
        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(
                        based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([(init_insn_id, "any")]),
                    assignee=store_target,
                    expression=store_expression,
                    within_inames=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))
    else:
        did_write = False

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        # Nothing gets stored back; the store inames are unused.
        for iname in store_inames:
            del new_iname_to_tag[iname]

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        from loopy.preprocess import prepare_for_caching
        buffer_array_cache.store_if_not_present(
                cache_key, prepare_for_caching(kernel))

    return kernel
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* within the sweep footprint spanned by
    *buffer_inames* with accesses to a newly created temporary "buffer"
    array, adding an *init* instruction that fills the buffer and (if the
    buffered variable is written) a *store* instruction that writes it back.

    :arg kernel: the kernel to transform.
    :arg var_name: name of an argument or temporary variable to buffer.
    :arg buffer_inames: inames (list or comma-separated string) spanning the
        sweep over which the buffer footprint is computed.
    :arg init_expression: Either *None* (indicating the prior value of the
        buffered array should be read) or an expression optionally involving
        the variable 'base' (which references the associated location in the
        array being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices). (*None*
        indicates that a default storage instruction should be used, *False*
        indicates that no storing of the temporary should occur at all.)
    :arg within: a stack match restricting which instructions are rewritten.
    :arg default_tag: iname tag applied to the newly created init/store inames.
    :arg temporary_scope: scope of the new temporary (defaults to ``lp.auto``).
    :arg temporary_is_local: deprecated alias for *temporary_scope*.
    :arg fetch_bounding_box: if true, sweep domains are boxified.

    :raises LoopyError: on conflicting scope arguments, linear write indexing
        into *var_name*, or invalid lvalues.
    :raises RuntimeError: for unknown sweep inames or inames not 'at home'
        in the sweep's leaf domain.
    :raises ValueError: if *var_name* is not found in the kernel.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import temp_var_scope
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = temp_var_scope.LOCAL
        else:
            temporary_scope = temp_var_scope.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    # Expressions may be given as strings; parse them up front.
    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    for iname in buffer_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # Locate the variable's descriptor among arguments/temporaries.
    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        # scalar: treated as a zero-dimensional buffer
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)

    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    # Gather one AccessDescriptor per write access to var_name (used to
    # compute the buffer footprint below).
    access_descriptors = []

    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        from pymbolic.primitives import Variable, Subscript
        from loopy.symbolic import LinearSubscript

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'"
                            % var_name)
                # NOTE(review): when the LinearSubscript targets a different
                # variable, assignee_name/index are not (re)assigned here, so
                # the check below sees a stale or unbound value — looks like a
                # missing 'continue'; confirm before relying on this path.

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & kernel.all_inames())
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name))
        store_iname = var_name_gen("%s_store_%s" % (var_name, dim_name))

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Drop length-1 storage axes; only "non-1" axes get loop inames.
        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            scope=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    # NOTE(review): this list is never used — it is rebound below in the
    # "add init_insn_id to depends_on" section.
    new_insns = []

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        init_expression = init_expression  # no-op; kept for symmetry
        # Substitute the placeholder 'base' with the actual array access.
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                forced_iname_deps=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    # Does any rewritten instruction write the buffer? If not, no store
    # instruction is needed.
    did_write = False
    for insn_id in aar.modified_insn_ids:
        insn = kernel.id_to_insn[insn_id]
        if buf_var_name in insn.assignee_var_names():
            did_write = True

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is None:
        store_expression = buf_var_store
    else:
        # Substitute the 'base'/'buffer' placeholders with actual accesses.
        store_expression = SubstitutionMapper(
                make_subst_func({
                    "base": store_target,
                    "buffer": buf_var_store,
                    }))(store_expression)

    if store_expression is not False:
        from loopy.kernel.data import Assignment
        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(
                        based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([init_insn_id]),
                    assignee=store_target,
                    expression=store_expression,
                    forced_iname_deps=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))
    else:
        did_write = False

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        # No store: the store inames are unused, remove their tags.
        for iname in store_inames:
            del new_iname_to_tag[iname]

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        from loopy.preprocess import prepare_for_caching
        buffer_array_cache[cache_key] = prepare_for_caching(kernel)

    return kernel
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """Split the given axis of each listed array into an outer and an
    inner axis of length *count*, rewriting all accesses accordingly.

    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_inames`

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good
    reason for this--this routine is just not smart enough to deal with
    this.)

    :raises RuntimeError: on malformed split tuples, duplicate arrays,
        unknown dim tags, non-fixed-stride axes, or unrecognized *order*.
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in args_and_axes

    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            raise RuntimeError("split instruction '%s' not understood" % rest)

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = {
            tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes}

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in array_to_rest.items():
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        from pytools import div_ceil

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)

            if order == "F":
                new_shape.insert(axis+1, outer_len)
            elif order == "C":
                new_shape.insert(axis, outer_len)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        from loopy.kernel.array import FixedStrideArrayDimTag
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        old_stride = old_dim_tag.stride
        outer_stride = count*old_stride

        if order == "F":
            new_dim_tags.insert(axis+1, FixedStrideArrayDimTag(outer_stride))
        elif order == "C":
            new_dim_tags.insert(axis, FixedStrideArrayDimTag(outer_stride))
        else:
            raise RuntimeError("order '%s' not understood" % order)

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            outer_name = existing_name + "_outer"

            if order == "F":
                new_dim_names.insert(axis+1, outer_name)
            elif order == "C":
                new_dim_names.insert(axis, outer_name)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    # maps split inames to their (outer, inner) replacements
    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # Rewrite a single subscript of one of the split arrays.
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError(
                        "found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            split_iname = idx[axis_nr].name
            assert split_iname in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname + "_outer")
                inner_iname = var_name_gen(split_iname + "_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        # BUGFIX: previously inserted at 'axis' (leaked from the array
        # metadata loop above), which pointed at the *last* iterated
        # array's axis--wrong whenever multiple arrays are split at
        # different axes. Use this access's own axis_nr instead.
        if order == "F":
            idx.insert(axis_nr+1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(array_to_rest.keys()), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in split_vars.items():
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel
def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
    """Return a copy of *kernel* in which the function names of pymbolic
    calls are replaced according to *pymbolic_calls_to_new_names*.

    :arg pymbolic_calls_to_new_names: A mapping from instances of
        :class:`pymbolic.primitives.Call` to :class:`str`.

    For example, with a mapping of the form ::

        {Call(ResolvedFunction(Variable('sin')),
            (Subscript(Variable('x'), Variable('i')),)): 'sin_1'}

    an instruction ``y[i] = ResolvedFunction('sin')(x[i])`` becomes
    ``y[i] = ResolvedFunction('sin_1')(x[i])``.
    """
    # Expand substitution rules so that call sites inside rules are
    # recognized, then rewrite matching calls under a fresh rule-mapping
    # context.
    ctx = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    expander = SubstitutionRuleExpander(kernel.substitutions)
    renamer = FunctionNameChanger(
            ctx, pymbolic_calls_to_new_names, expander)

    renamed_kernel = renamer.map_kernel(kernel)
    return ctx.finish_kernel(renamed_kernel)
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"):
    """Split axis *axis_nr* of *array_name* into an outer axis and an inner
    axis of length *count*, adjusting the array's shape, dim tags and dim
    names and rewriting every access to it.

    :arg order: ``"C"`` places the new outer axis before the original axis,
        ``"F"`` after it.
    :raises RuntimeError: if dim tags are unknown, the axis is not tagged
        fixed-stride, or *order* is not understood.
    """
    if count == 1:
        return kernel

    def place_outer(entries, outer_entry):
        # Insert the new outer-axis entry relative to the original axis
        # position, according to *order*.
        if order == "F":
            entries.insert(axis_nr+1, outer_entry)
        elif order == "C":
            entries.insert(axis_nr, outer_entry)
        else:
            raise RuntimeError("order '%s' not understood" % order)

    # {{{ adjust array metadata

    from loopy.kernel.tools import ArrayChanger

    changer = ArrayChanger(kernel, array_name)
    array = changer.get()

    from pytools import div_ceil

    # shape: length L at axis_nr becomes (ceil(L/count), count)
    updated_shape = array.shape
    if updated_shape is not None:
        updated_shape = list(updated_shape)
        full_len = updated_shape[axis_nr]
        updated_shape[axis_nr] = count
        place_outer(updated_shape, div_ceil(full_len, count))
        updated_shape = tuple(updated_shape)

    # dim tags: the outer axis strides 'count' times as far
    if array.dim_tags is None:
        raise RuntimeError("dim_tags of '%s' are not known" % array_name)
    updated_dim_tags = list(array.dim_tags)

    split_tag = array.dim_tags[axis_nr]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(split_tag, FixedStrideArrayDimTag):
        raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                % (axis_nr, array_name))

    place_outer(updated_dim_tags,
            FixedStrideArrayDimTag(count*split_tag.stride))
    updated_dim_tags = tuple(updated_dim_tags)

    # dim names: "foo" becomes "foo_outer"/"foo_inner"
    updated_dim_names = array.dim_names
    if updated_dim_names is not None:
        updated_dim_names = list(updated_dim_names)
        base_name = updated_dim_names[axis_nr]
        updated_dim_names[axis_nr] = base_name + "_inner"
        place_outer(updated_dim_names, base_name + "_outer")
        updated_dim_names = tuple(updated_dim_names)

    kernel = changer.with_changed_array(array.copy(
        shape=updated_shape, dim_tags=updated_dim_tags,
        dim_names=updated_dim_names))

    # }}}

    # {{{ rewrite accesses

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # Replace index i at axis_nr with (i // count, i % count),
        # positioned according to *order*.
        subscript = expr.index
        if not isinstance(subscript, tuple):
            subscript = (subscript,)
        subscript = list(subscript)

        original_index = subscript[axis_nr]

        from loopy.symbolic import simplify_using_aff
        inner = simplify_using_aff(kernel, original_index % count)
        outer = simplify_using_aff(kernel, original_index // count)

        subscript[axis_nr] = inner
        place_outer(subscript, outer)

        return expr.aggregate.index(tuple(subscript))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    split_helper = ArrayAxisSplitHelper(rule_mapping_context,
            set([array_name]), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(split_helper.map_kernel(kernel))

    # }}}

    return kernel
def split_iname(kernel, split_iname, inner_length,
        outer_iname=None, inner_iname=None,
        outer_tag=None, inner_tag=None,
        slabs=(0, 0), do_tagged_check=True,
        within=None):
    """Split *split_iname* into an outer and an inner iname such that
    ``split_iname = inner + outer*inner_length``, rewriting domains,
    instructions and substitution rules accordingly.

    :arg inner_length: fixed trip count of the new inner iname.
    :arg outer_iname: name for the outer iname (auto-generated from
        *split_iname* + "_outer" if *None*).
    :arg inner_iname: name for the inner iname (auto-generated from
        *split_iname* + "_inner" if *None*).
    :arg outer_tag: implementation tag applied to the outer iname.
    :arg inner_tag: implementation tag applied to the inner iname.
    :arg slabs: slab increments recorded for the outer iname.
    :arg do_tagged_check: if true, refuse to split an already-tagged iname.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.

    :raises LoopyError: if the iname is already tagged (unless force-sequential).
    :raises ValueError: if *split_iname* is not a known iname.
    """
    existing_tag = kernel.iname_to_tag.get(split_iname)
    from loopy.kernel.data import ForceSequentialTag
    if do_tagged_check and (
            existing_tag is not None
            and not isinstance(existing_tag, ForceSequentialTag)):
        raise LoopyError("cannot split already tagged iname '%s'" % split_iname)

    if split_iname not in kernel.all_inames():
        raise ValueError("cannot split loop for unknown variable '%s'"
                % split_iname)

    applied_iname_rewrites = kernel.applied_iname_rewrites[:]

    vng = kernel.get_var_name_generator()

    if outer_iname is None:
        outer_iname = vng(split_iname+"_outer")
    if inner_iname is None:
        inner_iname = vng(split_iname+"_inner")

    def process_set(s):
        # Add outer/inner dims to any domain containing split_iname and
        # constrain them via split_iname = inner + inner_length*outer.
        var_dict = s.get_var_dict()

        if split_iname not in var_dict:
            return s

        orig_dim_type, _ = var_dict[split_iname]

        outer_var_nr = s.dim(orig_dim_type)
        inner_var_nr = s.dim(orig_dim_type)+1

        s = s.add_dims(orig_dim_type, 2)
        s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
        s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)

        from loopy.isl_helpers import make_slab

        space = s.get_space()
        inner_constraint_set = (
                make_slab(space, inner_iname, 0, inner_length)
                # name = inner + length*outer
                .add_constraint(isl.Constraint.eq_from_names(
                    space, {
                        split_iname: 1,
                        inner_iname: -1,
                        outer_iname: -inner_length})))

        name_dim_type, name_idx = space.get_var_dict()[split_iname]
        s = s.intersect(inner_constraint_set)

        # Project out the original iname only for a global (unrestricted)
        # split; with a 'within' match it must survive for unmatched uses.
        if within is None:
            s = s.project_out(name_dim_type, name_idx, 1)

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic import var
    inner = var(inner_iname)
    outer = var(outer_iname)
    new_loop_index = inner + outer*inner_length

    subst_map = {var(split_iname): new_loop_index}
    applied_iname_rewrites.append(subst_map)

    # {{{ update forced_iname deps

    new_insns = []
    for insn in kernel.instructions:
        if split_iname in insn.forced_iname_deps:
            new_forced_iname_deps = (
                    (insn.forced_iname_deps.copy()
                    - frozenset([split_iname]))
                    | frozenset([outer_iname, inner_iname]))
        else:
            new_forced_iname_deps = insn.forced_iname_deps

        insn = insn.copy(
                forced_iname_deps=new_forced_iname_deps)

        new_insns.append(insn)

    # }}}

    iname_slab_increments = kernel.iname_slab_increments.copy()
    iname_slab_increments[outer_iname] = slabs

    new_loop_priority = []
    for prio_iname in kernel.loop_priority:
        if prio_iname == split_iname:
            # the split iname's priority slot is taken over by its halves
            new_loop_priority.append(outer_iname)
            new_loop_priority.append(inner_iname)
        else:
            new_loop_priority.append(prio_iname)

    kernel = kernel.copy(
            domains=new_domains,
            iname_slab_increments=iname_slab_increments,
            instructions=new_insns,
            applied_iname_rewrites=applied_iname_rewrites,
            loop_priority=new_loop_priority)

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ins = _InameSplitter(rule_mapping_context, within,
            split_iname, outer_iname, inner_iname, new_loop_index)

    kernel = ins.map_kernel(kernel)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # Re-apply a pre-existing (force-sequential) tag to both halves, then
    # apply the caller-requested tags.
    if existing_tag is not None:
        kernel = tag_inames(kernel,
                {outer_iname: existing_tag, inner_iname: existing_tag})

    return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, storage_axis_to_tag={}, default_tag="l.auto", dtype=None, fetch_bounding_box=False, temporary_is_local=None, compute_insn_id=None): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an ordered list of *storage_axes* (whose order will describe the axis ordering of the temporary array). :arg subst_use: Describes what to prefetch. The following objects may be given for *subst_use*: * The name of the substitution rule. * The tagged name ("name$tag") of the substitution rule. * A list of invocations of the substitution rule. This list of invocations, when swept across *sweep_inames*, then serves to define the footprint of the precomputation. Invocations may be tagged ("name$tag") to filter out a subset of the usage sites of the substitution rule. (Namely those usage sites that use the same tagged name.) Invocations may be given as a string or as a :class:`pymbolic.primitives.Expression` object. If only one invocation is to be given, then the only entry of the list may be given directly. If the list of invocations generating the footprint is not given, all (tag-matching, if desired) usage sites of the substitution rule are used to determine the footprint. The following cases can arise for each sweep axis: * The axis is an iname that occurs within arguments specified at usage sites of the substitution rule. This case is assumed covered by the storage axes provided for the argument. * The axis is an iname that occurs within the *value* of the rule, but not within its arguments. A new, dedicated storage axis is allocated for such an axis. :arg sweep_inames: A :class:`list` of inames and/or rule argument names to be swept. May also equivalently be a comma-separated string. 
:arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes. May also equivalently be a comma-separated string. :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. :arg temporary_name: The temporary variable name to use for storing the precomputed data. If it does not exist, it will be created. If it does exist, its properties (such as size, type) are checked (and updated, if possible) to match its use. :arg precompute_inames: A tuple of inames to be used to carry out the precomputation. If the specified inames do not already exist, they will be created. If they do already exist, their loop domain is verified against the one required for this precomputation. This tuple may be shorter than the (provided or automatically found) *storage_axes* tuple, in which case names will be automatically created. May also equivalently be a comma-separated string. :arg compute_insn_id: The ID of the instruction performing the precomputation. If `storage_axes` is not specified, it defaults to the arrangement `<direct sweep axes><arguments>` with the direct sweep axes being the slower-varying indices. Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
""" # {{{ check, standardize arguments if isinstance(sweep_inames, str): sweep_inames = [iname.strip() for iname in sweep_inames.split(",")] for iname in sweep_inames: if iname not in kernel.all_inames(): raise RuntimeError("sweep iname '%s' is not a known iname" % iname) sweep_inames = list(sweep_inames) sweep_inames_set = frozenset(sweep_inames) if isinstance(storage_axes, str): storage_axes = [ax.strip() for ax in storage_axes.split(",")] if isinstance(precompute_inames, str): precompute_inames = [iname.strip() for iname in precompute_inames.split(",")] if isinstance(subst_use, str): subst_use = [subst_use] footprint_generators = None subst_name = None subst_tag = None from pymbolic.primitives import Variable, Call from loopy.symbolic import parse, TaggedVariable for use in subst_use: if isinstance(use, str): use = parse(use) if isinstance(use, Call): if footprint_generators is None: footprint_generators = [] footprint_generators.append(use) subst_name_as_expr = use.function else: subst_name_as_expr = use if isinstance(subst_name_as_expr, TaggedVariable): new_subst_name = subst_name_as_expr.name new_subst_tag = subst_name_as_expr.tag elif isinstance(subst_name_as_expr, Variable): new_subst_name = subst_name_as_expr.name new_subst_tag = None else: raise ValueError("unexpected type of subst_name") if (subst_name, subst_tag) == (None, None): subst_name, subst_tag = new_subst_name, new_subst_tag else: if (subst_name, subst_tag) != (new_subst_name, new_subst_tag): raise ValueError("not all uses in subst_use agree " "on rule name and tag") from loopy.context_matching import parse_stack_match within = parse_stack_match(within) from loopy.kernel.data import parse_tag default_tag = parse_tag(default_tag) subst = kernel.substitutions[subst_name] c_subst_name = subst_name.replace(".", "_") # }}} # {{{ process invocations in footprint generators, start access_descriptors if footprint_generators: from pymbolic.primitives import Variable, Call access_descriptors = [] for fpg 
in footprint_generators: if isinstance(fpg, Variable): args = () elif isinstance(fpg, Call): args = fpg.parameters else: raise ValueError("footprint generator must " "be substitution rule invocation") access_descriptors.append( RuleAccessDescriptor( identifier=access_descriptor_id(args, None), args=args )) # }}} # {{{ gather up invocations in kernel code, finish access_descriptors if not footprint_generators: rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) invg = RuleInvocationGatherer( rule_mapping_context, kernel, subst_name, subst_tag, within) del rule_mapping_context import loopy as lp for insn in kernel.instructions: if isinstance(insn, lp.Assignment): invg(insn.assignee, kernel, insn) invg(insn.expression, kernel, insn) access_descriptors = invg.access_descriptors if not access_descriptors: raise RuntimeError("no invocations of '%s' found" % subst_name) # }}} # {{{ find inames used in arguments expanding_usage_arg_deps = set() for accdesc in access_descriptors: for arg in accdesc.args: expanding_usage_arg_deps.update( get_dependencies(arg) & kernel.all_inames()) # }}} var_name_gen = kernel.get_var_name_generator() # {{{ use given / find new storage_axes # extra axes made necessary because they don't occur in the arguments extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps) from loopy.symbolic import SubstitutionRuleExpander submap = SubstitutionRuleExpander(kernel.substitutions) value_inames = get_dependencies( submap(subst.expression) ) & kernel.all_inames() if value_inames - expanding_usage_arg_deps < extra_storage_axes: raise RuntimeError("unreferenced sweep inames specified: " + ", ".join(extra_storage_axes - value_inames - expanding_usage_arg_deps)) new_iname_to_tag = {} if storage_axes is None: storage_axes = [] # Add sweep_inames (in given--rather than arbitrary--order) to # storage_axes *if* they are part of extra_storage_axes. 
for iname in sweep_inames: if iname in extra_storage_axes: extra_storage_axes.remove(iname) storage_axes.append(iname) if extra_storage_axes: if (precompute_inames is not None and len(storage_axes) < len(precompute_inames)): raise LoopyError("must specify a sufficient number of " "storage_axes to uniquely determine the meaning " "of the given precompute_inames. (%d storage_axes " "needed)" % len(precompute_inames)) storage_axes.extend(sorted(extra_storage_axes)) storage_axes.extend(range(len(subst.arguments))) del extra_storage_axes prior_storage_axis_name_dict = {} storage_axis_names = [] storage_axis_sources = [] # number for arg#, or iname # {{{ check for pre-existing precompute_inames if precompute_inames is not None: preexisting_precompute_inames = ( set(precompute_inames) & kernel.all_inames()) else: preexisting_precompute_inames = set() # }}} for i, saxis in enumerate(storage_axes): tag_lookup_saxis = saxis if saxis in subst.arguments: saxis = subst.arguments.index(saxis) storage_axis_sources.append(saxis) if isinstance(saxis, int): # argument index name = old_name = subst.arguments[saxis] else: old_name = saxis name = "%s_%s" % (c_subst_name, old_name) if (precompute_inames is not None and i < len(precompute_inames) and precompute_inames[i]): name = precompute_inames[i] tag_lookup_saxis = name if (name not in preexisting_precompute_inames and var_name_gen.is_name_conflicting(name)): raise RuntimeError("new storage axis name '%s' " "conflicts with existing name" % name) else: name = var_name_gen(name) storage_axis_names.append(name) if name not in preexisting_precompute_inames: new_iname_to_tag[name] = storage_axis_to_tag.get( tag_lookup_saxis, default_tag) prior_storage_axis_name_dict[name] = old_name del storage_axis_to_tag del storage_axes del precompute_inames # }}} # {{{ fill out access_descriptors[...].storage_axis_exprs access_descriptors = [ accdesc.copy( storage_axis_exprs=storage_axis_exprs( storage_axis_sources, accdesc.args)) for accdesc in 
access_descriptors] # }}} expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps) assert expanding_inames <= kernel.all_inames() if storage_axis_names: # {{{ find domain to be changed change_inames = expanding_inames | preexisting_precompute_inames from loopy.kernel.tools import DomainChanger domch = DomainChanger(kernel, change_inames) if domch.leaf_domain_index is not None: # If the sweep inames are at home in parent domains, then we'll add # fetches with loops over copies of these parent inames that will end # up being scheduled *within* loops over these parents. for iname in sweep_inames_set: if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: raise RuntimeError("sweep iname '%s' is not 'at home' in the " "sweep's leaf domain" % iname) # }}} abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames, access_descriptors, len(storage_axis_names)) non1_storage_axis_names = [] for i, saxis in enumerate(storage_axis_names): if abm.non1_storage_axis_flags[i]: non1_storage_axis_names.append(saxis) else: del new_iname_to_tag[saxis] if saxis in preexisting_precompute_inames: raise LoopyError("precompute axis %d (1-based) was " "eliminated as " "having length 1 but also mapped to existing " "iname '%s'" % (i+1, saxis)) mod_domain = domch.domain # {{{ modify the domain, taking into account preexisting inames # inames may already exist in mod_domain, add them primed to start primed_non1_saxis_names = [ iname+"'" for iname in non1_storage_axis_names] mod_domain = abm.augment_domain_with_sweep( domch.domain, primed_non1_saxis_names, boxify_sweep=fetch_bounding_box) check_domain = mod_domain for i, saxis in enumerate(non1_storage_axis_names): var_dict = mod_domain.get_var_dict(isl.dim_type.set) if saxis in preexisting_precompute_inames: # add equality constraint between existing and new variable dt, dim_idx = var_dict[saxis] saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx) dt, dim_idx = var_dict[primed_non1_saxis_names[i]] 
new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx) mod_domain = mod_domain.add_constraint( isl.Constraint.equality_from_aff(new_var_aff - saxis_aff)) # project out the new one mod_domain = mod_domain.project_out(dt, dim_idx, 1) else: # remove the prime from the new variable dt, dim_idx = var_dict[primed_non1_saxis_names[i]] mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis) # {{{ check that we got the desired domain check_domain = check_domain.project_out_except( primed_non1_saxis_names, [isl.dim_type.set]) mod_check_domain = mod_domain # re-add the prime from the new variable var_dict = mod_check_domain.get_var_dict(isl.dim_type.set) for saxis in non1_storage_axis_names: dt, dim_idx = var_dict[saxis] mod_check_domain = mod_check_domain.set_dim_name(dt, dim_idx, saxis+"'") mod_check_domain = mod_check_domain.project_out_except( primed_non1_saxis_names, [isl.dim_type.set]) mod_check_domain, check_domain = isl.align_two( mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints assert mod_check_domain <= check_domain if not check_domain <= mod_check_domain: print(check_domain) print(mod_check_domain) raise LoopyError("domain of preexisting inames does not match " "domain needed for precompute") # }}} # {{{ check that we didn't shrink the original domain # project out the new names from the modified domain orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set)) mod_check_domain = mod_domain.project_out_except( orig_domain_inames, [isl.dim_type.set]) check_domain = domch.domain mod_check_domain, check_domain = isl.align_two( mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints assert mod_check_domain <= check_domain if not check_domain <= mod_check_domain: print(check_domain) print(mod_check_domain) raise LoopyError("original domain got shrunk by applying the precompute") # }}} # }}} new_kernel_domains = domch.get_domains_with(mod_domain) else: # leave kernel 
domains unchanged new_kernel_domains = kernel.domains non1_storage_axis_names = [] abm = NoOpArrayToBufferMap() kernel = kernel.copy(domains=new_kernel_domains) # {{{ set up compute insn if temporary_name is None: temporary_name = var_name_gen(based_on=c_subst_name) assignee = var(temporary_name) if non1_storage_axis_names: assignee = assignee.index( tuple(var(iname) for iname in non1_storage_axis_names)) # {{{ process substitutions on compute instruction storage_axis_subst_dict = {} for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices): if arg_name in non1_storage_axis_names: arg = var(arg_name) else: arg = 0 storage_axis_subst_dict[ prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) from loopy.context_matching import parse_stack_match expr_subst_map = RuleAwareSubstitutionMapper( rule_mapping_context, make_subst_func(storage_axis_subst_dict), within=parse_stack_match(None)) compute_expression = expr_subst_map(subst.expression, kernel, None) # }}} from loopy.kernel.data import Assignment if compute_insn_id is None: compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name) compute_insn = Assignment( id=compute_insn_id, assignee=assignee, expression=compute_expression) # }}} # {{{ substitute rule into expressions in kernel (if within footprint) invr = RuleInvocationReplacer(rule_mapping_context, subst_name, subst_tag, within, access_descriptors, abm, storage_axis_names, storage_axis_sources, non1_storage_axis_names, temporary_name, compute_insn_id) kernel = invr.map_kernel(kernel) kernel = kernel.copy( instructions=[compute_insn] + kernel.instructions) kernel = rule_mapping_context.finish_kernel(kernel) # }}} # {{{ set up temp variable import loopy as lp if dtype is None: dtype = lp.auto else: dtype = np.dtype(dtype) import loopy as lp if temporary_is_local is None: temporary_is_local = lp.auto new_temp_shape = 
tuple(abm.non1_storage_shape) new_temporary_variables = kernel.temporary_variables.copy() if temporary_name not in new_temporary_variables: temp_var = lp.TemporaryVariable( name=temporary_name, dtype=dtype, base_indices=(0,)*len(new_temp_shape), shape=tuple(abm.non1_storage_shape), is_local=temporary_is_local) else: temp_var = new_temporary_variables[temporary_name] # {{{ check and adapt existing temporary if temp_var.dtype is lp.auto: pass elif temp_var.dtype is not lp.auto and dtype is lp.auto: dtype = temp_var.dtype elif temp_var.dtype is not lp.auto and dtype is not lp.auto: if temp_var.dtype != dtype: raise LoopyError("Existing and new dtype of temporary '%s' " "do not match (existing: %s, new: %s)" % (temporary_name, temp_var.dtype, dtype)) temp_var = temp_var.copy(dtype=dtype) if len(temp_var.shape) != len(new_temp_shape): raise LoopyError("Existing and new temporary '%s' do not " "have matching number of dimensions " % (temporary_name, len(temp_var.shape), len(new_temp_shape))) if temp_var.base_indices != (0,) * len(new_temp_shape): raise LoopyError("Existing and new temporary '%s' do not " "have matching number of dimensions " % (temporary_name, len(temp_var.shape), len(new_temp_shape))) new_temp_shape = tuple( max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape)) temp_var = temp_var.copy(shape=new_temp_shape) if temporary_is_local == temp_var.is_local: pass elif temporary_is_local is lp.auto: temporary_is_local = temp_var.is_local elif temp_var.is_local is lp.auto: pass else: raise LoopyError("Existing and new temporary '%s' do not " "have matching values of 'is_local'" % (temporary_name, temp_var.is_local, temporary_is_local)) temp_var = temp_var.copy(is_local=temporary_is_local) # }}} new_temporary_variables[temporary_name] = temp_var kernel = kernel.copy( temporary_variables=new_temporary_variables) # }}} from loopy import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag 
has_automatic_axes = any( isinstance(tag, AutoFitLocalIndexTag) for tag in new_iname_to_tag.values()) if has_automatic_axes: from loopy.kernel.tools import assign_automatic_axes kernel = assign_automatic_axes(kernel) return kernel
def _split_iname_backend(kernel, split_iname,
        fixed_length, fixed_length_is_inner,
        make_new_loop_index,
        outer_iname=None, inner_iname=None,
        outer_tag=None, inner_tag=None,
        slabs=(0, 0), do_tagged_check=True,
        within=None):
    """Split *split_iname* into an outer and an inner iname related by
    ``split_iname = fixed_iname + fixed_length*var_length_iname``, where
    the fixed-length iname is the inner one if *fixed_length_is_inner*
    is true, and the outer one otherwise.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    """
    existing_tag = kernel.iname_to_tag.get(split_iname)

    from loopy.kernel.data import ForceSequentialTag
    # Splitting an already-tagged iname is disallowed (except for
    # ForceSequentialTag), unless the caller opts out of the check.
    if do_tagged_check and (
            existing_tag is not None
            and not isinstance(existing_tag, ForceSequentialTag)):
        raise LoopyError("cannot split already tagged iname '%s'" % split_iname)

    if split_iname not in kernel.all_inames():
        raise ValueError("cannot split loop for unknown variable '%s'" % split_iname)

    # Copy so the rewrite appended below does not mutate the old kernel's list.
    applied_iname_rewrites = kernel.applied_iname_rewrites[:]

    vng = kernel.get_var_name_generator()

    if outer_iname is None:
        outer_iname = vng(split_iname + "_outer")
    if inner_iname is None:
        inner_iname = vng(split_iname + "_inner")

    def process_set(s):
        # Rewrite a single domain: add the outer/inner dims, constrain them
        # to match split_iname, and (if no 'within' filter is given) project
        # out the original iname.
        var_dict = s.get_var_dict()

        if split_iname not in var_dict:
            # This domain does not involve the iname being split.
            return s

        orig_dim_type, _ = var_dict[split_iname]

        # The two new dims are appended at the end of the dim block.
        outer_var_nr = s.dim(orig_dim_type)
        inner_var_nr = s.dim(orig_dim_type)+1

        s = s.add_dims(orig_dim_type, 2)
        s = s.set_dim_name(orig_dim_type, outer_var_nr, outer_iname)
        s = s.set_dim_name(orig_dim_type, inner_var_nr, inner_iname)

        from loopy.isl_helpers import make_slab

        if fixed_length_is_inner:
            fixed_iname, var_length_iname = inner_iname, outer_iname
        else:
            fixed_iname, var_length_iname = outer_iname, inner_iname

        space = s.get_space()
        # Bound the fixed-length iname (0 to fixed_length via make_slab) and
        # tie the three inames together with an equality constraint.
        fixed_constraint_set = (
                make_slab(space, fixed_iname, 0, fixed_length)
                # name = fixed_iname + fixed_length*var_length_iname
                .add_constraint(isl.Constraint.eq_from_names(
                    space, {
                        split_iname: 1,
                        fixed_iname: -1,
                        var_length_iname: -fixed_length})))

        name_dim_type, name_idx = space.get_var_dict()[split_iname]
        s = s.intersect(fixed_constraint_set)

        if within is None:
            # No match filter: the original iname is fully replaced, so it
            # can be projected out of the domain.
            s = s.project_out(name_dim_type, name_idx, 1)

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic import var
    inner = var(inner_iname)
    outer = var(outer_iname)
    new_loop_index = make_new_loop_index(inner, outer)

    # Record the substitution split_iname -> f(inner, outer) for later
    # un-splitting / bookkeeping.
    subst_map = {var(split_iname): new_loop_index}
    applied_iname_rewrites.append(subst_map)

    # {{{ update forced_iname deps

    new_insns = []
    for insn in kernel.instructions:
        if split_iname in insn.within_inames:
            new_within_inames = (
                    (insn.within_inames.copy()
                    - frozenset([split_iname]))
                    | frozenset([outer_iname, inner_iname]))
        else:
            new_within_inames = insn.within_inames

        insn = insn.copy(
                within_inames=new_within_inames)

        new_insns.append(insn)

    # }}}

    iname_slab_increments = kernel.iname_slab_increments.copy()
    iname_slab_increments[outer_iname] = slabs

    # Replace split_iname by outer then inner in the loop priority order.
    new_loop_priority = []
    for prio_iname in kernel.loop_priority:
        if prio_iname == split_iname:
            new_loop_priority.append(outer_iname)
            new_loop_priority.append(inner_iname)
        else:
            new_loop_priority.append(prio_iname)

    kernel = kernel.copy(
            domains=new_domains,
            iname_slab_increments=iname_slab_increments,
            instructions=new_insns,
            applied_iname_rewrites=applied_iname_rewrites,
            loop_priority=new_loop_priority)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    # Rewrite occurrences of split_iname in expressions (instructions and
    # substitution rules) to the new loop index.
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ins = _InameSplitter(rule_mapping_context, within,
            split_iname, outer_iname, inner_iname, new_loop_index)

    kernel = ins.map_kernel(kernel)
    kernel = rule_mapping_context.finish_kernel(kernel)

    if existing_tag is not None:
        # Propagate the pre-existing tag to both halves of the split.
        kernel = tag_inames(kernel,
                {outer_iname: existing_tag, inner_iname: existing_tag})

    return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
class DifferentiationContext(object):
    """Accumulates the state needed to differentiate a kernel with respect
    to the variable *by_name*: newly created args, temporaries, instructions
    and domains, plus a substitution-rule mapping context. The result is
    assembled by :meth:`get_new_kernel`.
    """

    def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix,
            additional_shape):
        self.kernel = kernel
        self.by_name = by_name
        self.diff_iname_prefix = diff_iname_prefix
        # Shape of the extra (differentiation) axes added to each
        # differentiated variable.
        self.additional_shape = additional_shape

        self.imported_outputs = set()
        self.output_to_diff_output = {}

        self.generate_instruction_id = \
                self.kernel.get_instruction_id_generator()

        # Accumulators for pieces of the differentiated kernel.
        self.new_args = []
        self.new_temporary_variables = {}
        self.new_instructions = []
        self.imported_instructions = set()
        self.new_domains = []

        self.rule_mapping_context = SubstitutionRuleMappingContext(
                kernel.substitutions, var_name_gen)

    def get_new_kernel(self):
        """Assemble and return the differentiated kernel from the
        accumulated args, temporaries, instructions and domains.

        Note: the new kernel's instructions are *only* the accumulated
        ones; originals survive only if imported via
        :meth:`import_instruction_and_deps`.
        """
        knl = self.kernel
        new_args = knl.args + self.new_args
        new_temp_vars = knl.temporary_variables.copy()
        new_temp_vars.update(self.new_temporary_variables)

        knl = knl.copy(
                args=new_args,
                temporary_variables=new_temp_vars,
                instructions=self.new_instructions,
                domains=knl.domains + self.new_domains)

        del new_args
        del new_temp_vars

        # Flush any substitution-rule renames gathered along the way.
        knl = self.rule_mapping_context.finish_kernel(knl)

        return knl

    # {{{ kernel gen entrypoints

    def add_diff_inames(self):
        """Create one fresh iname per entry of *additional_shape*, append a
        matching loop domain to :attr:`new_domains`, and return the iname
        names as a tuple.
        """
        diff_inames = tuple(
                self.rule_mapping_context.make_unique_var_name(
                    self.diff_iname_prefix+str(i))
                for i in range(len(self.additional_shape)))

        diff_parameters = set()
        from loopy.symbolic import get_dependencies
        for s in self.additional_shape:
            # Symbols occurring in the shape become domain parameters.
            diff_parameters.update(get_dependencies(s))

        diff_domain = isl.BasicSet(
                "[%s] -> {[%s]}"
                % (", ".join(diff_parameters), ", ".join(diff_inames)))

        for i, diff_iname in enumerate(diff_inames):
            # Bound each diff iname by its shape entry (via make_slab).
            diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, self.additional_shape[i])

        self.new_domains.append(diff_domain)

        return diff_inames

    # }}}

    def import_instruction_and_deps(self, insn_id):
        """Copy instruction *insn_id* (and, recursively, everything it
        depends on) from the original kernel into the new instruction list.
        Idempotent per instruction id.
        """
        if insn_id in self.imported_instructions:
            return

        insn = self.kernel.id_to_insn[insn_id]
        self.new_instructions.append(insn)
        self.imported_instructions.add(insn_id)

        # Walk the expression so the rule mapping context sees any
        # substitution-rule invocations it contains.
        id_map = RuleAwareIdentityMapper(self.rule_mapping_context)

        if isinstance(insn, lp.Assignment):
            id_map(insn.expression, self.kernel, insn)
        else:
            raise RuntimeError("do not know how to deal with "
                    "instruction of type %s" % type(insn))

        for dep in insn.depends_on:
            self.import_instruction_and_deps(dep)

    def import_output_var(self, var_name):
        """Import the (single) instruction writing *var_name*, along with
        its dependencies. No-op if nothing writes it; error if more than
        one instruction does.
        """
        writers = self.kernel.writer_map().get(var_name, [])

        if len(writers) > 1:
            raise LoopyError("%s is written in more than one place"
                    % var_name)

        if not writers:
            return

        insn_id, = writers
        self.import_instruction_and_deps(insn_id)

    def get_diff_var(self, var_name):
        """
        :return: a string containing the name of a new variable
            holding the derivative of *var_name* by the desired
            *diff_context.by_name*, or *None* if no dependency exists.
        """
        new_var_name = self.rule_mapping_context.make_unique_var_name(
                var_name + "_d" + self.by_name)

        writers = self.kernel.writer_map().get(var_name, [])

        if not writers:
            # FIXME: There should be hooks to supply earlier dvar_dby
            # This would be the spot to think about them.
            return None

        if len(writers) > 1:
            raise LoopyError("%s is written in more than one place"
                    % var_name)

        orig_writer_id, = writers
        orig_writer_insn = self.kernel.id_to_insn[orig_writer_id]

        diff_inames = self.add_diff_inames()
        diff_iname_exprs = tuple(var(diname) for diname in diff_inames)

        # {{{ write code

        diff_mapper = LoopyDiffMapper(self.rule_mapping_context, self,
                diff_inames)

        diff_expr = diff_mapper(orig_writer_insn.expression,
                self.kernel, orig_writer_insn)

        if not diff_expr:
            # Falsy derivative (e.g. 0): no dependency on by_name.
            return None

        assert isinstance(orig_writer_insn, lp.Assignment)
        if isinstance(orig_writer_insn.assignee, p.Subscript):
            lhs_ind = orig_writer_insn.assignee.index_tuple
        elif isinstance(orig_writer_insn.assignee, p.Variable):
            lhs_ind = ()
        else:
            raise LoopyError(
                    "Unrecognized LHS type in differentiation: %s"
                    % type(orig_writer_insn.assignee).__name__)

        new_insn_id = self.generate_instruction_id()
        # The derivative is indexed by the original LHS indices plus the
        # new differentiation inames.
        insn = lp.Assignment(
                id=new_insn_id,
                assignee=var(new_var_name)[lhs_ind + diff_iname_exprs],
                expression=diff_expr,
                within_inames=(
                    orig_writer_insn.within_inames | frozenset(diff_inames)))

        self.new_instructions.append(insn)

        # }}}

        # {{{ manage variable declaration

        if var_name in self.kernel.arg_dict:
            arg = self.kernel.arg_dict[var_name]
            orig_shape = arg.shape

        elif var_name in self.kernel.temporary_variables:
            tv = self.kernel.temporary_variables[var_name]
            orig_shape = tv.shape

        else:
            raise ValueError("%s: variable not found" % var_name)

        # Derivative variable: original shape extended by the diff axes.
        shape = orig_shape + self.additional_shape
        dim_tags = ("c", ) * len(shape)

        if var_name in self.kernel.arg_dict:
            self.new_args.append(
                    lp.GlobalArg(
                        new_var_name,
                        arg.dtype,
                        shape=shape,
                        dim_tags=dim_tags,
                        ))

        elif var_name in self.kernel.temporary_variables:
            self.new_temporary_variables[new_var_name] = lp.TemporaryVariable(
                    new_var_name, tv.dtype, shape=shape, dim_tags=dim_tags)

        # }}}

        return new_var_name
def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"):
    """Takes in a kernel that carries out an operation and returns a kernel
    that carries out a batch of these operations.

    :arg nbatches: the number of batches. May be a constant non-negative
        integer or a string, which will be added as an integer argument.
    :arg batch_varying_args: a list of argument names that vary per batch.
        Each such variable will have a batch index added.
    :arg batch_iname_prefix: prefix for the name of the newly created
        batch iname.
    """
    from pymbolic import var

    vng = knl.get_var_name_generator()
    batch_iname = vng(batch_iname_prefix)
    batch_iname_expr = var(batch_iname)

    new_args = []

    batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % {
            "iname": batch_iname,
            "nbatches": nbatches,
            }

    if not isinstance(nbatches, int):
        # Symbolic batch count: expose it as an isl parameter and as an
        # integer kernel argument.
        batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
        new_args.append(ValueArg(nbatches, dtype=knl.index_dtype))

        nbatches_expr = var(nbatches)
    else:
        nbatches_expr = nbatches

    batch_domain = isl.BasicSet(batch_dom_str)
    new_domains = [batch_domain] + knl.domains

    for arg in knl.args:
        if arg.name in batch_varying_args:
            if isinstance(arg, ValueArg):
                # A batch-varying scalar becomes a one-axis array.
                arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,),
                        dim_tags="c")
            else:
                # Prepend the batch axis to the existing shape.
                arg = arg.copy(
                        shape=(nbatches_expr,) + arg.shape,
                        dim_tags=("c",) * (len(arg.shape) + 1))

        new_args.append(arg)

    new_temps = {}

    for temp in six.itervalues(knl.temporary_variables):
        # BUG FIX: dim_tags previously used len(arg.shape)+1, where 'arg'
        # was the stale loop variable left over from the argument loop
        # above -- wrong rank for temporaries whose rank differs from the
        # last argument's, and a NameError when knl.args is empty. Use the
        # temporary's own shape.
        new_temps[temp.name] = temp.copy(
                shape=(nbatches_expr,) + temp.shape,
                dim_tags=("c",) * (len(temp.shape) + 1))

    knl = knl.copy(
            domains=new_domains,
            args=new_args,
            temporary_variables=new_temps)

    # Rewrite accesses to batch-varying variables to include the batch index.
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, vng)
    bvc = _BatchVariableChanger(rule_mapping_context,
            knl, batch_varying_args, batch_iname_expr)
    return rule_mapping_context.finish_kernel(
            bvc.map_kernel(knl))
def precompute( kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. default_tag=_not_provided, dtype=None, fetch_bounding_box=False, temporary_address_space=None, compute_insn_id=None, **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an ordered list of *storage_axes* (whose order will describe the axis ordering of the temporary array). :arg subst_use: Describes what to prefetch. The following objects may be given for *subst_use*: * The name of the substitution rule. * The tagged name ("name$tag") of the substitution rule. * A list of invocations of the substitution rule. This list of invocations, when swept across *sweep_inames*, then serves to define the footprint of the precomputation. Invocations may be tagged ("name$tag") to filter out a subset of the usage sites of the substitution rule. (Namely those usage sites that use the same tagged name.) Invocations may be given as a string or as a :class:`pymbolic.primitives.Expression` object. If only one invocation is to be given, then the only entry of the list may be given directly. If the list of invocations generating the footprint is not given, all (tag-matching, if desired) usage sites of the substitution rule are used to determine the footprint. The following cases can arise for each sweep axis: * The axis is an iname that occurs within arguments specified at usage sites of the substitution rule. This case is assumed covered by the storage axes provided for the argument. * The axis is an iname that occurs within the *value* of the rule, but not within its arguments. A new, dedicated storage axis is allocated for such an axis. 
:arg sweep_inames: A :class:`list` of inames to be swept. May also equivalently be a comma-separated string. :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. :arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes. May also equivalently be a comma-separated string. :arg temporary_name: The temporary variable name to use for storing the precomputed data. If it does not exist, it will be created. If it does exist, its properties (such as size, type) are checked (and updated, if possible) to match its use. :arg precompute_inames: A tuple of inames to be used to carry out the precomputation. If the specified inames do not already exist, they will be created. If they do already exist, their loop domain is verified against the one required for this precomputation. This tuple may be shorter than the (provided or automatically found) *storage_axes* tuple, in which case names will be automatically created. May also equivalently be a comma-separated string. :arg precompute_outer_inames: A :class:`frozenset` of inames within which the compute instruction is nested. If *None*, make an educated guess. May also be specified as a comma-separated string. :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the inames created to perform the precomputation. The current default will make them local axes and automatically split them to fit the work group size, but this default will disappear in favor of simply leaving them untagged in 2019. For 2018, a warning will be issued if no *default_tag* is specified. :arg compute_insn_id: The ID of the instruction generated to perform the precomputation. If `storage_axes` is not specified, it defaults to the arrangement `<direct sweep axes><arguments>` with the direct sweep axes being the slower-varying indices. Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
""" # {{{ unify temporary_address_space / temporary_scope temporary_scope = kwargs.pop("temporary_scope", None) from loopy.kernel.data import AddressSpace if temporary_scope is not None: from warnings import warn warn( "temporary_scope is deprecated. Use temporary_address_space instead", DeprecationWarning, stacklevel=2) if temporary_address_space is not None: raise LoopyError( "may not specify both temporary_address_space and " "temporary_scope") temporary_address_space = temporary_scope del temporary_scope # }}} if kwargs: raise TypeError("unrecognized keyword arguments: %s" % ", ".join(kwargs.keys())) # {{{ check, standardize arguments if isinstance(sweep_inames, str): sweep_inames = [iname.strip() for iname in sweep_inames.split(",")] for iname in sweep_inames: if iname not in kernel.all_inames(): raise RuntimeError("sweep iname '%s' is not a known iname" % iname) sweep_inames = list(sweep_inames) sweep_inames_set = frozenset(sweep_inames) if isinstance(storage_axes, str): storage_axes = [ax.strip() for ax in storage_axes.split(",")] if isinstance(precompute_inames, str): precompute_inames = [ iname.strip() for iname in precompute_inames.split(",") ] if isinstance(precompute_outer_inames, str): precompute_outer_inames = frozenset( iname.strip() for iname in precompute_outer_inames.split(",")) if isinstance(subst_use, str): subst_use = [subst_use] footprint_generators = None subst_name = None subst_tag = None from pymbolic.primitives import Variable, Call from loopy.symbolic import parse, TaggedVariable for use in subst_use: if isinstance(use, str): use = parse(use) if isinstance(use, Call): if footprint_generators is None: footprint_generators = [] footprint_generators.append(use) subst_name_as_expr = use.function else: subst_name_as_expr = use if isinstance(subst_name_as_expr, TaggedVariable): new_subst_name = subst_name_as_expr.name new_subst_tag = subst_name_as_expr.tag elif isinstance(subst_name_as_expr, Variable): new_subst_name = subst_name_as_expr.name 
new_subst_tag = None else: raise ValueError("unexpected type of subst_name") if (subst_name, subst_tag) == (None, None): subst_name, subst_tag = new_subst_name, new_subst_tag else: if (subst_name, subst_tag) != (new_subst_name, new_subst_tag): raise ValueError("not all uses in subst_use agree " "on rule name and tag") from loopy.match import parse_stack_match within = parse_stack_match(within) try: subst = kernel.substitutions[subst_name] except KeyError: raise LoopyError("substitution rule '%s' not found" % subst_name) c_subst_name = subst_name.replace(".", "_") # {{{ handle default_tag from loopy.transform.data import _not_provided \ as transform_data_not_provided if default_tag is _not_provided or default_tag is transform_data_not_provided: # no need to warn for scalar precomputes if sweep_inames: from warnings import warn warn( "Not specifying default_tag is deprecated, and default_tag " "will become mandatory in 2019.x. " "Pass 'default_tag=\"l.auto\" to match the current default, " "or Pass 'default_tag=None to leave the loops untagged, which " "is the recommended behavior.", DeprecationWarning, stacklevel=( # In this case, we came here through add_prefetch. Increase # the stacklevel. 
3 if default_tag is transform_data_not_provided else 2)) default_tag = "l.auto" from loopy.kernel.data import parse_tag default_tag = parse_tag(default_tag) # }}} # }}} # {{{ process invocations in footprint generators, start access_descriptors if footprint_generators: from pymbolic.primitives import Variable, Call access_descriptors = [] for fpg in footprint_generators: if isinstance(fpg, Variable): args = () elif isinstance(fpg, Call): args = fpg.parameters else: raise ValueError("footprint generator must " "be substitution rule invocation") access_descriptors.append( RuleAccessDescriptor(identifier=access_descriptor_id( args, None), args=args)) # }}} # {{{ gather up invocations in kernel code, finish access_descriptors if not footprint_generators: rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) invg = RuleInvocationGatherer(rule_mapping_context, kernel, subst_name, subst_tag, within) del rule_mapping_context import loopy as lp for insn in kernel.instructions: if isinstance(insn, lp.MultiAssignmentBase): for assignee in insn.assignees: invg(assignee, kernel, insn) invg(insn.expression, kernel, insn) access_descriptors = invg.access_descriptors if not access_descriptors: raise RuntimeError("no invocations of '%s' found" % subst_name) # }}} # {{{ find inames used in arguments expanding_usage_arg_deps = set() for accdesc in access_descriptors: for arg in accdesc.args: expanding_usage_arg_deps.update( get_dependencies(arg) & kernel.all_inames()) # }}} var_name_gen = kernel.get_var_name_generator() # {{{ use given / find new storage_axes # extra axes made necessary because they don't occur in the arguments extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps) from loopy.symbolic import SubstitutionRuleExpander submap = SubstitutionRuleExpander(kernel.substitutions) value_inames = (get_dependencies(submap(subst.expression)) - frozenset(subst.arguments)) & kernel.all_inames() if value_inames - 
expanding_usage_arg_deps < extra_storage_axes: raise RuntimeError("unreferenced sweep inames specified: " + ", ".join(extra_storage_axes - value_inames - expanding_usage_arg_deps)) new_iname_to_tag = {} if storage_axes is None: storage_axes = [] # Add sweep_inames (in given--rather than arbitrary--order) to # storage_axes *if* they are part of extra_storage_axes. for iname in sweep_inames: if iname in extra_storage_axes: extra_storage_axes.remove(iname) storage_axes.append(iname) if extra_storage_axes: if (precompute_inames is not None and len(storage_axes) < len(precompute_inames)): raise LoopyError( "must specify a sufficient number of " "storage_axes to uniquely determine the meaning " "of the given precompute_inames. (%d storage_axes " "needed)" % len(precompute_inames)) storage_axes.extend(sorted(extra_storage_axes)) storage_axes.extend(range(len(subst.arguments))) del extra_storage_axes prior_storage_axis_name_dict = {} storage_axis_names = [] storage_axis_sources = [] # number for arg#, or iname # {{{ check for pre-existing precompute_inames if precompute_inames is not None: preexisting_precompute_inames = (set(precompute_inames) & kernel.all_inames()) else: preexisting_precompute_inames = set() # }}} for i, saxis in enumerate(storage_axes): tag_lookup_saxis = saxis if saxis in subst.arguments: saxis = subst.arguments.index(saxis) storage_axis_sources.append(saxis) if isinstance(saxis, int): # argument index name = old_name = subst.arguments[saxis] else: old_name = saxis name = "%s_%s" % (c_subst_name, old_name) if (precompute_inames is not None and i < len(precompute_inames) and precompute_inames[i]): name = precompute_inames[i] tag_lookup_saxis = name if (name not in preexisting_precompute_inames and var_name_gen.is_name_conflicting(name)): raise RuntimeError("new storage axis name '%s' " "conflicts with existing name" % name) else: name = var_name_gen(name) storage_axis_names.append(name) if name not in preexisting_precompute_inames: 
new_iname_to_tag[name] = storage_axis_to_tag.get( tag_lookup_saxis, default_tag) prior_storage_axis_name_dict[name] = old_name del storage_axis_to_tag del storage_axes del precompute_inames # }}} # {{{ fill out access_descriptors[...].storage_axis_exprs access_descriptors = [ accdesc.copy(storage_axis_exprs=storage_axis_exprs( storage_axis_sources, accdesc.args)) for accdesc in access_descriptors ] # }}} expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps) assert expanding_inames <= kernel.all_inames() if storage_axis_names: # {{{ find domain to be changed change_inames = expanding_inames | preexisting_precompute_inames from loopy.kernel.tools import DomainChanger domch = DomainChanger(kernel, change_inames) if domch.leaf_domain_index is not None: # If the sweep inames are at home in parent domains, then we'll add # fetches with loops over copies of these parent inames that will end # up being scheduled *within* loops over these parents. for iname in sweep_inames_set: if kernel.get_home_domain_index( iname) != domch.leaf_domain_index: raise RuntimeError( "sweep iname '%s' is not 'at home' in the " "sweep's leaf domain" % iname) # }}} abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames, access_descriptors, len(storage_axis_names)) non1_storage_axis_names = [] for i, saxis in enumerate(storage_axis_names): if abm.non1_storage_axis_flags[i]: non1_storage_axis_names.append(saxis) else: del new_iname_to_tag[saxis] if saxis in preexisting_precompute_inames: raise LoopyError( "precompute axis %d (1-based) was " "eliminated as " "having length 1 but also mapped to existing " "iname '%s'" % (i + 1, saxis)) mod_domain = domch.domain # {{{ modify the domain, taking into account preexisting inames # inames may already exist in mod_domain, add them primed to start primed_non1_saxis_names = [ iname + "'" for iname in non1_storage_axis_names ] mod_domain = abm.augment_domain_with_sweep( domch.domain, primed_non1_saxis_names, 
boxify_sweep=fetch_bounding_box) check_domain = mod_domain for i, saxis in enumerate(non1_storage_axis_names): var_dict = mod_domain.get_var_dict(isl.dim_type.set) if saxis in preexisting_precompute_inames: # add equality constraint between existing and new variable dt, dim_idx = var_dict[saxis] saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx) dt, dim_idx = var_dict[primed_non1_saxis_names[i]] new_var_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx) mod_domain = mod_domain.add_constraint( isl.Constraint.equality_from_aff(new_var_aff - saxis_aff)) # project out the new one mod_domain = mod_domain.project_out(dt, dim_idx, 1) else: # remove the prime from the new variable dt, dim_idx = var_dict[primed_non1_saxis_names[i]] mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis) def add_assumptions(d): assumption_non_param = isl.BasicSet.from_params(kernel.assumptions) assumptions, domain = isl.align_two(assumption_non_param, d) return assumptions & domain # {{{ check that we got the desired domain check_domain = add_assumptions( check_domain.project_out_except(primed_non1_saxis_names, [isl.dim_type.set])) mod_check_domain = add_assumptions(mod_domain) # re-add the prime from the new variable var_dict = mod_check_domain.get_var_dict(isl.dim_type.set) for saxis in non1_storage_axis_names: dt, dim_idx = var_dict[saxis] mod_check_domain = mod_check_domain.set_dim_name( dt, dim_idx, saxis + "'") mod_check_domain = mod_check_domain.project_out_except( primed_non1_saxis_names, [isl.dim_type.set]) mod_check_domain, check_domain = isl.align_two(mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints assert mod_check_domain <= check_domain if not check_domain <= mod_check_domain: print(check_domain) print(mod_check_domain) raise LoopyError("domain of preexisting inames does not match " "domain needed for precompute") # }}} # {{{ check that we didn't shrink the original domain # project out the new names from the 
modified domain orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set)) mod_check_domain = add_assumptions( mod_domain.project_out_except(orig_domain_inames, [isl.dim_type.set])) check_domain = add_assumptions(domch.domain) mod_check_domain, check_domain = isl.align_two(mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints assert mod_check_domain <= check_domain if not check_domain <= mod_check_domain: print(check_domain) print(mod_check_domain) raise LoopyError( "original domain got shrunk by applying the precompute") # }}} # }}} new_kernel_domains = domch.get_domains_with(mod_domain) else: # leave kernel domains unchanged new_kernel_domains = kernel.domains non1_storage_axis_names = [] abm = NoOpArrayToBufferMap() kernel = kernel.copy(domains=new_kernel_domains) # {{{ set up compute insn if temporary_name is None: temporary_name = var_name_gen(based_on=c_subst_name) assignee = var(temporary_name) if non1_storage_axis_names: assignee = assignee[tuple( var(iname) for iname in non1_storage_axis_names)] # {{{ process substitutions on compute instruction storage_axis_subst_dict = {} for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices): if arg_name in non1_storage_axis_names: arg = var(arg_name) else: arg = 0 storage_axis_subst_dict[prior_storage_axis_name_dict.get( arg_name, arg_name)] = arg + bi rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) from loopy.match import parse_stack_match expr_subst_map = RuleAwareSubstitutionMapper( rule_mapping_context, make_subst_func(storage_axis_subst_dict), within=parse_stack_match(None)) compute_expression = expr_subst_map(subst.expression, kernel, None) # }}} from loopy.kernel.data import Assignment if compute_insn_id is None: compute_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name) compute_insn = Assignment( id=compute_insn_id, assignee=assignee, expression=compute_expression, # 
within_inames determined below ) compute_dep_id = compute_insn_id added_compute_insns = [compute_insn] if temporary_address_space == AddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name + "_barrier") from loopy.kernel.instruction import BarrierInstruction barrier_insn = BarrierInstruction(id=barrier_insn_id, depends_on=frozenset( [compute_insn_id]), synchronization_kind="global", mem_kind="global") compute_dep_id = barrier_insn_id added_compute_insns.append(barrier_insn) # }}} # {{{ substitute rule into expressions in kernel (if within footprint) from loopy.symbolic import SubstitutionRuleExpander expander = SubstitutionRuleExpander(kernel.substitutions) invr = RuleInvocationReplacer(rule_mapping_context, subst_name, subst_tag, within, access_descriptors, abm, storage_axis_names, storage_axis_sources, non1_storage_axis_names, temporary_name, compute_insn_id, compute_dep_id, compute_read_variables=get_dependencies( expander(compute_expression))) kernel = invr.map_kernel(kernel) kernel = kernel.copy(instructions=added_compute_insns + kernel.instructions) kernel = rule_mapping_context.finish_kernel(kernel) # }}} # {{{ add dependencies to compute insn kernel = kernel.copy(instructions=[ insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) if insn. 
id == compute_insn_id else insn for insn in kernel.instructions ]) # }}} # {{{ propagate storage iname subst to dependencies of compute instructions from loopy.kernel.tools import find_recursive_dependencies compute_deps = find_recursive_dependencies(kernel, frozenset([compute_insn_id])) # FIXME: Need to verify that there are no outside dependencies # on compute_deps prior_storage_axis_names = frozenset(storage_axis_subst_dict) new_insns = [] for insn in kernel.instructions: if (insn.id in compute_deps and insn.within_inames & prior_storage_axis_names): insn = (insn.with_transformed_expressions( lambda expr: expr_subst_map(expr, kernel, insn)).copy( within_inames=frozenset( storage_axis_subst_dict.get(iname, var(iname)).name for iname in insn.within_inames))) new_insns.append(insn) else: new_insns.append(insn) kernel = kernel.copy(instructions=new_insns) # }}} # {{{ determine inames for compute insn if precompute_outer_inames is None: from loopy.kernel.tools import guess_iname_deps_based_on_var_use precompute_outer_inames = ( frozenset(non1_storage_axis_names) | frozenset((expanding_usage_arg_deps | value_inames) - sweep_inames_set) | guess_iname_deps_based_on_var_use(kernel, compute_insn)) else: if not isinstance(precompute_outer_inames, frozenset): raise TypeError("precompute_outer_inames must be a frozenset") precompute_outer_inames = precompute_outer_inames \ | frozenset(non1_storage_axis_names) kernel = kernel.copy(instructions=[ insn.copy(within_inames=precompute_outer_inames) if insn.id == compute_insn_id else insn for insn in kernel.instructions ]) # }}} # {{{ set up temp variable import loopy as lp if dtype is not None: dtype = np.dtype(dtype) if temporary_address_space is None: temporary_address_space = lp.auto new_temp_shape = tuple(abm.non1_storage_shape) new_temporary_variables = kernel.temporary_variables.copy() if temporary_name not in new_temporary_variables: temp_var = lp.TemporaryVariable( name=temporary_name, dtype=dtype, base_indices=(0, ) * 
len(new_temp_shape), shape=tuple(abm.non1_storage_shape), address_space=temporary_address_space, dim_names=tuple(non1_storage_axis_names)) else: temp_var = new_temporary_variables[temporary_name] # {{{ check and adapt existing temporary if temp_var.dtype is lp.auto: pass elif temp_var.dtype is not lp.auto and dtype is lp.auto: dtype = temp_var.dtype elif temp_var.dtype is not lp.auto and dtype is not lp.auto: if temp_var.dtype != dtype: raise LoopyError("Existing and new dtype of temporary '%s' " "do not match (existing: %s, new: %s)" % (temporary_name, temp_var.dtype, dtype)) temp_var = temp_var.copy(dtype=dtype) if len(temp_var.shape) != len(new_temp_shape): raise LoopyError( "Existing and new temporary '%s' do not " "have matching number of dimensions ('%d' vs. '%d') " % (temporary_name, len(temp_var.shape), len(new_temp_shape))) if temp_var.base_indices != (0, ) * len(new_temp_shape): raise LoopyError( "Existing and new temporary '%s' do not " "have matching number of dimensions ('%d' vs. 
'%d') " % (temporary_name, len(temp_var.shape), len(new_temp_shape))) new_temp_shape = tuple( max(i, ex_i) for i, ex_i in zip(new_temp_shape, temp_var.shape)) temp_var = temp_var.copy(shape=new_temp_shape) if temporary_address_space == temp_var.address_space: pass elif temporary_address_space is lp.auto: temporary_address_space = temp_var.address_space elif temp_var.address_space is lp.auto: pass else: raise LoopyError("Existing and new temporary '%s' do not " "have matching scopes (existing: %s, new: %s)" % (temporary_name, AddressSpace.stringify(temp_var.address_space), AddressSpace.stringify(temporary_address_space))) temp_var = temp_var.copy(address_space=temporary_address_space) # }}} new_temporary_variables[temporary_name] = temp_var kernel = kernel.copy(temporary_variables=new_temporary_variables) # }}} from loopy import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes kernel = assign_automatic_axes(kernel) return kernel
def _fix_parameter(kernel, name, value):
    # Pin the kernel parameter *name* to the constant *value*: substitute the
    # value into every expression in the kernel, remove the corresponding
    # argument (if any), and eliminate the dimension from all domains and
    # from the kernel's assumptions.
    # NOTE(review): callers presumably ensure *name* is actually a parameter;
    # no validation happens here -- confirm at call sites.

    def process_set(s):
        # Replace the dimension *name* in the isl set *s* with *value*:
        # add the equality constraint ``name == value``, then project the
        # dimension out.
        var_dict = s.get_var_dict()

        try:
            dt, idx = var_dict[name]
        except KeyError:
            # *name* does not occur in this set: leave it unchanged.
            return s

        value_aff = isl.Aff.zero_on_domain(s.space) + value

        from loopy.isl_helpers import iname_rel_aff
        name_equal_value_aff = iname_rel_aff(s.space, name, "==", value_aff)

        # (dt, idx) were looked up before add_constraint; adding a constraint
        # does not renumber dimensions, so project_out removes the right one.
        s = s.add_constraint(
                isl.Constraint.equality_from_aff(name_equal_value_aff)
                ).project_out(dt, idx, 1)

        return s

    new_domains = [process_set(dom) for dom in kernel.domains]

    from pymbolic.mapper.substitutor import make_subst_func
    subst_func = make_subst_func({name: value})

    from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper
    subst_map = SubstitutionMapper(subst_func)
    ev_map = PartialEvaluationMapper()

    def map_expr(expr):
        # Substitute the fixed value, then fold any subexpressions that have
        # become constant.
        return ev_map(subst_map(expr))

    from loopy.kernel.array import ArrayBase
    new_args = []
    for arg in kernel.args:
        if arg.name == name:
            # remove from argument list
            continue

        if not isinstance(arg, ArrayBase):
            new_args.append(arg)
        else:
            # array shape/stride expressions may refer to the fixed parameter
            new_args.append(arg.map_exprs(map_expr))

    new_temp_vars = {}
    for tv in six.itervalues(kernel.temporary_variables):
        new_temp_vars[tv.name] = tv.map_exprs(map_expr)

    from loopy.match import parse_stack_match
    within = parse_stack_match(None)  # i.e. substitute everywhere

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    esubst_map = RuleAwareSubstitutionMapper(
            rule_mapping_context, subst_func, within=within)
    # Apply the substitution inside instructions/rules, then attach the
    # already-processed domains, args, temporaries, and assumptions.
    return rule_mapping_context.finish_kernel(
            esubst_map.map_kernel(kernel)).copy(
                    domains=new_domains,
                    args=new_args,
                    temporary_variables=new_temp_vars,
                    assumptions=process_set(kernel.assumptions),
                    )
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """Split a dimension of one or more arrays into a fixed-length inner
    index and a corresponding outer index.

    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_inames`

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        # splitting by 1 is a no-op
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in args_and_axes

    def normalize_rest(rest):
        # default to C order when no order flag is given
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            raise RuntimeError("split instruction '%s' not understood" % rest)

    if isinstance(arrays_and_axes, tuple):
        # single split given without enclosing list
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = dict(
            (tup[0], normalize_rest(tup[1:])) for tup in arrays_and_axes)

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in six.iteritems(array_to_rest):
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        from pytools import div_ceil

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)

            if order == "F":
                new_shape.insert(axis+1, outer_len)
            elif order == "C":
                new_shape.insert(axis, outer_len)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        from loopy.kernel.array import FixedStrideArrayDimTag
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        old_stride = old_dim_tag.stride
        outer_stride = count*old_stride

        if order == "F":
            new_dim_tags.insert(axis+1, FixedStrideArrayDimTag(outer_stride))
        elif order == "C":
            new_dim_tags.insert(axis, FixedStrideArrayDimTag(outer_stride))
        else:
            raise RuntimeError("order '%s' not understood" % order)

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            outer_name = existing_name + "_outer"

            if order == "F":
                new_dim_names.insert(axis+1, outer_name)
            elif order == "C":
                new_dim_names.insert(axis, outer_name)
            else:
                raise RuntimeError("order '%s' not understood" % order)

            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # rewrite one subscript of a split array: replace the index along
        # the split axis with (inner, outer) index pair
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            split_iname = axis_idx.name
            assert split_iname in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                # first time we see this iname: make names for its halves
                outer_iname = var_name_gen(split_iname+"_outer")
                inner_iname = var_name_gen(split_iname+"_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        # FIX: previously used the leaked loop variable 'axis' from the
        # array-adjustment loop above, which holds the *last* array's axis.
        # With multiple arrays split along different axes, the outer index
        # was inserted at the wrong position. Use this access's own axis_nr.
        if order == "F":
            idx.insert(axis_nr+1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(six.iterkeys(array_to_rest)), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel
def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
        sequential=False):
    """Convert a kernel performing one operation into a kernel performing a
    whole batch of them, indexed by a new batch iname.

    :arg nbatches: the number of batches. May be a constant non-negative
        integer or a string, which will be added as an integer argument.
    :arg batch_varying_args: a list of argument names that vary per-batch.
        Each such variable will have a batch index added.
    :arg sequential: A :class:`bool`. If *True*, do not duplicate
        temporary variables for each batch. This automatically tags the batch
        iname for sequential execution.
    """
    from pymbolic import var

    name_gen = knl.get_var_name_generator()
    batch_iname = name_gen(batch_iname_prefix)
    batch_iname_var = var(batch_iname)

    batch_dom_str = "{[%(iname)s]: 0 <= %(iname)s < %(nbatches)s}" % {
        "iname": batch_iname,
        "nbatches": nbatches,
        }

    extra_args = []
    if isinstance(nbatches, int):
        nbatches_expr = nbatches
    else:
        # symbolic batch count: it becomes a domain parameter and a new
        # integer kernel argument
        batch_dom_str = "[%s] -> " % nbatches + batch_dom_str
        extra_args.append(ValueArg(nbatches, dtype=knl.index_dtype))
        nbatches_expr = var(nbatches)

    def batchify_arg(arg):
        # prepend a batch axis (C-ordered) to each per-batch argument
        if arg.name not in batch_varying_args:
            return arg
        if isinstance(arg, ValueArg):
            return GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr, ),
                    dim_tags="c")
        return arg.copy(
                shape=(nbatches_expr, ) + arg.shape,
                dim_tags=("c", ) * (len(arg.shape) + 1),
                dim_names=_add_unique_dim_name("ibatch", arg.dim_names))

    knl = knl.copy(
            domains=[isl.BasicSet(batch_dom_str)] + knl.domains,
            args=extra_args + [batchify_arg(arg) for arg in knl.args])

    if sequential:
        import loopy as lp
        from loopy.kernel.data import ForceSequentialTag
        knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())])
    else:
        def batchify_temp(tv):
            # constant (read-only, initialized) data is shared across
            # batches and stays as-is
            if tv.initializer is not None and tv.read_only:
                return tv
            return tv.copy(
                    shape=(nbatches_expr, ) + tv.shape,
                    dim_tags=("c", ) * (len(tv.shape) + 1),
                    dim_names=_add_unique_dim_name("ibatch", tv.dim_names))

        knl = knl.copy(temporary_variables=dict(
            (tv.name, batchify_temp(tv))
            for tv in six.itervalues(knl.temporary_variables)))

    # rewrite accesses to batch-varying variables to include the batch index
    rule_mapping_context = SubstitutionRuleMappingContext(
            knl.substitutions, name_gen)
    bvc = _BatchVariableChanger(rule_mapping_context,
            knl, batch_varying_args, batch_iname_var,
            sequential=sequential)
    kernel = rule_mapping_context.finish_kernel(bvc.map_kernel(knl))

    # every instruction now nests inside the batch loop
    batch_iname_set = frozenset([batch_iname])
    return kernel.copy(instructions=[
        insn.copy(forced_iname_deps=insn.forced_iname_deps | batch_iname_set)
        for insn in kernel.instructions])