def check_variable_access_ordered(kernel):
    """Checks that between each write to a variable and all other accesses to
    the variable there is either:

    * an (at least indirect) depdendency edge, or
    * an explicit statement that no ordering is necessary (expressed through
      a bi-directional :attr:`loopy.Instruction.no_sync_with`)
    """
    enforce = kernel.options.enforce_variable_access_ordered

    if enforce not in ["no_check", True, False]:
        raise LoopyError("invalid value for option "
                "'enforce_variable_access_ordered': %s"
                % enforce)

    if enforce == "no_check":
        # Checking explicitly disabled.
        return

    if enforce:
        # Strict mode: let any ordering violation propagate as an error.
        _check_variable_access_ordered_inner(kernel)
        return

    # Lenient mode: downgrade ordering violations to kernel warnings.
    from loopy.diagnostic import VariableAccessNotOrdered
    try:
        _check_variable_access_ordered_inner(kernel)
    except VariableAccessNotOrdered as e:
        from loopy.diagnostic import warn_with_kernel
        warn_with_kernel(kernel, "variable_access_ordered", str(e))
def check_for_unused_inames(kernel):
    """Emit an ``unused_inames`` warning if *kernel* contains inames that no
    instruction uses."""
    from loopy.transform.iname import get_used_inames

    inames_without_use = kernel.all_inames() - get_used_inames(kernel)
    if not inames_without_use:
        return

    warn_with_kernel(
            kernel, "unused_inames",
            "Found unused inames in kernel: %s "
            "Unused inames during linearization will be prohibited in "
            "Loopy version 2021.X." % inames_without_use)
def _get_iname_order_for_printing(self):
    """Return a mapping used to order inames when printing the kernel.

    Falls back to an identity mapping (with a warning) when no consistent
    visual nesting order can be determined.
    """
    try:
        from loopy.kernel.tools import get_visual_iname_order_embedding
        return get_visual_iname_order_embedding(self)
    except ValueError:
        from loopy.diagnostic import warn_with_kernel
        warn_with_kernel(self, "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
                "consistent iname nesting order")
        # Identity embedding: leave the iname order as-is.
        return {iname: iname for iname in self.all_inames()}
def add_default_dependencies(kernel):
    """Return a copy of *kernel* with automatically inferred dependencies.

    For every instruction whose ``depends_on_is_final`` flag is unset, and for
    every variable that instruction reads: if the variable has exactly one
    writer (other than the instruction itself), a dependency on that writer is
    added. Also warns (``read_no_write``) about temporaries that are read but
    never written and carry no initializer.

    FIX(review): removed the dead local ``all_my_var_writers``, which was
    accumulated but never used anywhere in this function.
    """
    logger.debug("%s: default deps" % kernel.name)

    from loopy.transform.subst import expand_subst
    # Read-dependency analysis must see through substitution rules.
    expanded_kernel = expand_subst(kernel)

    writer_map = kernel.writer_map()

    arg_names = set(arg.name for arg in kernel.args)

    var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))

    # insn id -> set of variables (args/temporaries) the instruction reads
    dep_map = dict(
            (insn.id, insn.read_dependency_names() & var_names)
            for insn in expanded_kernel.instructions)

    new_insns = []
    for insn in kernel.instructions:
        if not insn.depends_on_is_final:
            auto_deps = set()

            # {{{ add automatic dependencies

            for var in dep_map[insn.id]:
                var_writers = writer_map.get(var, set())

                if not var_writers and var not in arg_names:
                    tv = kernel.temporary_variables[var]
                    if tv.initializer is None:
                        warn_with_kernel(kernel, "read_no_write(%s)" % var,
                                "temporary variable '%s' is read, but never "
                                "written." % var)

                # Only a unique writer yields an unambiguous dependency.
                if len(var_writers) == 1:
                    auto_deps.update(
                            var_writers
                            - set([insn.id]))

            # }}}

            depends_on = insn.depends_on
            if depends_on is None:
                depends_on = frozenset()

            insn = insn.copy(depends_on=frozenset(auto_deps) | depends_on)

        new_insns.append(insn)

    return kernel.copy(instructions=new_insns)
def check_for_write_races(kernel):
    """
    Check if any memory accesses lead to write races.

    A write race arises when an instruction runs across a concurrently-tagged
    iname that is not referenced in the assignee's subscript: multiple
    parallel instances then write the same location.

    :raises LoopyError: if an assignee references an iname the instruction is
        not nested inside, or if the assignee is neither an argument nor a
        temporary.

    FIX(review): error message said "assignee of instructions '%s'" for a
    single instruction; corrected to "instruction".
    """
    from loopy.kernel.data import ConcurrentTag

    for insn in kernel.instructions:
        for assignee_name, assignee_indices in zip(
                insn.assignee_var_names(),
                insn.assignee_subscript_deps()):
            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= insn.within_inames:
                raise LoopyError(
                        "assignee of instruction '%s' references "
                        "iname that the instruction does not depend on"
                        % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any concurrent tags that are not depended upon by the assignee
                # will cause write races.
                raceable_parallel_insn_inames = {
                        iname for iname in insn.within_inames
                        if kernel.iname_tags_of_type(iname, ConcurrentTag)}

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                raceable_parallel_insn_inames = {
                        iname for iname in insn.within_inames
                        if any(_is_racing_iname_tag(temp_var, tag)
                            for tag in kernel.iname_tags(iname))}

            else:
                raise LoopyError("invalid assignee name in instruction '%s'"
                        % insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn_with_kernel(
                        kernel,
                        "write_race(%s)" % insn.id,
                        "instruction '%s' contains a write race: "
                        "instruction will be run across parallel iname(s) "
                        "'%s', which is/are not referenced in the lhs index"
                        % (insn.id, ",".join(race_inames)),
                        WriteRaceConditionWarning)
def _get_iname_order_for_printing(self):
    """Return an iname-order embedding used when printing this kernel.

    If no consistent nesting order can be found, warn and fall back to an
    identity mapping (printing still works; only the visual order suffers).
    """
    try:
        from loopy.kernel.tools import get_visual_iname_order_embedding
        return get_visual_iname_order_embedding(self)
    except ValueError:
        from loopy.diagnostic import warn_with_kernel
        warn_with_kernel(self, "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
                "consistent iname nesting order. This is a possible indication "
                "that the kernel may not schedule successfully, but for now "
                "it only impacts printing of the kernel.")
        # Identity embedding as a fallback.
        return {iname: iname for iname in self.all_inames()}
def check_for_write_races(kernel):
    """Check that no instruction writes across parallel-tagged inames that are
    not referenced in the assignee's subscript (which would be a write race).

    :raises LoopyError: if an assignee references an iname the instruction
        does not depend on, or if the assignee is neither an argument nor a
        temporary.

    FIX(review): corrected typo "instructiosn" in the error message.
    """
    from loopy.kernel.data import ParallelTag

    iname_to_tag = kernel.iname_to_tag.get
    for insn in kernel.instructions:
        for assignee_name, assignee_indices in zip(
                insn.assignee_var_names(),
                insn.assignee_subscript_deps()):
            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= kernel.insn_inames(insn):
                raise LoopyError(
                        "assignee of instruction '%s' references "
                        "iname that the instruction does not depend on"
                        % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any parallel tags that are not depended upon by the assignee
                # will cause write races.
                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if isinstance(iname_to_tag(iname), ParallelTag))

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if _is_racing_iname_tag(temp_var, iname_to_tag(iname)))

            else:
                raise LoopyError("invalid assignee name in instruction '%s'"
                        % insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn_with_kernel(kernel,
                        "write_race(%s)" % insn.id,
                        "instruction '%s' contains a write race: "
                        "instruction will be run across parallel iname(s) "
                        "'%s', which is/are not referenced in the lhs index"
                        % (insn.id, ",".join(race_inames)),
                        WriteRaceConditionWarning)
def check_for_write_races(kernel):
    """Check that no instruction writes across parallel-tagged inames that are
    not referenced in the assignee's subscript (which would be a write race).

    :raises LoopyError: if an assignee references an iname the instruction
        does not depend on, or if the assignee is neither an argument nor a
        temporary.

    FIX(review): corrected typo "instructiosn" in the error message.
    """
    from loopy.kernel.data import ParallelTag

    iname_to_tag = kernel.iname_to_tag.get
    for insn in kernel.instructions:
        for assignee_name, assignee_indices in zip(
                insn.assignee_var_names(),
                insn.assignee_subscript_deps()):
            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= kernel.insn_inames(insn):
                raise LoopyError(
                        "assignee of instruction '%s' references "
                        "iname that the instruction does not depend on"
                        % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any parallel tags that are not depended upon by the assignee
                # will cause write races.
                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if isinstance(iname_to_tag(iname), ParallelTag))

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if _is_racing_iname_tag(temp_var, iname_to_tag(iname)))

            else:
                raise LoopyError("invalid assignee name in instruction '%s'"
                        % insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn_with_kernel(
                        kernel,
                        "write_race(%s)" % insn.id,
                        "instruction '%s' contains a write race: "
                        "instruction will be run across parallel iname(s) "
                        "'%s', which is/are not referenced in the lhs index"
                        % (insn.id, ",".join(race_inames)),
                        WriteRaceConditionWarning)
def map_fortran_division(self, expr, *args):
    """Rewrite a Fortran division node into either a :class:`FloorDiv`
    (integer/bool operands) or a :class:`Quotient` (anything else).

    Falls back to the superclass mapper when operand types cannot be
    inferred.
    """
    # We remove all these before type inference ever sees them.
    from loopy.type_inference import TypeInferenceFailure
    try:
        num_dtype = self.infer_type(expr.numerator).numpy_dtype
        den_dtype = self.infer_type(expr.denominator).numpy_dtype
    except TypeInferenceFailure:
        return super().map_fortran_division(expr, *args)

    from pymbolic.primitives import Quotient, FloorDiv

    # dtype kinds: i=int, u=unsigned, b=bool -> Fortran semantics are
    # truncating integer division, approximated here via FloorDiv.
    if num_dtype.kind in "iub" and den_dtype.kind in "iub":
        warn_with_kernel(
                self.kernel, "fortran_int_div",
                "Integer division in Fortran code. Loopy currently gets this "
                "wrong for negative arguments.")
        result_class = FloorDiv
    else:
        result_class = Quotient

    # Warn (above) before recursing, matching the original evaluation order.
    return result_class(self.rec(expr.numerator, *args),
            self.rec(expr.denominator, *args))
def map_reduction(expr, rec, nresults=1):
    """Expand one level of a reduction expression into concrete instructions.

    NOTE(review): this is a closure — ``kernel``, ``temp_kernel``, ``insn``,
    ``type_inf_mapper``, ``unknown_types_ok``, ``map_reduction_seq`` and
    ``map_reduction_local`` come from the enclosing scope.

    FIX(review): the "more than one parallel iname" error message was missing
    a space between its concatenated string literals (rendered "more
    thanone"); corrected.
    """
    # Only expand one level of reduction at a time, going from outermost to
    # innermost. Otherwise we get the (iname + insn) dependencies wrong.

    try:
        arg_dtype = type_inf_mapper(expr.expr)
    except DependencyTypeInferenceFailure:
        if unknown_types_ok:
            arg_dtype = lp.auto
            reduction_dtypes = (lp.auto,)*nresults
        else:
            raise LoopyError("failed to determine type of accumulator for "
                    "reduction '%s'" % expr)
    else:
        arg_dtype = arg_dtype.with_target(kernel.target)

        reduction_dtypes = expr.operation.result_dtypes(
                    kernel, arg_dtype, expr.inames)
        reduction_dtypes = tuple(
                dt.with_target(kernel.target) for dt in reduction_dtypes)

    # A reduction must not sit inside a loop over its own reduction inames.
    outer_insn_inames = temp_kernel.insn_inames(insn)
    bad_inames = frozenset(expr.inames) & outer_insn_inames
    if bad_inames:
        raise LoopyError("reduction used within loop(s) that it was "
                "supposed to reduce over: " + ", ".join(bad_inames))

    n_sequential = 0
    n_local_par = 0

    from loopy.kernel.data import (
            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
            ParallelTag)
    for iname in expr.inames:
        iname_tag = kernel.iname_to_tag.get(iname)

        if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
            # These are nominally parallel, but we can live with
            # them as sequential.
            n_sequential += 1

        elif isinstance(iname_tag, LocalIndexTagBase):
            n_local_par += 1

        elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
            raise LoopyError("the only form of parallelism supported "
                    "by reductions is 'local'--found iname '%s' "
                    "tagged '%s'"
                    % (iname, type(iname_tag).__name__))

        else:
            n_sequential += 1

    if n_local_par and n_sequential:
        raise LoopyError("Reduction over '%s' contains both parallel and "
                "sequential inames. It must be split "
                "(using split_reduction_{in,out}ward) "
                "before code generation."
                % ", ".join(expr.inames))

    if n_local_par > 1:
        raise LoopyError("Reduction over '%s' contains more than "
                "one parallel iname. It must be split "
                "(using split_reduction_{in,out}ward) "
                "before code generation."
                % ", ".join(expr.inames))

    if n_sequential:
        assert n_local_par == 0
        return map_reduction_seq(expr, rec, nresults, arg_dtype,
                reduction_dtypes)
    elif n_local_par:
        return map_reduction_local(expr, rec, nresults, arg_dtype,
                reduction_dtypes)
    else:
        # No inames at all: the reduction degenerates to its bare expression.
        from loopy.diagnostic import warn_with_kernel
        warn_with_kernel(kernel, "empty_reduction",
                "Empty reduction found (no inames to reduce over). "
                "Eliminating.")

        return expr.expr
def check_sizes(kernel, device):
    """Pre-codegen sanity checks of the kernel's grid/group sizes and
    resource usage against the limits of a (PyOpenCL) compute *device*.

    :arg device: a PyOpenCL device object, or *None* (in which case only an
        advisory warning is emitted and no checking happens).
    :raises LoopyError: when a hard device limit is exceeded.
    """
    import loopy as lp
    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        warn_with_kernel(kernel, "no_device_in_pre_codegen_checks",
                "No device parameter was passed to the PyOpenCLTarget. "
                "Perhaps you want to pass a device to benefit from "
                "additional checking.", LoopyAdvisory)
        return

    # Approximate values for size parameters, used to evaluate the
    # (symbolic) grid-size expressions below.
    parameters = {}
    for arg in kernel.args:
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
            parameters[arg.name] = arg.approximately

    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()

    if (max(len(glens), len(llens))
            > device.max_work_item_dimensions):
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError
    try:
        glens = evaluate(glens, parameters)
        llens = evaluate(llens, parameters)
    except UnknownVariableError as name:
        # NOTE(review): 'name' is the exception object, not a plain string;
        # it is only used via %s-formatting below, which stringifies it.
        from warnings import warn
        warn("could not check axis bounds because no value "
                "for variable '%s' was passed to check_kernels()"
                % name, LoopyAdvisory)
    else:
        # Per-axis and total work-group size limits.
        for i in range(len(llens)):
            if llens[i] > device.max_work_item_sizes[i]:
                # NOTE(review): message says "group axis" although this is a
                # work-item (local) axis limit — confirm intended wording.
                raise LoopyError("group axis %d too big" % i)

        from pytools import product
        if product(llens) > device.max_work_group_size:
            raise LoopyError("work group too big")

    local_mem_use = kernel.local_mem_use()

    from pyopencl.characterize import usable_local_mem_size
    import numbers
    if isinstance(local_mem_use, numbers.Integral):
        if local_mem_use > usable_local_mem_size(device):
            raise LoopyError("using too much local memory")
    else:
        # Symbolic (non-constant) local memory use cannot be checked here.
        warn_with_kernel(kernel, "non_constant_local_mem",
                "The amount of local memory used by the kernel "
                "is not a constant. This will likely cause problems.")

    from loopy.kernel.data import ConstantArg
    const_arg_count = sum(
            1 for arg in kernel.args
            if isinstance(arg, ConstantArg))

    if const_arg_count > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def find_temporary_scope(kernel):
    """Determine the storage scope (private/local/global) of each
    auto-scoped temporary variable, based on which parallel inames its
    writers run over and which of those appear in the assignee indices.

    :returns: a copy of *kernel* with temporaries' ``scope`` filled in.
    :raises LoopyError: when a temporary is never written and has no
        initializer, or when its writers disagree about the desired scope.

    FIX(review): the write-race warning guard inside the (local, group) loop
    tested ``bool(locparallel_assignee_inames)`` instead of the loop
    variable ``apin``, so the group/global iteration was gated on the
    *local* condition; corrected to ``bool(apin)``. Also dropped
    ``from pytools import all``, which shadowed the equivalent builtin.
    """
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
            temp_var_scope)
    import loopy as lp

    writers = kernel.writer_map()

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.scope is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        desired_scope_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, LocalIndexTagBase)
            locparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, LocalIndexTagBase, temp_var.name)

            grpparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, GroupIndexTag)
            grpparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, GroupIndexTag, temp_var.name)

            assert locparallel_assignee_inames <= locparallel_compute_inames
            assert grpparallel_assignee_inames <= grpparallel_compute_inames

            desired_scope = temp_var_scope.PRIVATE
            for iname_descr, scope_descr, apin, cpin, scope in [
                    ("local", "local", locparallel_assignee_inames,
                        locparallel_compute_inames, temp_var_scope.LOCAL),
                    ("group", "global", grpparallel_assignee_inames,
                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
                    ]:

                if (apin != cpin and bool(apin)):
                    # NOTE(review): warning id is "write_race_local(...)" even
                    # for the group/global case — kept for compatibility.
                    warn_with_kernel(kernel, "write_race_local(%s)" % insn_id,
                            "instruction '%s' looks invalid: "
                            "it assigns to indices based on %s IDs, but "
                            "its temporary '%s' cannot be made %s because "
                            "a write race across the iname(s) '%s' would emerge. "
                            "(Do you need to add an extra iname to your "
                            "prefetch?)"
                            % (insn_id, iname_descr, temp_var.name, scope_descr,
                                ", ".join(cpin - apin)),
                            WriteRaceConditionWarning)

                if (apin == cpin
                        # doesn't want to be in this scope if there aren't any
                        # parallel inames of that kind:
                        and bool(cpin)):
                    desired_scope = max(desired_scope, scope)
                    break

            desired_scope_per_insn.append(desired_scope)

        if not desired_scope_per_insn:
            if temp_var.initializer is None:
                warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name,
                        "temporary variable '%s' never written, eliminating"
                        % temp_var.name, LoopyAdvisory)
            else:
                raise LoopyError("temporary variable '%s': never written, "
                        "cannot automatically determine scope"
                        % temp_var.name)

            continue

        overall_scope = max(desired_scope_per_insn)

        if not all(iscope == overall_scope
                for iscope in desired_scope_per_insn):
            raise LoopyError("not all instructions agree on the "
                    "the desired scope (private/local/global) of the "
                    "temporary '%s'" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)

    return kernel.copy(temporary_variables=new_temp_vars)
def find_all_insn_inames(kernel):
    """Infer, for every instruction, the set of inames it nests inside.

    Starting from each instruction's explicitly forced inames plus any inames
    appearing in its read/write dependencies, propagates additional inames by
    fixed-point iteration: (a) from variable use (via
    ``guess_iname_deps_based_on_var_use``) and (b) from domain parameters of
    the inames already assigned. Instructions with
    ``forced_iname_deps_is_final`` set are left untouched.

    :returns: a dict mapping instruction id to a :class:`frozenset` of
        iname names.
    """
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    insn_id_to_inames = {}
    # insn id -> inames appearing among the instruction's written variables
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    from loopy.transform.subst import expand_subst
    # Dependency analysis must see through substitution rules.
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.forced_iname_deps_is_final:
            # User pinned the iname set; take it verbatim.
            iname_deps = insn.forced_iname_deps
        else:
            # Seed: inames referenced by the instruction, plus forced ones.
            iname_deps = (
                    deps & kernel.all_inames()
                    | insn.forced_iname_deps)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)),
                    ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.

    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.forced_iname_deps_is_final:
                continue

            # {{{ dependency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                    kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' in kernel '%s' "
                        "was/were automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new-inames_old), insn.id,
                            kernel.name))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.
                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.
                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' was "
                        "automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding '{inames=...}"
                        % (", ".join(inames_new-inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    """Add extra array axes to temporaries written inside ILP/vectorized
    inames that do not appear among the write's dependencies, so that each
    ILP/vector lane gets its own storage. All accesses to those temporaries
    are rewritten to index the new axes.

    :arg iname: if given, restrict the transformation to this (ILP/vector)
        iname; otherwise all ILP/vector inames of each writer are considered.
    :raises LoopyError: if *iname* is not ILP-tagged, or if writers disagree
        about which axes a variable needs.

    FIX(review): the "requires adding indices" error message had four ``%s``
    placeholders but only three arguments (``tv.name`` was missing), so
    triggering that path raised a TypeError; the variable name is now passed.
    """
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(iname
                        for iname in kernel.insn_inames(writer_insn)
                        if isinstance(
                            kernel.iname_to_tag.get(iname),
                            (IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)):
                    raise LoopyError(
                            "'%s' is not an ILP iname"
                            % iname)

                ilp_inames = frozenset([iname])

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            # Only inames NOT already indexing the write need new axes.
            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but "
                            "previous instructions required inames '%s'"
                            % (writer_insn_id, ", ".join(new_ilp_inames),
                                tv.name,
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            # New axes are indexed directly by the iname, so its lower bound
            # must be zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff,
                    constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
            (var_name, tuple(var(iname) for iname in inames))
            for var_name, inames in six.iteritems(var_to_new_ilp_inames))

    new_insns = []

    for insn in kernel.instructions:
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_ilp_inames <= insn.within_inames:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(
                    kernel,
                    "implicit_ilp_iname",
                    "Instruction '%s': touched variable that (for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). Previously, this would "
                    "implicitly promote the instruction, but that behavior is "
                    "deprecated and will stop working in 2018.1."
                    % (insn.id, ", ".join(
                        eiii.seen_ilp_inames - insn.within_inames)))
        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def find_all_insn_inames(kernel):
    """Infer, for every instruction, the set of inames it nests inside.

    Starting from each instruction's ``within_inames`` plus any inames
    appearing in its read/write dependencies, propagates additional inames by
    fixed-point iteration: (a) from variable use (via
    ``guess_iname_deps_based_on_var_use``) and (b) from domain parameters of
    the inames already assigned. Instructions with ``within_inames_is_final``
    set are left untouched.

    :returns: a dict mapping instruction id to a :class:`frozenset` of
        iname names.
    """
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    insn_id_to_inames = {}
    # insn id -> inames appearing among the instruction's written variables
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    from loopy.transform.subst import expand_subst
    # Dependency analysis must see through substitution rules.
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.within_inames_is_final:
            # User pinned the iname set; take it verbatim.
            iname_deps = insn.within_inames
        else:
            # Seed: inames referenced by the instruction, plus declared ones.
            iname_deps = (deps & kernel.all_inames()
                    | insn.within_inames)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)),
                    ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.

    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.within_inames_is_final:
                continue

            # {{{ dependency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                    kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(
                        kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' "
                        "was/were automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new - inames_old), insn.id))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(
                    iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.
                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.
                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(
                        kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' was "
                        "automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new - inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames