def __init__(self, kernel):
    """
    Set up the bookkeeping state for *kernel*.

    :arg kernel: the kernel to operate on. It is stored on the instance and
        is also used to build an :class:`InstructionQuery` and to obtain the
        generators for variable names and instruction ids.
    """
    self.kernel = kernel

    # Helper objects derived from the kernel.
    self.insn_query = InstructionQuery(kernel)
    self.var_name_gen = kernel.get_var_name_generator()
    self.insn_name_gen = kernel.get_instruction_id_generator()

    # Containers that keep track of updates to the kernel. All of them
    # start out empty.
    self.saves_or_reloads_added = {}
    self.updated_temporary_variables = {}
    self.updated_iname_to_tag = {}
    self.extra_args_to_add = {}
    self.insns_to_update = {}
    self.insns_to_insert = []
def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
    """
    Verify that each temporary read in a subkernel has a definition there.

    For every subkernel, the temporaries that are read but not written in
    that subkernel are examined:

    * temporaries with an initializer are always accepted;
    * temporaries with a ``base_storage`` are accepted if some temporary
      written in the same subkernel uses the same ``base_storage``;
    * all other temporaries are rejected if their scope is private or local.

    :arg kernel: the kernel to check
    :raises loopy.diagnostic.MissingDefinitionError: if a temporary is used
        in a subkernel without an acceptable definition
    """
    from loopy.schedule.tools import InstructionQuery
    from loopy.kernel.data import temp_var_scope

    query = InstructionQuery(kernel)

    for subkernel in query.subkernels():
        # Base storages that receive a write somewhere in this subkernel.
        written_tvars = (
            kernel.temporary_variables[name]
            for name in query.temporaries_written_in_subkernel(subkernel))
        written_storage = set(
            tvar.base_storage
            for tvar in written_tvars
            if tvar.base_storage is not None)

        read_but_not_written = (
            query.temporaries_read_in_subkernel(subkernel)
            - query.temporaries_written_in_subkernel(subkernel))

        for name in read_but_not_written:
            tvar = kernel.temporary_variables[name]

            if tvar.initializer is not None:
                # The initializer serves as the definition.
                continue

            if tvar.base_storage is not None:
                # Aliased temporary: a write to any alias sharing the same
                # base storage counts as a definition.
                if tvar.base_storage not in written_storage:
                    from loopy.diagnostic import MissingDefinitionError
                    raise MissingDefinitionError(
                        "temporary variable '%s' gets used "
                        "in subkernel '%s' and neither it nor its aliases have a "
                        "definition" % (name, subkernel))

            elif tvar.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL):
                from loopy.diagnostic import MissingDefinitionError
                raise MissingDefinitionError(
                    "temporary variable '%s' gets used in "
                    "subkernel '%s' without a definition (maybe you forgot to call "
                    "loopy.save_and_reload_temporaries?)" % (name, subkernel))
def save_and_reload_temporaries(knl):
    """
    Insert instructions that save and reload temporary variables which are
    live across kernel calls.

    A schedule segment of the form::

        t = <...>
        <return followed by call>
        <...> = t

    is rewritten to::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    with `t_save_slot` being a newly-created global temporary variable.

    :arg knl: the kernel whose schedule is walked
    :returns: The resulting kernel
    """
    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl)
    insn_query = InstructionQuery(knl)

    for sched_idx, sched_item in enumerate(knl.schedule):

        if isinstance(sched_item, CallKernel):
            subkernel = sched_item.kernel_name

            # Reload everything the subkernel reads *or* writes: a written
            # temporary that is live-out has to be brought back as well,
            # since the subkernel may only write part of it.
            # At kernel entry (index 0) nothing is live.
            candidates = (
                set() if sched_idx == 0
                else insn_query.temporaries_read_or_written_in_subkernel(
                    subkernel))

            for temporary in liveness[sched_idx].live_out & candidates:
                logger.info("reloading {0} at entry of {1}".format(
                    temporary, subkernel))
                saver.reload(temporary, subkernel)

        elif isinstance(sched_item, ReturnFromKernel):
            subkernel = sched_item.kernel_name

            # At kernel exit (last schedule item) nothing is live.
            candidates = (
                set() if sched_idx + 1 == len(knl.schedule)
                else insn_query.temporaries_written_in_subkernel(subkernel))

            for temporary in liveness[sched_idx].live_in & candidates:
                logger.info("saving {0} before return of {1}".format(
                    temporary, subkernel))
                saver.save(temporary, subkernel)

    return saver.finish()
class TemporarySaver(object):
    """
    Accumulates the kernel changes needed to save temporaries to, and reload
    them from, newly created global temporaries.

    Requests are registered with :meth:`save` and :meth:`reload`.
    :meth:`finish` then builds and returns the updated kernel.
    """

    class PromotedTemporary(Record):
        """
        .. attribute:: name

            The name of the new temporary.

        .. attribute:: orig_temporary

            The original temporary variable object.

        .. attribute:: hw_inames

            The common list of hw axes that define the original object.

        .. attribute:: hw_dims

            A list of expressions, to be added in front of the shape
            of the promoted temporary value, corresponding to
            hardware dimensions

        .. attribute:: non_hw_dims

            A list of expressions corresponding to non-hardware
            dimensions; they follow :attr:`hw_dims` in the shape of the
            promoted temporary value
        """

        @memoize_method
        def as_variable(self):
            """
            Return a global-scope :class:`loopy.kernel.data.TemporaryVariable`
            with this object's name, the original dtype and
            :attr:`new_shape`.
            """
            temporary = self.orig_temporary
            from loopy.kernel.data import TemporaryVariable
            return TemporaryVariable(
                name=self.name,
                dtype=temporary.dtype,
                scope=temp_var_scope.GLOBAL,
                shape=self.new_shape)

        @property
        def new_shape(self):
            """Shape of the promoted temporary: hw dims, then non-hw dims."""
            return self.hw_dims + self.non_hw_dims

    def __init__(self, kernel):
        """
        :arg kernel: the kernel to which saves and reloads are to be added
        """
        self.kernel = kernel
        self.insn_query = InstructionQuery(kernel)
        self.var_name_gen = kernel.get_var_name_generator()
        self.insn_name_gen = kernel.get_instruction_id_generator()

        # These fields keep track of updates to the kernel.
        self.insns_to_insert = []
        self.insns_to_update = {}
        self.extra_args_to_add = {}
        self.updated_iname_to_tag = {}
        self.updated_temporary_variables = {}
        self.saves_or_reloads_added = {}

    @memoize_method
    def auto_promote_temporary(self, temporary_name):
        """
        Work out the global save slot for the temporary *temporary_name*.

        :returns: a :class:`PromotedTemporary`, or *None* if the temporary
            is already global or has an initializer
        :raises ValueError: if the temporary has a ``base_storage``
        """
        temporary = self.kernel.temporary_variables[temporary_name]

        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            return None

        if temporary.initializer is not None:
            # Temporaries with initializers do not need saving/reloading - the
            # code generation takes care of emitting the initializers.
            assert temporary.read_only
            return None

        if temporary.base_storage is not None:
            raise ValueError(
                "Cannot promote temporaries with base_storage to global")

        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
        # is associated with. This is used for determining the shape of the
        # global storage needed for saving and restoring the temporary across
        # kernel calls.
        #
        # TODO: Make a policy decision about which dimensions to use. Currently,
        # the code looks at each instruction that defines or uses the temporary,
        # and takes the common set of hw-parallel tagged inames associated with
        # these instructions.
        #
        # Furthermore, in the case of local temporaries, inames that are tagged
        # hw-local do not contribute to the global storage shape.
        hw_inames = self.insn_query.common_hw_inames(
            self.insn_query.insns_reading_or_writing(temporary.name))

        # We want hw_inames to be arranged according to the order:
        #    g.0 < g.1 < ... < l.0 < l.1 < ...
        # Sorting lexicographically accomplishes this.
        hw_inames = sorted(
            hw_inames,
            key=lambda iname: str(self.kernel.iname_to_tag[iname]))

        # Calculate the sizes of the dimensions that get added in front for
        # the global storage of the temporary.
        hw_dims = []
        backing_hw_inames = []

        for iname in hw_inames:
            tag = self.kernel.iname_to_tag[iname]
            from loopy.kernel.data import LocalIndexTag
            is_local_iname = isinstance(tag, LocalIndexTag)
            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue
            backing_hw_inames.append(iname)
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import aff_to_expr
            hw_dims.append(
                aff_to_expr(
                    static_max_of_pw_aff(
                        self.kernel.get_iname_bounds(iname).size, False)))

        non_hw_dims = temporary.shape

        if len(non_hw_dims) == 0 and len(hw_dims) == 0:
            # Scalar not in hardware: ensure at least one dimension.
            non_hw_dims = (1, )

        backing_temporary = self.PromotedTemporary(
            name=self.var_name_gen(temporary.name + "_save_slot"),
            orig_temporary=temporary,
            hw_dims=tuple(hw_dims),
            non_hw_dims=non_hw_dims,
            hw_inames=backing_hw_inames)

        return backing_temporary

    def save_or_reload_impl(self, temporary, subkernel, mode,
                            promoted_temporary=lp.auto):
        """
        Register one save or reload of *temporary* in *subkernel*.

        The new assignment is queued in ``insns_to_insert``, dependency
        updates for existing instructions are queued in ``insns_to_update``,
        and ``self.kernel`` is replaced by a kernel whose domain has the
        inames used by the new assignment.

        :arg temporary: name of the temporary variable
        :arg subkernel: name of the subkernel
        :arg mode: ``"save"`` or ``"reload"``
        :arg promoted_temporary: a :class:`PromotedTemporary`, or ``lp.auto``
            to derive one with :meth:`auto_promote_temporary`. If this ends
            up being *None*, nothing is done.
        """
        assert mode in ("save", "reload")

        # Compare against the same object the signature uses as its default.
        if promoted_temporary is lp.auto:
            promoted_temporary = self.auto_promote_temporary(temporary)

        if promoted_temporary is None:
            return

        from loopy.kernel.tools import DomainChanger
        dchg = DomainChanger(
            self.kernel,
            frozenset(
                self.insn_query.inames_in_subkernel(subkernel)
                | set(promoted_temporary.hw_inames)))

        domain, hw_inames, dim_inames, iname_to_tag = \
            self.augment_domain_for_save_or_reload(
                dchg.domain, promoted_temporary, mode, subkernel)

        self.kernel = dchg.get_kernel_with(domain)

        save_or_load_insn_id = self.insn_name_gen(
            "{name}.{mode}".format(name=temporary, mode=mode))

        def subscript_or_var(agg, subscript=()):
            # A bare variable when there are no indices, otherwise a
            # subscript whose indices are the given iname variables.
            from pymbolic.primitives import Subscript, Variable
            if len(subscript) == 0:
                return Variable(agg)
            else:
                return Subscript(
                    Variable(agg),
                    tuple(map(Variable, subscript)))

        # The original temporary is indexed only by as many axes as it has
        # dimensions (none, for a scalar that was given a dummy axis).
        dim_inames_trunc = dim_inames[
            :len(promoted_temporary.orig_temporary.shape)]

        # (assignee, expression) for a reload; swapped for a save.
        args = (
            subscript_or_var(temporary, dim_inames_trunc),
            subscript_or_var(
                promoted_temporary.name, hw_inames + dim_inames))

        if mode == "save":
            args = reversed(args)

        accessing_insns_in_subkernel = (
            self.insn_query.insns_reading_or_writing(temporary)
            & self.insn_query.insns_in_subkernel(subkernel))

        if mode == "save":
            # The save runs after every access in the subkernel.
            depends_on = accessing_insns_in_subkernel
            update_deps = frozenset()
        elif mode == "reload":
            # Every access in the subkernel runs after the reload.
            depends_on = frozenset()
            update_deps = accessing_insns_in_subkernel

        pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(
            subkernel)

        if pre_barrier is not None:
            depends_on |= set([pre_barrier])

        if post_barrier is not None:
            update_deps |= set([post_barrier])

        # Create the load / store instruction.
        from loopy.kernel.data import Assignment
        save_or_load_insn = Assignment(
            *args,
            id=save_or_load_insn_id,
            within_inames=(
                self.insn_query.inames_in_subkernel(subkernel)
                | frozenset(hw_inames + dim_inames)),
            within_inames_is_final=True,
            depends_on=depends_on,
            boostable=False,
            boostable_into=frozenset())

        self.saves_or_reloads_added.setdefault(temporary, set()).add(
            save_or_load_insn_id)

        self.insns_to_insert.append(save_or_load_insn)

        for insn_id in update_deps:
            # Start from an already-updated copy, if there is one, so that
            # dependencies added earlier are not lost.
            insn = self.insns_to_update.get(
                insn_id, self.kernel.id_to_insn[insn_id])
            self.insns_to_update[insn_id] = insn.copy(
                depends_on=insn.depends_on | frozenset([save_or_load_insn_id]))

        self.updated_temporary_variables[promoted_temporary.name] = \
            promoted_temporary.as_variable()

        self.updated_iname_to_tag.update(iname_to_tag)

    @memoize_method
    def finish(self):
        """
        Build the kernel that contains all registered saves and reloads.

        :returns: a copy of the kernel with the queued instructions, iname
            tags and temporaries applied, passed through
            :func:`loopy.kernel.tools.assign_automatic_axes`
        """
        new_instructions = []

        insns_to_insert = dict(
            (insn.id, insn) for insn in self.insns_to_insert)

        # Add global no_sync_with between any added reloads and saves
        from six import iteritems
        for temporary, added_insns in iteritems(self.saves_or_reloads_added):
            for insn_id in added_insns:
                insn = insns_to_insert[insn_id]
                insns_to_insert[insn_id] = insn.copy(
                    no_sync_with=frozenset(
                        (added_insn, "global") for added_insn in added_insns))

        for orig_insn in self.kernel.instructions:
            if orig_insn.id in self.insns_to_update:
                new_instructions.append(self.insns_to_update[orig_insn.id])
            else:
                new_instructions.append(orig_insn)

        # Sort by id so that the order of the appended instructions is
        # deterministic.
        new_instructions.extend(
            sorted(insns_to_insert.values(), key=lambda insn: insn.id))

        # Entries already present in the kernel take precedence over the
        # ones recorded here.
        self.updated_iname_to_tag.update(self.kernel.iname_to_tag)
        self.updated_temporary_variables.update(
            self.kernel.temporary_variables)

        kernel = self.kernel.copy(
            instructions=new_instructions,
            iname_to_tag=self.updated_iname_to_tag,
            temporary_variables=self.updated_temporary_variables,
            overridden_get_grid_sizes_for_insn_ids=None)

        from loopy.kernel.tools import assign_automatic_axes
        return assign_automatic_axes(kernel)

    def save(self, temporary, subkernel):
        """Register a save of *temporary* in *subkernel*."""
        self.save_or_reload_impl(temporary, subkernel, "save")

    def reload(self, temporary, subkernel):
        """Register a reload of *temporary* in *subkernel*."""
        self.save_or_reload_impl(temporary, subkernel, "reload")

    def augment_domain_for_save_or_reload(
            self, domain, promoted_temporary, mode, subkernel):
        """
        Add new axes to the domain corresponding to the dimensions of
        `promoted_temporary`. These axes will be used in the save/
        reload stage.

        :arg domain: the domain to extend
        :arg promoted_temporary: a :class:`PromotedTemporary`
        :arg mode: ``"save"`` or ``"reload"``
        :arg subkernel: name of the subkernel, used in the new iname names
        :returns: a tuple ``(domain, hw_inames, dim_inames, iname_to_tag)``
            of the extended domain (a single basic set), the names of the
            duplicated hardware inames, the names of the inames added for
            the non-hardware dimensions, and the tags for the new inames
        """
        assert mode in ("save", "reload")
        import islpy as isl

        orig_temporary = promoted_temporary.orig_temporary
        orig_dim = domain.dim(isl.dim_type.set)

        # Tags for newly added inames
        iname_to_tag = {}

        # FIXME: Restrict size of new inames to access footprint.

        # Add dimension-dependent inames.
        dim_inames = []
        domain = domain.add(
            isl.dim_type.set, len(promoted_temporary.non_hw_dims))

        for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims):
            # NOTE(review): iname names are drawn from the instruction id
            # generator here; presumably var_name_gen was intended - confirm.
            new_iname = self.insn_name_gen(
                "{name}_{mode}_axis_{dim}_{sk}".format(
                    name=orig_temporary.name,
                    mode=mode,
                    dim=dim_idx,
                    sk=subkernel))
            domain = domain.set_dim_name(
                isl.dim_type.set, orig_dim + dim_idx, new_iname)

            if orig_temporary.is_local:
                # If the temporary has local scope, then loads / stores can
                # be done in parallel.
                from loopy.kernel.data import AutoFitLocalIndexTag
                iname_to_tag[new_iname] = AutoFitLocalIndexTag()

            dim_inames.append(new_iname)

            # Add size information: 0 <= new_iname < dim_size
            aff = isl.affs_from_space(domain.space)
            domain &= aff[0].le_set(aff[new_iname])
            from loopy.symbolic import aff_from_expr
            domain &= aff[new_iname].lt_set(
                aff_from_expr(domain.space, dim_size))

        # FIXME: Use promoted_temporary.hw_inames
        hw_inames = []

        # Add hardware inames duplicates.
        for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
            new_iname = self.insn_name_gen(
                "{name}_{mode}_hw_dim_{dim}_{sk}".format(
                    name=orig_temporary.name,
                    mode=mode,
                    dim=t_idx,
                    sk=subkernel))
            hw_inames.append(new_iname)
            # The duplicate carries the same tag as the iname it copies.
            iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname]

        from loopy.isl_helpers import duplicate_axes
        domain = duplicate_axes(
            domain, promoted_temporary.hw_inames, hw_inames)

        # The operations on the domain above return a Set object, but the
        # underlying domain should be expressible as a single BasicSet.
        domain_list = domain.get_basic_set_list()
        assert domain_list.n_basic_set() == 1
        domain = domain_list.get_basic_set(0)
        return domain, hw_inames, dim_inames, iname_to_tag