def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
    """Verify that temporaries read in a subkernel have a definition there.

    For every subkernel of *kernel*, each temporary that is read but not
    written inside that subkernel is inspected:

    * a temporary with an initializer is accepted;
    * a temporary with a ``base_storage`` is accepted if some temporary
      written in the same subkernel uses the same ``base_storage``;
    * any other temporary is rejected if its address space is
      ``AddressSpace.PRIVATE`` or ``AddressSpace.LOCAL``.

    :raises loopy.diagnostic.MissingDefinitionError: if a temporary is used
        in a subkernel without a definition.
    """
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.tools import get_subkernels
    from loopy.schedule.tools import (
            temporaries_written_in_subkernel,
            temporaries_read_in_subkernel)

    temporaries = kernel.temporary_variables

    for subkernel in get_subkernels(kernel):
        written = temporaries_written_in_subkernel(kernel, subkernel)

        # Base storages that receive a write through any of their aliases.
        written_vars = (temporaries[name] for name in written)
        defined_base_storage = {
                tv.base_storage
                for tv in written_vars
                if tv.base_storage is not None}

        for temporary in temporaries_read_in_subkernel(kernel, subkernel) - written:
            tval = temporaries[temporary]

            if tval.initializer is not None:
                continue

            if tval.base_storage is None:
                if tval.address_space in (
                        AddressSpace.PRIVATE, AddressSpace.LOCAL):
                    from loopy.diagnostic import MissingDefinitionError
                    raise MissingDefinitionError(
                            "temporary variable '%s' gets used "
                            "in subkernel '%s' without a definition (maybe you forgot "
                            "to call loopy.save_and_reload_temporaries?)"
                            % (temporary, subkernel))

            # For aliased temporaries, a write to any alias counts as a
            # definition.
            elif tval.base_storage not in defined_base_storage:
                from loopy.diagnostic import MissingDefinitionError
                raise MissingDefinitionError(
                        "temporary variable '%s' gets "
                        "used in subkernel '%s' and neither it nor its "
                        "aliases have a definition"
                        % (temporary, subkernel))
def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
    """Check each subkernel for temporaries that are read without a definition.

    Temporaries that are read but not written in a subkernel are examined.
    Those carrying an initializer pass. Those with a ``base_storage`` pass
    when a temporary written in the same subkernel shares that
    ``base_storage``. Everything else fails if it lives in the private or
    local address space.

    :raises loopy.diagnostic.MissingDefinitionError: if a temporary is used
        in a subkernel without a definition.
    """
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.tools import get_subkernels
    from loopy.schedule.tools import (
            temporaries_written_in_subkernel,
            temporaries_read_in_subkernel)

    all_temps = kernel.temporary_variables

    for subkernel in get_subkernels(kernel):
        written_names = temporaries_written_in_subkernel(kernel, subkernel)
        read_only_names = (
                temporaries_read_in_subkernel(kernel, subkernel)
                - written_names)

        # Collect the base storages touched by writes in this subkernel.
        defined_base_storage = {
                tv.base_storage
                for tv in (all_temps[name] for name in written_names)
                if tv.base_storage is not None}

        for temporary in read_only_names:
            tval = all_temps[temporary]

            if tval.initializer is not None:
                continue

            if tval.base_storage is None:
                if tval.address_space in (
                        AddressSpace.PRIVATE, AddressSpace.LOCAL):
                    from loopy.diagnostic import MissingDefinitionError
                    raise MissingDefinitionError(
                            "temporary variable '%s' gets used "
                            "in subkernel '%s' without a definition (maybe you forgot "
                            "to call loopy.save_and_reload_temporaries?)"
                            % (temporary, subkernel))

            # An aliased temporary is fine as long as one alias is written.
            elif tval.base_storage not in defined_base_storage:
                from loopy.diagnostic import MissingDefinitionError
                raise MissingDefinitionError(
                        "temporary variable '%s' gets "
                        "used in subkernel '%s' and neither it nor its "
                        "aliases have a definition"
                        % (temporary, subkernel))
def save_and_reload_temporaries(knl):
    """
    Insert save/reload instructions for temporaries that stay live across
    kernel calls.

    Schedule segments of the form::

        t = <...>
        <return followed by call>
        <...> = t

    are rewritten to::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    where `t_save_slot` is a newly-created global temporary variable.

    :returns: The resulting kernel (the value of ``TemporarySaver.finish()``)
    """
    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl)

    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)

    last_idx = len(knl.schedule) - 1

    for idx, item in enumerate(knl.schedule):
        if isinstance(item, CallKernel):
            # Any written temporary that is live-out needs to be read into
            # memory because of the potential for partial writes.
            if idx != 0:
                name = item.kernel_name
                candidates = (
                        temporaries_read_in_subkernel(knl, name)
                        | temporaries_written_in_subkernel(knl, name))
            else:
                # Kernel entry: nothing live
                candidates = set()

            for temporary in liveness[idx].live_out & candidates:
                logger.info("reloading {0} at entry of {1}".format(
                    temporary, item.kernel_name))
                saver.reload(temporary, item.kernel_name)

        elif isinstance(item, ReturnFromKernel):
            if idx != last_idx:
                candidates = temporaries_written_in_subkernel(
                        knl, item.kernel_name)
            else:
                # Kernel exit: nothing live
                candidates = set()

            for temporary in liveness[idx].live_in & candidates:
                logger.info("saving {0} before return of {1}".format(
                    temporary, item.kernel_name))
                saver.save(temporary, item.kernel_name)

    return saver.finish()
def get_temporary_decls(self, codegen_state, schedule_index):
    """Build the declarations of temporaries for one subkernel.

    Temporaries without ``base_storage`` get a declaration only if they are
    not in the global address space and are read or written by the subkernel
    at *schedule_index*. Temporaries with ``base_storage`` are declared as
    pointers into a shared ``char`` array, sized to the largest temporary
    using that storage and aligned to the largest alignment requested for it.

    :arg codegen_state: provides the kernel being generated
        (``codegen_state.kernel``).
    :arg schedule_index: index into ``kernel.schedule`` of the item whose
        ``kernel_name`` identifies the subkernel.
    :returns: a list with the base storage declarations first, then the
        temporary declarations, terminated by a ``cgen.Line()`` if the
        list is non-empty.
    """
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.array import VectorArrayDimTag
    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)
    from pytools import product, single_valued

    kernel = codegen_state.kernel

    storage_decls = []
    var_decls = []

    # {{{ declare temporaries

    # Per base storage name: sizes, address spaces and alignments of the
    # temporaries that live in it.
    sizes_by_storage = {}
    scopes_by_storage = {}
    aligns_by_storage = {}

    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    subkernel = kernel.schedule[schedule_index].kernel_name
    sub_knl_temps = (
            temporaries_read_in_subkernel(kernel, subkernel)
            | temporaries_written_in_subkernel(kernel, subkernel))

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tmp: tmp.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            # global temp vars are mapped to arguments or global declarations
            declare_here = (
                    tv.address_space != AddressSpace.GLOBAL
                    and tv.name in sub_knl_temps)

            for idi in decl_info:
                if not declare_here:
                    continue

                decl = self.wrap_temporary_decl(
                        self.get_temporary_decl(
                            codegen_state, schedule_index, tv, idi),
                        tv.address_space)

                if tv.initializer is not None:
                    assert tv.read_only
                    decl = Initializer(
                            decl,
                            generate_array_literal(
                                codegen_state, tv, tv.initializer))

                var_decls.append(decl)

            continue

        # From here on: temporary backed by a named base storage.
        assert tv.initializer is None

        offset = 0
        sizes_by_storage.setdefault(tv.base_storage, []).append(tv.nbytes)
        scopes_by_storage.setdefault(tv.base_storage, []).append(
                tv.address_space)

        # Alignment is the item size scaled by the length of every
        # vector-tagged axis.
        align_size = tv.dtype.itemsize
        for tag, length in zip(tv.dim_tags, tv.shape):
            if isinstance(tag, VectorArrayDimTag):
                align_size *= length

        aligns_by_storage.setdefault(tv.base_storage, []).append(align_size)

        for idi in decl_info:
            cast_decl = POD(self, idi.dtype, "")
            temp_var_decl = POD(self, idi.dtype, idi.name)

            cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
            temp_var_decl = self.wrap_temporary_decl(
                    temp_var_decl, tv.address_space)

            # The 'restrict' part of this is a complete lie--of course
            # all these temporaries are aliased. But we're promising to
            # not use them to shovel data from one representation to the
            # other. That counts, right?
            ptrtype = (
                    _ConstPointer
                    if tv._base_storage_access_may_be_aliasing
                    else _ConstRestrictPointer)

            cast_decl = ptrtype(cast_decl)
            temp_var_decl = ptrtype(temp_var_decl)

            cast_tp, cast_d = cast_decl.get_decl_pair()
            init_expr = "(%s %s) (%s + %s)" % (
                    " ".join(cast_tp), cast_d,
                    tv.base_storage,
                    offset)

            var_decls.append(Initializer(temp_var_decl, init_expr))

            offset += idi.dtype.itemsize * product(idi.shape)

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(sizes_by_storage)):
        bs_var_decl = self.wrap_temporary_decl(
                Value("char", bs_name),
                single_valued(scopes_by_storage[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        bs_size_max = (
                max(bs_sizes)
                if all(isinstance(bs, int) for bs in bs_sizes)
                else p.Max(tuple(bs_sizes)))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))
        bs_var_decl = AlignedAttribute(
                max(aligns_by_storage[bs_name]), bs_var_decl)

        storage_decls.append(bs_var_decl)

    # }}}

    result = storage_decls + var_decls

    if result:
        result.append(Line())

    return result
def get_temporary_decls(self, codegen_state, schedule_index):
    """Assemble the temporary-variable declarations for one subkernel.

    A temporary without ``base_storage`` is declared only when it is not in
    the global address space and the subkernel at *schedule_index* reads or
    writes it. A temporary with ``base_storage`` is declared as a pointer
    into a shared ``char`` array; that array is sized to the largest
    temporary placed in it and aligned to the largest alignment requested.

    :arg codegen_state: provides the kernel being generated
        (``codegen_state.kernel``).
    :arg schedule_index: index into ``kernel.schedule`` of the item whose
        ``kernel_name`` identifies the subkernel.
    :returns: a list with the base storage declarations first, then the
        temporary declarations, terminated by a ``cgen.Line()`` if the
        list is non-empty.
    """
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.array import VectorArrayDimTag
    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)
    from pytools import product, single_valued

    kernel = codegen_state.kernel

    storage_decls = []
    var_decls = []

    # {{{ declare temporaries

    # Keyed by base storage name: the sizes, address spaces and alignments
    # of all temporaries sharing that storage.
    sizes_by_storage = {}
    scopes_by_storage = {}
    aligns_by_storage = {}

    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    subkernel = kernel.schedule[schedule_index].kernel_name
    sub_knl_temps = (
            temporaries_read_in_subkernel(kernel, subkernel)
            | temporaries_written_in_subkernel(kernel, subkernel))

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tmp: tmp.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            # global temp vars are mapped to arguments or global declarations
            declare_here = (
                    tv.address_space != AddressSpace.GLOBAL
                    and tv.name in sub_knl_temps)

            for idi in decl_info:
                if not declare_here:
                    continue

                decl = self.wrap_temporary_decl(
                        self.get_temporary_decl(
                            codegen_state, schedule_index, tv, idi),
                        tv.address_space)

                if tv.initializer is not None:
                    assert tv.read_only
                    decl = Initializer(
                            decl,
                            generate_array_literal(
                                codegen_state, tv, tv.initializer))

                var_decls.append(decl)

            continue

        # Remaining case: the temporary aliases a named base storage.
        assert tv.initializer is None

        offset = 0
        sizes_by_storage.setdefault(tv.base_storage, []).append(tv.nbytes)
        scopes_by_storage.setdefault(tv.base_storage, []).append(
                tv.address_space)

        # Start from the item size and multiply in each vector-tagged axis.
        align_size = tv.dtype.itemsize
        for tag, length in zip(tv.dim_tags, tv.shape):
            if isinstance(tag, VectorArrayDimTag):
                align_size *= length

        aligns_by_storage.setdefault(tv.base_storage, []).append(align_size)

        for idi in decl_info:
            cast_decl = POD(self, idi.dtype, "")
            temp_var_decl = POD(self, idi.dtype, idi.name)

            cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
            temp_var_decl = self.wrap_temporary_decl(
                    temp_var_decl, tv.address_space)

            # The 'restrict' part of this is a complete lie--of course
            # all these temporaries are aliased. But we're promising to
            # not use them to shovel data from one representation to the
            # other. That counts, right?
            ptrtype = (
                    _ConstPointer
                    if tv._base_storage_access_may_be_aliasing
                    else _ConstRestrictPointer)

            cast_decl = ptrtype(cast_decl)
            temp_var_decl = ptrtype(temp_var_decl)

            cast_tp, cast_d = cast_decl.get_decl_pair()
            init_expr = "(%s %s) (%s + %s)" % (
                    " ".join(cast_tp), cast_d,
                    tv.base_storage,
                    offset)

            var_decls.append(Initializer(temp_var_decl, init_expr))

            offset += idi.dtype.itemsize * product(idi.shape)

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(sizes_by_storage)):
        bs_var_decl = self.wrap_temporary_decl(
                Value("char", bs_name),
                single_valued(scopes_by_storage[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        bs_size_max = (
                max(bs_sizes)
                if all(isinstance(bs, int) for bs in bs_sizes)
                else p.Max(tuple(bs_sizes)))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))
        bs_var_decl = AlignedAttribute(
                max(aligns_by_storage[bs_name]), bs_var_decl)

        storage_decls.append(bs_var_decl)

    # }}}

    result = storage_decls + var_decls

    if result:
        result.append(Line())

    return result
def save_and_reload_temporaries(program, entrypoint=None):
    """
    Insert save/reload instructions for temporaries that stay live across
    kernel calls.

    Schedule segments of the form::

        t = <...>
        <return followed by call>
        <...> = t

    are rewritten to::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    where `t_save_slot` is a newly-created global temporary variable.

    :arg entrypoint: name of the kernel in *program* to transform. May be
        omitted (``None``) only if *program* has exactly one entrypoint.
        If the selected kernel has no linearization yet, the program is
        preprocessed and the kernel is linearized first.

    :returns: ``program.with_kernel(...)`` applied to the transformed kernel
    :raises LoopyError: if *entrypoint* is ``None`` and *program* does not
        have exactly one entrypoint.
    """
    if entrypoint is None:
        if len(program.entrypoints) != 1:
            raise LoopyError("Missing argument 'entrypoint'.")
        (entrypoint,) = program.entrypoints

    knl = program[entrypoint]

    if not knl.linearization:
        program = lp.preprocess_program(program)
        from loopy.schedule import get_one_linearized_kernel
        knl = get_one_linearized_kernel(
                program[entrypoint], program.callables_table)

    assert knl.linearization is not None

    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl, program.callables_table)

    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)

    last_idx = len(knl.linearization) - 1

    for idx, item in enumerate(knl.linearization):
        if isinstance(item, CallKernel):
            # Any written temporary that is live-out needs to be read into
            # memory because of the potential for partial writes.
            if idx != 0:
                name = item.kernel_name
                candidates = (
                        temporaries_read_in_subkernel(knl, name)
                        | temporaries_written_in_subkernel(knl, name))
            else:
                # Kernel entry: nothing live
                candidates = set()

            for temporary in liveness[idx].live_out & candidates:
                logger.info("reloading {} at entry of {}".format(
                    temporary, item.kernel_name))
                saver.reload(temporary, item.kernel_name)

        elif isinstance(item, ReturnFromKernel):
            if idx != last_idx:
                candidates = temporaries_written_in_subkernel(
                        knl, item.kernel_name)
            else:
                # Kernel exit: nothing live
                candidates = set()

            for temporary in liveness[idx].live_in & candidates:
                logger.info("saving {} before return of {}".format(
                    temporary, item.kernel_name))
                saver.save(temporary, item.kernel_name)

    return program.with_kernel(saver.finish())
def save_and_reload_temporaries(knl):
    """
    Add save and reload instructions for temporary variables whose values
    must survive from one kernel call to the next.

    The transformation takes schedule segments such as::

        t = <...>
        <return followed by call>
        <...> = t

    and produces::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    where `t_save_slot` is a newly-created global temporary variable.

    :returns: The resulting kernel (the value of ``TemporarySaver.finish()``)
    """
    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl)

    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)

    final_idx = len(knl.schedule) - 1

    for position, entry in enumerate(knl.schedule):
        if isinstance(entry, CallKernel):
            # Any written temporary that is live-out needs to be read into
            # memory because of the potential for partial writes.
            if position != 0:
                kernel_name = entry.kernel_name
                relevant = (
                        temporaries_read_in_subkernel(knl, kernel_name)
                        | temporaries_written_in_subkernel(knl, kernel_name))
            else:
                # Kernel entry: nothing live
                relevant = set()

            for temporary in liveness[position].live_out & relevant:
                logger.info("reloading {0} at entry of {1}"
                        .format(temporary, entry.kernel_name))
                saver.reload(temporary, entry.kernel_name)

        elif isinstance(entry, ReturnFromKernel):
            if position != final_idx:
                relevant = temporaries_written_in_subkernel(
                        knl, entry.kernel_name)
            else:
                # Kernel exit: nothing live
                relevant = set()

            for temporary in liveness[position].live_in & relevant:
                logger.info("saving {0} before return of {1}"
                        .format(temporary, entry.kernel_name))
                saver.save(temporary, entry.kernel_name)

    return saver.finish()