def __call__(self, kernel, codegen_result):
    """
    Build the Python wrapper that invokes *kernel* on this execution target.

    :arg kernel: the loopy :class:`LoopKernel` to be executed
    :arg codegen_result: the loopy :class:`CodeGenerationResult` created
        by code generation
    :returns: a Python callable that handles execution of this kernel
    """
    opts = kernel.options
    data_info = codegen_result.implemented_data_info

    from loopy.kernel.data import KernelArgument

    # The wrapper takes the target's system arguments followed by one
    # optional ("name=None") keyword argument per kernel argument.
    wrapper_name = "invoke_%s_loopy_kernel" % kernel.name
    wrapper_args = self.system_args + [
        "%s=None" % info.name
        for info in data_info
        if issubclass(info.arg_class, KernelArgument)]
    gen = PythonFunctionGenerator(wrapper_name, wrapper_args)

    # Preamble: __future__ import, target-specific setup and host code,
    # each section followed by an empty line.
    gen.add_to_preamble("from __future__ import division")
    gen.add_to_preamble("")
    self.target_specific_preamble(gen)
    gen.add_to_preamble("")
    self.generate_host_code(gen, codegen_result)
    gen.add_to_preamble("")

    self.initialize_system_args(gen)

    # These steps share one signature and must run in exactly this order,
    # since each one emits code into *gen*.
    arg_processing_steps = (
        self.generate_integer_arg_finding_from_shapes,
        self.generate_integer_arg_finding_from_offsets,
        self.generate_integer_arg_finding_from_strides,
        self.generate_value_arg_check,
    )
    for emit_step in arg_processing_steps:
        emit_step(gen, kernel, data_info)

    invocation_args = self.generate_arg_setup(gen, kernel, data_info, opts)

    self.generate_invocation(
        gen, codegen_result.host_program.name, invocation_args,
        kernel, data_info)

    self.generate_output_handler(gen, opts, kernel, data_info)

    if opts.write_wrapper:
        wrapper_source = gen.get()
        if opts.highlight_wrapper:
            wrapper_source = get_highlighted_python_code(wrapper_source)

        # ``write_wrapper is True`` means "print the source"; any other
        # truthy value is passed to open() as the output file.
        if opts.write_wrapper is True:
            print(wrapper_source)
        else:
            with open(opts.write_wrapper, "w") as outf:
                outf.write(wrapper_source)

    return gen.get_picklable_function()
def _cache_kernel_stats(self, t_unit: lp.TranslationUnit,
                        kwargs: dict) -> tuple:
    """Generate and cache the kernel stats for a program with its args.

    :arg t_unit: the translation unit to analyze; the statistics are
        gathered for its default entrypoint.
    :arg kwargs: the arguments of the kernel invocation, keyed by name.
    :returns: the key under which the statistics are stored in
        ``self.kernel_stats[t_unit]``: a tuple holding ``(name, shape)``
        for every argument with a ``shape`` attribute and
        ``(name, value)`` for every other argument.
    """
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache? Only the lookup is guarded, so
    # that a failure while computing the stats below is not reported as
    # having happened "during handling" of this KeyError.
    try:
        self.kernel_stats[t_unit][args_tuple]
    except KeyError:
        pass  # cache miss: calculate and cache the stats below
    else:
        return args_tuple

    ep_name = t_unit.default_entrypoint.name
    executor = t_unit.target.get_kernel_executor(t_unit, self.queue,
                                                 entrypoint=ep_name)

    # Both executor queries take the same dtype set; compute it once.
    arg_dtypes = executor.arg_to_dtype_set(kwargs)
    info = executor.translation_unit_info(ep_name, arg_dtypes)
    typed_t_unit = executor.get_typed_and_scheduled_translation_unit(
        ep_name, arg_dtypes)
    kernel = typed_t_unit[ep_name]

    idi = info.implemented_data_info

    # The generated function takes every kernel argument and every
    # implemented data item as a parameter; those the caller did not
    # supply are passed as None.
    param_dict = kwargs.copy()
    param_dict.update({
        k: None for k in kernel.arg_dict if k not in param_dict})
    param_dict.update(
        {d.name: None for d in idi if d.name not in param_dict})

    # Generate the wrapper code
    wrapper = executor.get_wrapper_generator()

    gen = PythonFunctionGenerator("_mcom_gen_args_profile", list(param_dict))

    wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
    wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
    wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

    # The generated function returns a dict mapping each kernel parameter
    # name to its value.
    param_names = kernel.all_params()
    gen("return {%s}" % ", ".join(
        f"{name!r}: {name}" for name in param_names))

    # Run the wrapper code, save argument values in domain_params
    domain_params = gen.get_picklable_function()(**param_dict)

    # Get flops/memory statistics
    op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess")
    bytes_accessed = lp.get_mem_access_map(
        typed_t_unit, subgroup_size="guess") \
        .to_bytes().eval_and_sum(domain_params)

    flops = op_map.filter_by(
        dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

    # Footprint gathering is not yet available in loopy with
    # kernel callables:
    # https://github.com/inducer/loopy/issues/399
    # Until it is, no footprint is reported. Once available, it can be
    # computed as the sum of footprint[k].eval_with_dict(domain_params)
    # over footprint = lp.gather_access_footprint_bytes(typed_t_unit),
    # falling back to None on lp.symbolic.UnableToDetermineAccessRange.
    footprint_bytes = None

    # The time is recorded as 0 here; only the static statistics are known.
    res = SingleCallKernelProfile(
        time=0, flops=flops, bytes_accessed=bytes_accessed,
        footprint_bytes=footprint_bytes)

    self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res

    # Register the per-kernel logging quantity once per entrypoint name.
    if self.logmgr and f"{ep_name}_time" not in self.logmgr.quantity_data:
        self.logmgr.add_quantity(KernelProfile(self, ep_name))

    return args_tuple
def __call__(self, kernel, codegen_result):
    """
    Generate the wrapping Python invoker for this execution target.

    :arg kernel: the loopy :class:`LoopKernel` to be executed
    :arg codegen_result: the loopy :class:`CodeGenerationResult` created
        by code generation
    :returns: a Python callable that handles execution of this kernel
    """
    kernel_options = kernel.options
    idis = codegen_result.implemented_data_info

    from loopy.kernel.data import KernelArgument

    function_name = "invoke_%s_loopy_kernel" % kernel.name

    # System arguments come first; each kernel argument follows as an
    # optional keyword argument defaulting to None.
    signature = self.system_args + [
        "%s=None" % entry.name
        for entry in idis
        if issubclass(entry.arg_class, KernelArgument)]

    gen = PythonFunctionGenerator(function_name, signature)

    # --- preamble ---------------------------------------------------------
    gen.add_to_preamble("from __future__ import division")
    gen.add_to_preamble("")
    self.target_specific_preamble(gen)
    gen.add_to_preamble("")
    self.generate_host_code(gen, codegen_result)
    gen.add_to_preamble("")

    # --- wrapper body -----------------------------------------------------
    self.initialize_system_args(gen)

    self.generate_integer_arg_finding_from_shapes(gen, kernel, idis)
    self.generate_integer_arg_finding_from_offsets(gen, kernel, idis)
    self.generate_integer_arg_finding_from_strides(gen, kernel, idis)
    self.generate_value_arg_check(gen, kernel, idis)

    call_args = self.generate_arg_setup(gen, kernel, idis, kernel_options)

    host_program_name = codegen_result.host_program.name
    self.generate_invocation(gen, host_program_name, call_args, kernel, idis)

    self.generate_output_handler(gen, kernel_options, kernel, idis)

    # --- optional dump of the generated source ----------------------------
    destination = kernel_options.write_wrapper
    if destination:
        text = gen.get()
        if kernel_options.highlight_wrapper:
            text = get_highlighted_python_code(text)

        if destination is True:
            # A literal True requests printing instead of writing a file.
            print(text)
        else:
            with open(destination, "w") as outf:
                outf.write(text)

    return gen.get_picklable_function()
def _cache_kernel_stats(self, program: lp.kernel.LoopKernel,
                        kwargs: dict) -> tuple:
    """Generate and cache the kernel stats for a program with its args.

    :arg program: the loopy kernel to analyze.
    :arg kwargs: the arguments of the kernel invocation, keyed by name.
    :returns: the key under which the statistics are stored in
        ``self.kernel_stats[program]``: a tuple holding ``(name, shape)``
        for every argument with a ``shape`` attribute and
        ``(name, value)`` for every other argument.
    """
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache? Only the lookup is guarded, so
    # that a failure while computing the stats below is not reported as
    # having happened "during handling" of this KeyError.
    try:
        self.kernel_stats[program][args_tuple]
    except KeyError:
        pass  # cache miss: calculate and cache the stats below
    else:
        return args_tuple

    executor = program.target.get_kernel_executor(program, self.queue)

    # Both executor queries take the same dtype set; compute it once.
    arg_dtypes = executor.arg_to_dtype_set(kwargs)
    info = executor.kernel_info(arg_dtypes)
    kernel = executor.get_typed_and_scheduled_kernel(arg_dtypes)

    idi = info.implemented_data_info

    # Arguments that carry a dtype other than ``object``; handed to
    # lp.add_and_infer_dtypes() below.
    types = {
        k: v for k, v in kwargs.items()
        if hasattr(v, "dtype") and v.dtype != object}

    # The generated function takes every kernel argument and every
    # implemented data item as a parameter; those the caller did not
    # supply are passed as None.
    param_dict = kwargs.copy()
    param_dict.update({
        k: None for k in kernel.arg_dict if k not in param_dict})
    param_dict.update(
        {d.name: None for d in idi if d.name not in param_dict})

    # Generate the wrapper code
    wrapper = executor.get_wrapper_generator()

    gen = PythonFunctionGenerator("_mcom_gen_args_profile", list(param_dict))

    wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
    wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
    wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

    # The generated function returns a dict mapping each parameter name of
    # *program* to its value.
    param_names = program.all_params()
    gen("return {%s}" % ", ".join(
        f"{name!r}: {name}" for name in param_names))

    # Run the wrapper code, save argument values in domain_params
    domain_params = gen.get_picklable_function()(**param_dict)

    # Get flops/memory statistics
    kernel = lp.add_and_infer_dtypes(kernel, types)
    op_map = lp.get_op_map(kernel, subgroup_size="guess")
    bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \
        .to_bytes().eval_and_sum(domain_params)

    flops = op_map.filter_by(
        dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

    # The footprint is optional: report None when loopy cannot determine
    # the access range.
    try:
        footprint = lp.gather_access_footprint_bytes(kernel)
        footprint_bytes = sum(
            footprint[k].eval_with_dict(domain_params) for k in footprint)
    except lp.symbolic.UnableToDetermineAccessRange:
        footprint_bytes = None

    # The time is recorded as 0 here; only the static statistics are known.
    res = ProfileResult(
        time=0, flops=flops, bytes_accessed=bytes_accessed,
        footprint_bytes=footprint_bytes)

    self.kernel_stats.setdefault(program, {})[args_tuple] = res
    return args_tuple