def get_typed_and_scheduled_translation_unit_uncached(
        self, entrypoint, arg_to_dtype_set):
    """Return ``self.program`` with callables resolved, the given dtypes
    applied to the *entrypoint* kernel, and all entrypoints linearized.

    :arg entrypoint: key used to look up the entry kernel in the program.
    :arg arg_to_dtype_set: iterable of ``(name, dtype)`` pairs. When it is
        empty/falsy, no dtypes are added and type inference is not run
        here.
    :returns: the processed program. If its state is already at least
        ``KernelState.LINEARIZED``, preprocessing and linearization are
        skipped.
    """
    from loopy.kernel import KernelState
    from loopy.kernel.tools import add_dtypes
    from loopy.translation_unit import resolve_callables

    t_unit = resolve_callables(self.program)

    if arg_to_dtype_set:
        entry_knl = t_unit[entrypoint]
        impl_to_arg = entry_knl.impl_arg_to_arg

        # A name found in impl_arg_to_arg is replaced by the ``name`` of
        # the argument it maps to; any other name is used unchanged.
        dtype_by_name = {
            (impl_to_arg[name].name if name in impl_to_arg else name): dtype
            for name, dtype in arg_to_dtype_set}

        t_unit = t_unit.with_kernel(add_dtypes(entry_knl, dtype_by_name))

        from loopy.type_inference import infer_unknown_types
        t_unit = infer_unknown_types(t_unit, expect_completion=True)

    if not (t_unit.state < KernelState.LINEARIZED):
        return t_unit

    from loopy.preprocess import preprocess_program
    t_unit = preprocess_program(t_unit)

    from loopy.schedule import get_one_linearized_kernel
    # The entrypoint collection is read once from the preprocessed program;
    # each iteration then swaps one linearized kernel back in.
    for name in t_unit.entrypoints:
        linearized = get_one_linearized_kernel(
                t_unit[name], t_unit.callables_table)
        t_unit = t_unit.with_kernel(linearized)

    return t_unit
def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
    """Return ``self.kernel`` with the given dtypes applied, types inferred,
    and a schedule attached.

    :arg arg_to_dtype_set: iterable of ``(name, dtype)`` pairs. When it is
        empty/falsy, no dtypes are added and type inference is not run
        here.
    :returns: the processed kernel. Preprocessing and scheduling are only
        performed when ``kernel.schedule`` is *None*.
    """
    from loopy.kernel.tools import add_dtypes

    kernel = self.kernel

    if arg_to_dtype_set:
        var_to_dtype = {}
        for var, dtype in arg_to_dtype_set:
            # A name found in impl_arg_to_arg is replaced by the ``name`` of
            # the argument it maps to; any other name is used unchanged.
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            # Storing into a dict cannot raise KeyError, so no error
            # handling is needed around this assignment.
            # NOTE(review): unknown names are not rejected here; presumably
            # add_dtypes() reports them -- confirm.
            var_to_dtype[dest_name] = dtype

        kernel = add_dtypes(kernel, var_to_dtype)

        from loopy.type_inference import infer_unknown_types
        kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
    """Return ``self.kernel`` with the given dtypes applied, types inferred,
    and a schedule attached.

    :arg arg_to_dtype_set: iterable of ``(name, dtype)`` pairs. When it is
        empty/falsy, no dtypes are added and type inference is not run
        here.
    :returns: the processed kernel. Preprocessing and scheduling are only
        performed when ``kernel.schedule`` is *None*.
    """
    from loopy.kernel.tools import add_dtypes

    kernel = self.kernel

    if arg_to_dtype_set:
        var_to_dtype = {}
        for var, dtype in arg_to_dtype_set:
            # A name found in impl_arg_to_arg is replaced by the ``name`` of
            # the argument it maps to; any other name is used unchanged.
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            # Storing into a dict cannot raise KeyError, so no error
            # handling is needed around this assignment.
            # NOTE(review): unknown names are not rejected here; presumably
            # add_dtypes() reports them -- confirm.
            var_to_dtype[dest_name] = dtype

        kernel = add_dtypes(kernel, var_to_dtype)

        from loopy.type_inference import infer_unknown_types
        kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def add_and_infer_dtypes(knl, dtype_dict):
    """Add the dtypes in *dtype_dict* to *knl*, then infer remaining types.

    :arg dtype_dict: mapping whose keys are comma-separated lists of names.
        Each key is split on ``","``, the pieces are stripped of
        surrounding whitespace, empty pieces are dropped, and every
        remaining name is assigned the key's value.
    :returns: the result of ``infer_unknown_types`` with
        ``expect_completion=True``.
    """
    dtype_by_name = {}
    for names, dtype in six.iteritems(dtype_dict):
        stripped = (piece.strip() for piece in names.split(","))
        dtype_by_name.update((name, dtype) for name in stripped if name)

    typed_knl = add_dtypes(knl, dtype_by_name)

    from loopy.type_inference import infer_unknown_types
    return infer_unknown_types(typed_knl, expect_completion=True)
def specialize_fortran_division(t_unit):
    """Apply ``_specialize_fortran_division_for_kernel`` to the subkernel of
    every :class:`CallableKernel` in *t_unit*.

    Callables are resolved and types inferred first. Callables that are not
    :class:`CallableKernel` instances are carried over unchanged.

    :arg t_unit: must be a :class:`TranslationUnit` (checked by assertion).
    :returns: a copy of the translation unit with the rebuilt callables
        table.
    """
    from loopy.kernel.function_interface import CallableKernel
    from loopy.translation_unit import TranslationUnit, resolve_callables
    from loopy.type_inference import infer_unknown_types

    assert isinstance(t_unit, TranslationUnit)

    t_unit = infer_unknown_types(resolve_callables(t_unit))

    def _specialized(clbl):
        # Only kernel-backed callables have a subkernel to rewrite.
        if not isinstance(clbl, CallableKernel):
            return clbl
        return clbl.copy(
                subkernel=_specialize_fortran_division_for_kernel(
                    clbl.subkernel, t_unit.callables_table))

    return t_unit.copy(callables_table={
        name: _specialized(clbl)
        for name, clbl in t_unit.callables_table.items()})
def auto_test_vs_ref(ref_knl, ctx, test_knl=None, op_count=[], op_label=[],
                     parameters={}, print_ref_code=False, print_code=True,
                     warmup_rounds=2, dump_binary=False,
                     fills_entire_output=None, do_check=True,
                     check_result=None, max_test_kernel_count=1,
                     quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    The reference kernel is compiled and (if checking is enabled) run once
    on the first usable device from
    ``_enumerate_cl_devices_for_ref_test``. Each test kernel is then
    compiled for *ctx*, run for *warmup_rounds* warm-up rounds (during
    which results are compared against the reference), and timed.

    :arg ctx: the OpenCL context in which the test kernels are run.
    :arg test_knl: kernel to test. If *None*, *ref_knl* is used and
        checking is disabled.
    :arg op_count: list of operation counts, paired element-wise with
        *op_label* to print rates. A bare number is accepted with a
        warning.
    :arg op_label: list of labels for *op_count*. A bare string is accepted
        with a warning.
    :arg parameters: passed on to ``make_ref_args`` and ``make_args``.
    :arg print_ref_code: print the reference code (unless *quiet*).
    :arg print_code: print the test kernel code (unless *quiet*).
    :arg warmup_rounds: number of warm-up runs; also the initial number of
        timing rounds (at least one timing round is always used).
    :arg dump_binary: print the first binary of the CL program, if
        available (unless *quiet*).
    :arg fills_entire_output: deprecated; only triggers a warning.
    :arg do_check: whether to run the reference and compare results.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress printed output.
    :arg blacklist_ref_vendors: passed on to
        ``_enumerate_cl_devices_for_ref_test``.

    :returns: a dict with keys ``elapsed_event``, ``elapsed_event_marker``,
        ``elapsed_wall`` and ``timing_rounds`` (from the last test kernel
        run), plus ``ref_elapsed_event`` and ``ref_elapsed_wall`` when
        checking is enabled.
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device could run the reference.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """
    # NOTE(review): the mutable defaults are kept for interface
    # compatibility. They are not mutated in this function, but
    # *parameters* and *blacklist_ref_vendors* are handed to helpers --
    # confirm those do not mutate them either.

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
        isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors,
                                                  need_ref_image_support):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used for the reference.
        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next device.
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev,
                    75 * "-",
                    traceback.format_exc(),
                    75 * "-"
                    ]))

                continue
            else:
                raise

        found_ref_device = True

        # Without checking, only the argument data is needed; skip the run.
        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END - ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n" + "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.LINEARIZED
            ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        # Log the name of the kernel under test, not the reference-schedule
        # loop variable left over from above.
        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # Compare only the common prefix of both arrays.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start each timing pass with a fresh event list so that
            # elapsed_event spans exactly the `timing_rounds` launches of
            # this pass and not those of earlier, shorter passes.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START
                            - 1e-9 * evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            # Quadruple the round count until one pass takes at least
            # 0.3 s of wall time in total.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                    ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def generate_code_v2(kernel):
    """Generate code for *kernel*, preprocessing and scheduling it first if
    that has not happened yet.

    When ``loopy.CACHING_ENABLED`` is set, the result is looked up in and
    stored to ``code_gen_cache``, keyed on the scheduled input kernel.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the scheduled state after
        the preparation steps.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Remember the kernel as it was at lookup time: the same object is
        # used as the key when the result is stored below.
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    # Loop-invariant: query the set of written variables once instead of
    # once per argument.
    written_variables = kernel.get_written_variables()

    for arg in kernel.args:
        is_written = arg.name in written_variables
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    # Complex support is enabled as soon as any argument or temporary has a
    # dtype involving complex numbers.
    allow_complex = any(
            var.dtype.involves_complex()
            for var in (
                kernel.args
                + list(six.itervalues(kernel.temporary_variables))))

    # }}}

    # These sets are shared with the code generation state, which fills
    # them in; they are consumed by the preamble generators below.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    # NOTE(review): this check lives inside an assert and is therefore
    # skipped entirely when Python runs with -O.
    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
    """Apply *dtype_dict* to *knl* via ``_add_dtypes_overdetermined`` and
    return the kernel after type inference with ``expect_completion=True``.
    """
    from loopy.type_inference import infer_unknown_types

    typed_knl = _add_dtypes_overdetermined(knl, dtype_dict)
    return infer_unknown_types(typed_knl, expect_completion=True)
def preprocess_kernel(kernel, device=None):
    """Run the preprocessing passes on *kernel* and mark the result as
    ``kernel_state.PREPROCESSED``.

    A kernel whose state is already at least ``PREPROCESSED`` is returned
    unchanged. When ``loopy.CACHING_ENABLED`` is set, results are looked up
    in and stored to ``preprocess_cache``.

    :arg device: deprecated; passing anything other than *None* only emits
        a :class:`DeprecationWarning`.
    :raises LoopyError: if an iname of the kernel still carries an
        automatically-assigned local axis tag.
    """
    if device is not None:
        from warnings import warn
        warn("passing 'device' to preprocess_kernel() is deprecated",
                DeprecationWarning, stacklevel=2)

    from loopy.kernel import kernel_state
    if kernel.state >= kernel_state.PREPROCESSED:
        return kernel

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            cached = preprocess_cache[kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: preprocess cache hit" % kernel.name)
            return cached

    # }}}

    logger.info("%s: preprocess start" % kernel.name)

    from loopy.check import check_identifiers_in_subst_rules
    check_identifiers_in_subst_rules(kernel)

    # {{{ check that there are no l.auto-tagged inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    has_auto_local_axes = any(
            isinstance(tag, AutoLocalIndexTagBase)
            and iname in kernel.all_inames()
            for iname, tag in six.iteritems(kernel.iname_to_tag))
    if has_auto_local_axes:
        raise LoopyError("kernel with automatically-assigned "
                "local axes passed to preprocessing")

    # }}}

    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    # Ordering restriction:
    # Type inference and reduction iname uniqueness don't handle
    # substitutions, so those are expanded first (above).

    kernel = infer_unknown_types(kernel, expect_completion=False)

    check_for_writes_to_predicates(kernel)
    check_reduction_iname_uniqueness(kernel)

    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    kernel = apply_single_writer_depencency_heuristic(kernel)

    # Ordering restrictions:
    #
    # - realize_reduction must happen after type inference because it needs
    #   to be able to determine the types of the reduced expressions.
    #
    # - realize_reduction must happen after default dependencies are added
    #   because it manipulates the depends_on field, which could prevent
    #   defaults from being applied.

    kernel = realize_reduction(kernel, unknown_types_ok=False)

    # Ordering restriction:
    # add_axes_to_temporaries_for_ilp_and_vec comes after realize_reduction
    # because reduction accumulators need to be duplicated by it.

    from loopy.transform.ilp import add_axes_to_temporaries_for_ilp_and_vec

    # The remaining kernel-to-kernel passes, applied strictly in this order.
    # boostability (find_idempotence, limit_boostability) should be removed
    # in 2017.x.
    for kernel_pass in (
            add_axes_to_temporaries_for_ilp_and_vec,
            find_temporary_scope,
            find_idempotence,
            limit_boostability):
        kernel = kernel_pass(kernel)

    kernel = kernel.target.preprocess(kernel)

    logger.info("%s: preprocess done" % kernel.name)

    kernel = kernel.copy(
            state=kernel_state.PREPROCESSED)

    # {{{ prepare for caching

    # PicklableDtype instances for example need to know the target they're
    # working towards in order to pickle and unpickle them. This is the
    # first pass that uses caching, so we need to be ready to pickle. This
    # means propagating this target information.

    if CACHING_ENABLED:
        input_kernel = prepare_for_caching(input_kernel)
        kernel = prepare_for_caching(kernel)

        preprocess_cache[input_kernel] = kernel

    # }}}

    return kernel
def generate_code_v2(kernel):
    """Generate code for *kernel*, preprocessing and scheduling it first if
    that has not happened yet.

    When ``loopy.CACHING_ENABLED`` is set, the result is looked up in and
    stored to ``code_gen_cache``, keyed on the scheduled input kernel.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the scheduled state after
        the preparation steps.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """

    from loopy.kernel import KernelState
    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Remember the kernel as it was at lookup time: the same object is
        # used as the key when the result is stored below.
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    # Loop-invariant: query the set of written variables once instead of
    # once per argument.
    written_variables = kernel.get_written_variables()

    for arg in kernel.args:
        is_written = arg.name in written_variables
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    # Complex support is enabled as soon as any argument or temporary has a
    # dtype involving complex numbers.
    allow_complex = any(
            var.dtype.involves_complex()
            for var in (
                kernel.args
                + list(six.itervalues(kernel.temporary_variables))))

    # }}}

    # These sets are shared with the code generation state, which fills
    # them in; they are consumed by the preamble generators below.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    # NOTE(review): this check lives inside an assert and is therefore
    # skipped entirely when Python runs with -O.
    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes,
            codegen_state=codegen_state
            )

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains isn't
    # needed.
    from loopy.tools import LazilyUnpicklingDict
    codegen_result = codegen_result.copy(
            implemented_domains=LazilyUnpicklingDict(
                    codegen_result.implemented_domains))

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(input_kernel, codegen_result)

    return codegen_result
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    The reference kernel is compiled and (if checking is enabled) run once
    on the first usable device from
    ``_enumerate_cl_devices_for_ref_test``. Each test kernel is then
    compiled for *ctx*, run for *warmup_rounds* warm-up rounds (during
    which results are compared against the reference), and timed.

    :arg ctx: the OpenCL context in which the test kernels are run.
    :arg test_knl: kernel to test. If *None*, *ref_knl* is used and
        checking is disabled.
    :arg op_count: list of operation counts, paired element-wise with
        *op_label* to print rates. A bare number is accepted with a
        warning.
    :arg op_label: list of labels for *op_count*. A bare string is accepted
        with a warning.
    :arg parameters: passed on to ``make_ref_args`` and ``make_args``.
    :arg print_ref_code: print the reference code (unless *quiet*).
    :arg print_code: print the test kernel code (unless *quiet*).
    :arg warmup_rounds: number of warm-up runs; also the initial number of
        timing rounds (at least one timing round is always used).
    :arg dump_binary: print the first binary of the CL program, if
        available (unless *quiet*).
    :arg fills_entire_output: deprecated; only triggers a warning.
    :arg do_check: whether to run the reference and compare results.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress printed output.
    :arg blacklist_ref_vendors: passed on to
        ``_enumerate_cl_devices_for_ref_test``.

    :returns: a dict with keys ``elapsed_event``, ``elapsed_event_marker``,
        ``elapsed_wall`` and ``timing_rounds`` (from the last test kernel
        run), plus ``ref_elapsed_event`` and ``ref_elapsed_wall`` when
        checking is enabled.
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device could run the reference.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """
    # NOTE(review): the mutable defaults are kept for interface
    # compatibility. They are not mutated in this function, but
    # *parameters* and *blacklist_ref_vendors* are handed to helpers --
    # confirm those do not mutate them either.

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used for the reference.
        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75*"-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next device.
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        # Without checking, only the argument data is needed; skip the run.
        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.SCHEDULED]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75*"-")

        # Log the name of the kernel under test, not the reference-schedule
        # loop variable left over from above.
        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # Compare only the common prefix of both arrays.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        # At least one timing round is required: with zero rounds there
        # would be no events to read and the averages would divide by zero.
        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start each timing pass with a fresh event list so that
            # elapsed_event spans exactly the `timing_rounds` launches of
            # this pass and not those of earlier, shorter passes.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            # Quadruple the round count until one pass takes at least
            # 0.3 s of wall time in total.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def __call__(self, knl):
    """Mark selected instructions of *knl* as atomic and rescale split
    ones, then delegate to the parent specialization.

    Instructions whose id is in ``self.atomic_ids`` become atomic updates,
    those in ``self.init_ids`` become atomic inits, and those in
    ``self.split_ids`` have their expression divided by
    ``min(self.vec_width, self.split_size)``. Variables written atomically
    get an atomic dtype.
    """
    from loopy.type_inference import infer_unknown_types
    from loopy.types import to_loopy_type
    from pymbolic.primitives import Sum

    # Work on shallow copies; both are taken before type inference.
    new_insns = knl.instructions[:]
    new_args = knl.args[:]

    # we generally need to infer the dtypes for temporary variables at this
    # stage in case any of them are atomic
    knl = infer_unknown_types(knl, expect_completion=True)
    new_temps = knl.temporary_variables.copy()

    def _check_atomic_data(insn):
        """Ensure the first variable written by *insn* has an atomic dtype
        and return its name."""
        written = insn.assignee_var_names()[0]
        arg_pos = next(
            (pos for pos, arg in enumerate(new_args) if arg.name == written),
            None)

        if arg_pos is None:
            # not a kernel argument, so it has to be a temporary
            assert written in new_temps, (
                'Cannot find written atomic variable: {}'.format(written))
            temp = new_temps[written]
            if not isinstance(temp.dtype, AtomicType):
                new_temps[written] = temp.copy(
                    dtype=to_loopy_type(temp.dtype, for_atomic=True))
        elif not isinstance(new_args[arg_pos].dtype, AtomicType):
            arg = new_args[arg_pos]
            assert arg.dtype is not None, (
                "Change of dtype to atomic doesn't work if base dype is not"
                " populated")
            new_args[arg_pos] = arg.copy(for_atomic=True)

        return written

    for pos, insn in enumerate(new_insns):
        if insn.id in self.atomic_ids:
            # force the insn to an atomic update
            written = _check_atomic_data(insn)
            new_insns[pos] = insn.copy(
                atomicity=(lp.AtomicUpdate(written), ))
            continue

        if insn.id in self.init_ids:
            # setup an atomic init
            written = _check_atomic_data(insn)
            new_insns[pos] = insn.copy(
                atomicity=(lp.AtomicInit(written), ))
            continue

        if insn.id not in self.split_ids:
            continue

        accumulates = (isinstance(insn.expression, Sum)
                       and insn.assignee in insn.expression.children)
        if not accumulates:
            # plain assignment: can simply divide
            div_size = np.minimum(self.vec_width, self.split_size)
            new_insns[pos] = insn.copy(expression=insn.expression / div_size)
            continue

        written = _check_atomic_data(insn)
        # collect the summands that are not the assignee and re-sum them
        others = Sum(tuple(
            term for term in insn.expression.children
            if term != insn.assignee))
        # implement the split as a += sum(others) / div_size
        div_size = np.minimum(self.vec_width, self.split_size)
        new_insns[pos] = insn.copy(
            expression=insn.assignee + others / div_size,
            atomicity=(lp.AtomicUpdate(written), ))

    # now force all instructions into inner loop
    updated_knl = knl.copy(
        instructions=new_insns, args=new_args, temporary_variables=new_temps)
    return super(atomic_deep_specialization, self).__call__(updated_knl)