def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
    """Return ``self.kernel`` with the requested dtypes applied and a schedule.

    :arg var_to_dtype_set: an iterable of ``(name, dtype)`` pairs, or a
        falsy value if no dtypes need to be set. A name that is a key of
        ``kernel.impl_arg_to_arg`` is translated to the ``name`` attribute of
        the entry stored there; any other name is used unchanged.
    :returns: the kernel after dtype assignment and type inference (both
        only performed if *var_to_dtype_set* is truthy) and after
        preprocessing and scheduling (both only performed if
        ``kernel.schedule`` is *None*).
    """
    kernel = self.kernel

    if var_to_dtype_set:
        # Only needed on this path, so a call that sets no dtypes does not
        # pay for (or depend on) the import.
        from loopy.kernel.tools import add_dtypes

        var_to_dtype = {}
        for var, dtype in var_to_dtype_set:
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            # Item assignment on a plain dict cannot raise KeyError, so the
            # "cannot set type for ..." handler that used to wrap this line
            # was unreachable and has been dropped.
            # NOTE(review): unknown names are presumably rejected by
            # add_dtypes() below -- confirm.
            var_to_dtype[dest_name] = dtype

        kernel = add_dtypes(kernel, var_to_dtype)

        from loopy.preprocess import infer_unknown_types
        kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def test_kernel_splitting_with_loop(ctx_factory):
    """Split iname ``i`` of a two-instruction kernel and check the result.

    The kernel is split with tags ``g.0``/``l.0``, preprocessed, scheduled
    and run through ``lp.generate_code_v2``. The test asserts that exactly
    two device programs are generated and then compares the transformed
    kernel against the untransformed one via ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    arg_dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    ref_knl = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """),
        arg_dtypes)

    split_knl = lp.split_iname(
        ref_knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel
    scheduled_knl = get_one_scheduled_kernel(preprocess_kernel(split_knl))

    # map schedule onto host or device
    print(scheduled_knl)

    codegen_result = lp.generate_code_v2(scheduled_knl)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, scheduled_knl, parameters={"n": 5})
def generate_body(kernel):
    """Generate and return the body code string for *kernel*.

    A kernel without a schedule is scheduled first. Unknown types are then
    inferred, the pre-codegen checks are run, and code generation is
    delegated to ``kernel.target.generate_body``.

    :returns: the code string produced by the target.
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state.
    """
    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    from loopy.preprocess import infer_unknown_types
    from loopy.check import pre_codegen_checks
    kernel = infer_unknown_types(kernel, expect_completion=True)
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # Complex support is switched on as soon as any argument or temporary
    # has a dtype of kind "c". The list is built in full so that every
    # variable's dtype is looked at.
    all_vars = kernel.args + list(six.itervalues(kernel.temporary_variables))
    uses_complex = any([var.dtype.kind == "c" for var in all_vars])

    dtypes_seen = set()
    functions_seen = set()

    param_domain = isl.BasicSet.from_params(kernel.assumptions)
    state = CodeGenerationState(
        kernel=kernel,
        implemented_domain=param_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=dtypes_seen,
        seen_functions=functions_seen,
        var_subst_map={},
        allow_complex=uses_complex,
    )

    body_code, domains = kernel.target.generate_body(kernel, state)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, domains, body_code)

    logger.info("%s: generate code: done" % kernel.name)

    return body_code
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    """Like the plain loop-splitting test, but with two private temporaries.

    The test calls ``pytest.xfail`` right after creating the context, with
    the reason "spilling doesn't yet use local axes". The remaining
    statements build the kernel, mark both temporaries as ``private``, split
    iname ``i``, schedule, generate code, and compare against the unsplit
    kernel.
    """
    ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    arg_dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    ref_knl = lp.add_and_infer_dtypes(
        lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            <> t_private_scalar = a[k,i+1]
            <> t_private_array[i % 2] = a[k,i+1]
            c[k,i] = a[k,i+1]
            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
            """),
        arg_dtypes)

    for temp_name in ("t_private_scalar", "t_private_array"):
        ref_knl = lp.set_temporary_scope(ref_knl, temp_name, "private")

    split_knl = lp.split_iname(
        ref_knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel
    scheduled_knl = get_one_scheduled_kernel(preprocess_kernel(split_knl))

    # map schedule onto host or device
    print(scheduled_knl)

    codegen_result = lp.generate_code_v2(scheduled_knl)
    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, scheduled_knl, parameters={"n": 5})
def test_kernel_splitting_with_loop(ctx_factory):
    """Check code generation for a kernel whose iname ``i`` has been split.

    Builds a two-instruction kernel, keeps the typed but untransformed
    kernel as the reference, splits ``i`` by 128 onto ``g.0``/``l.0``,
    schedules, and asserts that ``lp.generate_code_v2`` yields exactly two
    device programs. Finally the transformed kernel is compared with the
    reference using ``n=5``.
    """
    ctx = ctx_factory()

    unsplit = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)
    float_args = dict.fromkeys(("a", "c", "out"), np.float32)
    ref_knl = lp.add_and_infer_dtypes(unsplit, dict(float_args, n=np.int32))

    transformed = lp.split_iname(
        ref_knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    transformed = preprocess_kernel(transformed)
    from loopy.schedule import get_one_scheduled_kernel
    transformed = get_one_scheduled_kernel(transformed)

    # map schedule onto host or device
    print(transformed)

    result = lp.generate_code_v2(transformed)
    assert len(result.device_programs) == 2

    for code in (result.device_code(), result.host_code()):
        print(code)

    lp.auto_test_vs_ref(ref_knl, ctx, transformed, parameters=dict(n=5))
def generate_code_v2(kernel):
    """Generate code for *kernel*.

    A kernel in the ``INITIAL`` state is preprocessed first and a kernel
    without a schedule is scheduled first. When ``CACHING_ENABLED`` is set,
    a result already stored in ``code_gen_cache`` for the scheduled kernel
    is returned immediately, and a newly generated result is stored there
    under the same key.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
        after the steps above.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """
    from loopy.kernel import kernel_state

    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The kernel as it is *before* type inference serves as cache key.
        cache_key = kernel
        try:
            cached = code_gen_cache[cache_key]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return cached

    # }}}

    from loopy.type_inference import infer_unknown_types
    from loopy.check import pre_codegen_checks
    kernel = infer_unknown_types(kernel, expect_completion=True)
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    data_info = []
    for arg in kernel.args:
        written = arg.name in kernel.get_written_variables()

        if isinstance(arg, ArrayBase):
            # decl_info() yields several entries, hence extend().
            data_info.extend(arg.decl_info(
                kernel.target,
                is_written=written,
                index_dtype=kernel.index_dtype))
            continue

        if isinstance(arg, ValueArg):
            data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=written))
            continue

        raise ValueError("argument type not understood: '%s'" % type(arg))

    # Every argument and temporary is inspected; one complex-valued dtype is
    # enough to enable complex support.
    all_vars = kernel.args + list(six.itervalues(kernel.temporary_variables))
    uses_complex = any([var.dtype.involves_complex() for var in all_vars])

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    param_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_data_info=data_info,
        implemented_domain=param_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        seen_atomic_dtypes=seen_atomic_dtypes,
        var_subst_map={},
        allow_complex=uses_complex,
        var_name_generator=kernel.get_var_name_generator(),
        is_generating_device_code=False,
        gen_program_name=(
            kernel.target.host_program_name_prefix
            + kernel.name
            + kernel.target.host_program_name_suffix),
        schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
        codegen_state, schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(
        kernel, codegen_result.implemented_domains, device_code_str)

    # {{{ handle preambles

    # The dtypes of all arguments and temporaries count as "seen".
    seen_dtypes.update(arg.dtype for arg in kernel.args)
    seen_dtypes.update(
        tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
        kernel=kernel,
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        # a set of LoopyTypes (!)
        seen_atomic_dtypes=seen_atomic_dtypes)

    generators = (
        kernel.preamble_generators
        + kernel.target.get_device_ast_builder().preamble_generators())
    for generate_preambles in generators:
        preambles.extend(generate_preambles(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[cache_key] = codegen_result

    return codegen_result
def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
    """Return a string in the `dot <http://graphviz.org/>`_ language depicting
    dependencies among kernel instructions.

    :arg iname_cluster: if true, the kernel's schedule is used to wrap
        instructions in one ``subgraph cluster_<iname>`` per loop. A kernel
        without a schedule is scheduled for this purpose; if that raises
        :class:`RuntimeError`, a warning is issued and clustering is skipped.
    :arg use_insn_id: if true, nodes are labelled with the instruction id
        and the operation text becomes the tooltip; otherwise the roles are
        swapped.
    :raises LoopyError: if the schedule contains an item of an unexpected
        type.
    """

    # make sure all automatically added stuff shows up
    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False)

    if iname_cluster and not kernel.schedule:
        try:
            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)
        except RuntimeError as e:
            iname_cluster = False
            from warnings import warn
            warn("error encountered during scheduling for dep graph -- "
                    "cannot perform iname clustering: %s(%s)"
                    % (type(e).__name__, e))

    # Maps an instruction id to the set of ids it depends on.
    deps = {}
    dot_lines = []

    from loopy.kernel.data import MultiAssignmentBase, CInstruction

    node_template = '"%s" [label="%s",shape="box",tooltip="%s"];'

    for insn in kernel.instructions:
        if isinstance(insn, MultiAssignmentBase):
            op = "%s <- %s" % (insn.assignees, insn.expression)
            # keep overly long operations readable
            if len(op) > 200:
                op = op[:200] + "..."
        elif isinstance(insn, CInstruction):
            op = "<C instruction %s>" % insn.id
        else:
            op = "<instruction %s>" % insn.id

        if use_insn_id:
            insn_label, tooltip = insn.id, op
        else:
            insn_label, tooltip = op, insn.id

        # repr() minus its surrounding quotes serves as the escaped text.
        dot_lines.append(node_template % (
            insn.id,
            repr(insn_label)[1:-1],
            repr(tooltip)[1:-1],
            ))

        for dep in insn.depends_on:
            deps.setdefault(insn.id, set()).add(dep)

    # {{{ O(n^3) transitive reduction

    # first, compute transitive closure by fixed point iteration
    changed = True
    while changed:
        changed = False

        for insn_id in deps:
            for via in deps.get(insn_id, set()).copy():
                for indirect in deps.get(via, set()).copy():
                    if indirect not in deps.get(insn_id, set()):
                        changed = True
                        deps[insn_id].add(indirect)

    # then drop every edge that is also implied by a two-step path
    for insn_id in deps:
        for via in deps.get(insn_id, set()).copy():
            for indirect in deps.get(via, set()).copy():
                if indirect in deps.get(insn_id, set()):
                    deps[insn_id].remove(indirect)

    # }}}

    # Edges point from the dependency to the dependent instruction.
    for insn_id in deps:
        dot_lines.extend(
            "%s -> %s" % (dep_id, insn_id)
            for dep_id in deps.get(insn_id, set()))

    if iname_cluster:
        from loopy.schedule import (
            EnterLoop, LeaveLoop, RunInstruction, Barrier,
            CallKernel, ReturnFromKernel)

        for sched_item in kernel.schedule:
            if isinstance(sched_item, EnterLoop):
                dot_lines.append(
                    'subgraph cluster_%s { label="%s"'
                    % (sched_item.iname, sched_item.iname))
            elif isinstance(sched_item, LeaveLoop):
                dot_lines.append("}")
            elif isinstance(sched_item, RunInstruction):
                dot_lines.append(sched_item.insn_id)
            elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)):
                # these do not show up in the graph
                pass
            else:
                raise LoopyError("schedule item not unterstood: %r" % sched_item)

    body = "\n".join(dot_lines)
    return "digraph %s {\n%s\n}" % (kernel.name, body)
def generate_code_v2(kernel):
    """Generate code for *kernel*.

    A kernel in the ``INITIAL`` state is preprocessed first and a kernel
    without a schedule is scheduled first. When ``CACHING_ENABLED`` is set,
    a result already present in ``code_gen_cache`` for the scheduled kernel
    is returned immediately, and a newly generated result is offered to the
    cache via ``store_if_not_present``.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
        after the steps above.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """
    from loopy.kernel import KernelState

    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The kernel as it is *before* type inference serves as cache key.
        cache_key = kernel
        try:
            cached = code_gen_cache[cache_key]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return cached

    # }}}

    from loopy.type_inference import infer_unknown_types
    from loopy.check import pre_codegen_checks
    kernel = infer_unknown_types(kernel, expect_completion=True)
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    data_info = []
    for arg in kernel.args:
        written = arg.name in kernel.get_written_variables()

        if isinstance(arg, ArrayBase):
            # decl_info() yields several entries, hence extend().
            data_info.extend(arg.decl_info(
                kernel.target,
                is_written=written,
                index_dtype=kernel.index_dtype))
            continue

        if isinstance(arg, ValueArg):
            data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=written))
            continue

        raise ValueError("argument type not understood: '%s'" % type(arg))

    # Every argument and temporary is inspected; one complex-valued dtype is
    # enough to enable complex support.
    all_vars = kernel.args + list(six.itervalues(kernel.temporary_variables))
    uses_complex = any([var.dtype.involves_complex() for var in all_vars])

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    param_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_data_info=data_info,
        implemented_domain=param_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        seen_atomic_dtypes=seen_atomic_dtypes,
        var_subst_map={},
        allow_complex=uses_complex,
        var_name_generator=kernel.get_var_name_generator(),
        is_generating_device_code=False,
        gen_program_name=(
            kernel.target.host_program_name_prefix
            + kernel.name
            + kernel.target.host_program_name_suffix),
        schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
        codegen_state, schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(
        kernel, codegen_result.implemented_domains, device_code_str)

    # {{{ handle preambles

    # The dtypes of all arguments and temporaries count as "seen".
    seen_dtypes.update(arg.dtype for arg in kernel.args)
    seen_dtypes.update(
        tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
        kernel=kernel,
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        # a set of LoopyTypes (!)
        seen_atomic_dtypes=seen_atomic_dtypes,
        codegen_state=codegen_state
        )

    generators = (
        kernel.preamble_generators
        + kernel.target.get_device_ast_builder().preamble_generators())
    for generate_preambles in generators:
        preambles.extend(generate_preambles(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains isn't
    # needed.
    from loopy.tools import LazilyUnpicklingDict
    lazy_domains = LazilyUnpicklingDict(codegen_result.implemented_domains)
    codegen_result = codegen_result.copy(implemented_domains=lazy_domains)

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(cache_key, codegen_result)

    return codegen_result
def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
    """Return a string in the `dot <http://graphviz.org/>`_ language depicting
    dependencies among kernel instructions.

    :arg iname_cluster: if true, the kernel's schedule is used to wrap
        instructions in one ``subgraph cluster_<iname>`` per loop. A kernel
        without a schedule is scheduled for this purpose; if that raises
        :class:`RuntimeError`, a warning is issued and clustering is skipped.
    :arg use_insn_id: if true, nodes are labelled with the instruction id
        and the operation text becomes the tooltip; otherwise the roles are
        swapped.
    :raises LoopyError: if the schedule contains an item of an unexpected
        type.
    """

    # make sure all automatically added stuff shows up
    from loopy.preprocess import add_default_dependencies
    kernel = add_default_dependencies(kernel)

    if iname_cluster and not kernel.schedule:
        try:
            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)
        except RuntimeError as e:
            iname_cluster = False
            from warnings import warn
            warn("error encountered during scheduling for dep graph -- "
                    "cannot perform iname clustering: %s(%s)"
                    % (type(e).__name__, e))

    # Maps an instruction id to the set of ids it depends on.
    deps = {}
    dot_lines = []

    from loopy.kernel.data import MultiAssignmentBase, CInstruction

    node_template = '"%s" [label="%s",shape="box",tooltip="%s"];'

    for insn in kernel.instructions:
        if isinstance(insn, MultiAssignmentBase):
            op = "%s <- %s" % (insn.assignees, insn.expression)
            # keep overly long operations readable
            if len(op) > 200:
                op = op[:200] + "..."
        elif isinstance(insn, CInstruction):
            op = "<C instruction %s>" % insn.id
        else:
            op = "<instruction %s>" % insn.id

        if use_insn_id:
            insn_label, tooltip = insn.id, op
        else:
            insn_label, tooltip = op, insn.id

        # repr() minus its surrounding quotes serves as the escaped text.
        dot_lines.append(node_template % (
            insn.id,
            repr(insn_label)[1:-1],
            repr(tooltip)[1:-1],
            ))

        for dep in insn.depends_on:
            deps.setdefault(insn.id, set()).add(dep)

    # {{{ O(n^3) transitive reduction

    # first, compute transitive closure by fixed point iteration
    changed = True
    while changed:
        changed = False

        for insn_id in deps:
            for via in deps.get(insn_id, set()).copy():
                for indirect in deps.get(via, set()).copy():
                    if indirect not in deps.get(insn_id, set()):
                        changed = True
                        deps[insn_id].add(indirect)

    # then drop every edge that is also implied by a two-step path
    for insn_id in deps:
        for via in deps.get(insn_id, set()).copy():
            for indirect in deps.get(via, set()).copy():
                if indirect in deps.get(insn_id, set()):
                    deps[insn_id].remove(indirect)

    # }}}

    # Edges point from the dependency to the dependent instruction.
    for insn_id in deps:
        dot_lines.extend(
            "%s -> %s" % (dep_id, insn_id)
            for dep_id in deps.get(insn_id, set()))

    if iname_cluster:
        from loopy.schedule import EnterLoop, LeaveLoop, RunInstruction, Barrier

        for sched_item in kernel.schedule:
            if isinstance(sched_item, EnterLoop):
                dot_lines.append(
                    'subgraph cluster_%s { label="%s"'
                    % (sched_item.iname, sched_item.iname))
            elif isinstance(sched_item, LeaveLoop):
                dot_lines.append("}")
            elif isinstance(sched_item, RunInstruction):
                dot_lines.append(sched_item.insn_id)
            elif isinstance(sched_item, Barrier):
                # barriers do not show up in the graph
                pass
            else:
                raise LoopyError("schedule item not unterstood: %r" % sched_item)

    body = "\n".join(dot_lines)
    return "digraph %s {\n%s\n}" % (kernel.name, body)
def generate_code(kernel, device=None):
    """Generate code for *kernel*, with de-duplicated preambles prepended.

    A kernel without a schedule is scheduled first. When ``CACHING_ENABLED``
    is set, a result already stored in ``code_gen_cache`` for the scheduled
    kernel is returned immediately, and a newly generated result is stored
    there under the same key.

    :arg device: deprecated; passing anything other than *None* only
        triggers a :class:`DeprecationWarning`.
    :returns: a tuple ``(code_str, impl_arg_info)``.
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state.
    :raises ValueError: if a kernel argument is neither an ``ArrayBase``
        nor a ``ValueArg``.
    """
    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The kernel as it is *before* type inference serves as cache key.
        cache_key = kernel
        try:
            cached = code_gen_cache[cache_key]
        except KeyError:
            pass
        else:
            logger.info("%s: code generation cache hit" % kernel.name)
            return cached

    # }}}

    from loopy.preprocess import infer_unknown_types
    from loopy.check import pre_codegen_checks
    kernel = infer_unknown_types(kernel, expect_completion=True)
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    impl_arg_info = []
    for arg in kernel.args:
        if isinstance(arg, ArrayBase):
            # decl_info() yields several entries, hence extend().
            impl_arg_info.extend(arg.decl_info(
                kernel.target,
                is_written=arg.name in kernel.get_written_variables(),
                index_dtype=kernel.index_dtype))
            continue

        if isinstance(arg, ValueArg):
            impl_arg_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                cgen_declarator=arg.get_arg_decl(kernel.target),
                arg_class=ValueArg))
            continue

        raise ValueError("argument type not understood: '%s'" % type(arg))

    # Every argument and temporary is inspected; one dtype of kind "c" is
    # enough to enable complex support.
    all_vars = kernel.args + list(six.itervalues(kernel.temporary_variables))
    uses_complex = any([var.dtype.kind == "c" for var in all_vars])

    # }}}

    seen_dtypes = set()
    seen_functions = set()

    param_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_domain=param_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        var_subst_map={},
        allow_complex=uses_complex)

    code_str, implemented_domains = kernel.target.generate_code(
        kernel, codegen_state, impl_arg_info)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains, code_str)

    # {{{ handle preambles

    # The dtypes of all arguments and temporaries count as "seen".
    seen_dtypes.update(arg.dtype for arg in kernel.args)
    seen_dtypes.update(
        tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    generators = (
        kernel.preamble_generators + kernel.target.preamble_generators())
    for generate_preambles in generators:
        preambles.extend(
            generate_preambles(kernel, seen_dtypes, seen_functions))

    # Preambles are (tag, code) pairs: order them by tag and keep only the
    # first one encountered for each tag.
    used_tags = set()
    unique_preambles = []
    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag not in used_tags:
            used_tags.add(tag)
            unique_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_text = "".join(
        [remove_common_indentation(lines) + "\n"
            for lines in unique_preambles])

    code_str = preamble_text + code_str

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    result = (code_str, impl_arg_info)

    if CACHING_ENABLED:
        code_gen_cache[cache_key] = result

    return result
""" for i, k ... gbarrier c[k,i] = a[k, i + 1] ... gbarrier out[k,i] = c[k,i] end """, seq_dependencies=True) # transform knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) # schedule from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"], knl.callables_table)) # map schedule onto host or device print(knl) cgr = lp.generate_code_v2(knl) print(cgr.device_code()) print(cgr.host_code())
# Build a kernel with explicit global barriers, split iname "i", assign
# dtypes, schedule it and print the generated device and host code.
# The names ``knl`` and ``cgr`` are kept as they are in case later code
# refers to them.
knl = lp.make_kernel(
    "{ [i,k]: 0<=i<n and 0<=k<3 }",
    """
    for i, k
        ... gbarrier
        c[k,i] = a[k, i + 1]
        ... gbarrier
        out[k,i] = c[k,i]
    end
    """,
    seq_dependencies=True)

# transform
knl = lp.add_and_infer_dtypes(
    lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0"),
    {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    })

# schedule
from loopy.preprocess import preprocess_kernel
from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(preprocess_kernel(knl))

# map schedule onto host or device
print(knl)

cgr = lp.generate_code_v2(knl)

for generated in (cgr.device_code(), cgr.host_code()):
    print(generated)