def get_insn_ids_for_block_at(self, sched_index):
    """
    Cached variant of :func:`loopy.schedule.get_insn_ids_for_block_at`.
    """
    from loopy.schedule import get_insn_ids_for_block_at
    return get_insn_ids_for_block_at(
            self.kernel_proxy.schedule, sched_index)

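# A minimal standalone sketch (not loopy's implementation) of what
# get_insn_ids_for_block_at computes: given a schedule and the index of a
# CallKernel item, collect the ids of all RunInstruction items up to the
# matching ReturnFromKernel. The namedtuple stand-ins below are hypothetical,
# used only so this sketch runs without loopy.
from collections import namedtuple

CallKernel = namedtuple("CallKernel", "kernel_name")
RunInstruction = namedtuple("RunInstruction", "insn_id")
ReturnFromKernel = namedtuple("ReturnFromKernel", "kernel_name")

def _sketch_insn_ids_for_block_at(schedule, sched_index):
    assert isinstance(schedule[sched_index], CallKernel)
    insn_ids = set()
    depth = 0
    for item in schedule[sched_index:]:
        if isinstance(item, CallKernel):
            depth += 1
        elif isinstance(item, ReturnFromKernel):
            depth -= 1
            if depth == 0:
                return frozenset(insn_ids)
        elif isinstance(item, RunInstruction):
            insn_ids.add(item.insn_id)
    raise ValueError("unmatched CallKernel")

_sched = [CallKernel("k"), RunInstruction("a"), RunInstruction("b"),
          ReturnFromKernel("k")]
assert _sketch_insn_ids_for_block_at(_sched, 0) == {"a", "b"}
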
def get_function_declaration(self, codegen_state, codegen_result,
        schedule_index):
    fdecl = super().get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from loopy.target.c import FunctionDeclarationWrapper
    assert isinstance(fdecl, FunctionDeclarationWrapper)
    fdecl = fdecl.subdecl

    from cgen.cuda import CudaGlobal, CudaLaunchBounds
    fdecl = CudaGlobal(fdecl)

    if self.target.extern_c:
        from cgen import Extern
        fdecl = Extern("C", fdecl)

    from loopy.schedule import get_insn_ids_for_block_at
    _, local_grid_size = \
            codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                    get_insn_ids_for_block_at(
                        codegen_state.kernel.linearization,
                        schedule_index),
                    codegen_state.callables_table)

    from loopy.symbolic import get_dependencies
    if not get_dependencies(local_grid_size):
        # Sizes can't have parameter dependencies if they are
        # to be used in static thread block size.
        from pytools import product
        nthreads = product(local_grid_size)

        fdecl = CudaLaunchBounds(nthreads, fdecl)

    return FunctionDeclarationWrapper(fdecl)

def get_function_declaration(self, codegen_state, codegen_result,
        schedule_index):
    fdecl = super(CUDACASTBuilder, self).get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from cgen.cuda import CudaGlobal, CudaLaunchBounds
    fdecl = CudaGlobal(fdecl)

    if self.target.extern_c:
        from cgen import Extern
        fdecl = Extern("C", fdecl)

    from loopy.schedule import get_insn_ids_for_block_at
    _, local_grid_size = \
            codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                    get_insn_ids_for_block_at(
                        codegen_state.kernel.schedule, schedule_index))

    from loopy.symbolic import get_dependencies
    if not get_dependencies(local_grid_size):
        # Sizes can't have parameter dependencies if they are
        # to be used in static thread block size.
        from pytools import product
        nthreads = product(local_grid_size)

        fdecl = CudaLaunchBounds(nthreads, fdecl)

    return fdecl

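# How the cgen wrappers used above compose, as a standalone sketch: each
# wrapper nests around the declaration, so CudaLaunchBounds ends up
# outermost, matching the order of the fdecl reassignments above. The
# function name and signature here are made up for illustration.
from cgen import FunctionDeclaration, Pointer, Value
from cgen.cuda import CudaGlobal, CudaLaunchBounds

decl = FunctionDeclaration(Value("void", "scale"),
                           [Pointer(Value("float", "x"))])
decl = CudaGlobal(decl)              # adds the __global__ qualifier
decl = CudaLaunchBounds(256, decl)   # adds a __launch_bounds__(256) specifier
print(decl)
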
def get_function_declaration(self, codegen_state, codegen_result,
        schedule_index):
    fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from loopy.target.c import FunctionDeclarationWrapper
    assert isinstance(fdecl, FunctionDeclarationWrapper)
    fdecl = fdecl.subdecl

    from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
    fdecl = CLKernel(fdecl)

    from loopy.schedule import get_insn_ids_for_block_at
    _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(
                codegen_state.kernel.schedule, schedule_index))

    from loopy.symbolic import get_dependencies
    if not get_dependencies(local_sizes):
        # sizes can't have parameter dependencies if they are
        # to be used in static WG size.
        fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

    return FunctionDeclarationWrapper(fdecl)

def get_function_declaration(self, codegen_state, codegen_result,
        schedule_index):
    fdecl = super().get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from loopy.target.c import FunctionDeclarationWrapper
    assert isinstance(fdecl, FunctionDeclarationWrapper)

    if not codegen_state.is_entrypoint:
        # auxiliary kernels need not mention OpenCL-specific qualifiers
        # in a function's signature
        return fdecl

    fdecl = fdecl.subdecl

    from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
    fdecl = CLKernel(fdecl)

    from loopy.schedule import get_insn_ids_for_block_at
    _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(
                codegen_state.kernel.linearization, schedule_index),
            codegen_state.callables_table)

    from loopy.symbolic import get_dependencies
    if not get_dependencies(local_sizes):
        # sizes can't have parameter dependencies if they are
        # to be used in static WG size.
        fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

    return FunctionDeclarationWrapper(fdecl)

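# The OpenCL analogue, as a standalone sketch: CLKernel adds the __kernel
# qualifier and CLRequiredWorkGroupSize pins the work-group size, which is
# only legal when the sizes are compile-time constants - hence the
# get_dependencies() check above. Name and sizes below are made up for
# illustration.
from cgen import FunctionDeclaration, Pointer, Value
from cgen.opencl import CLKernel, CLRequiredWorkGroupSize

decl = FunctionDeclaration(Value("void", "scale"),
                           [Pointer(Value("float", "x"))])
decl = CLKernel(decl)
decl = CLRequiredWorkGroupSize((16, 16, 1), decl)
print(decl)
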
def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
                                   VectorizeTag, IlpBaseTag)

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel. Grab inames for all insns from there.
    within_subkernel = False

    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
        from loopy.schedule import CallKernel, ReturnFromKernel
        if isinstance(sched_item, CallKernel):
            within_subkernel = True
            subkernel_index = sched_item_index
        elif isinstance(sched_item, ReturnFromKernel):
            within_subkernel = False

    if not within_subkernel:
        # Outside all subkernels - use only inames available to host.
        return frozenset(result)

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    inames_for_subkernel = (
        iname
        for insn in insn_ids_for_subkernel
        for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices and vector lane indices are not available in loop
        #   bounds, they only get defined at the innermost level of nesting.

        if (
                kernel.iname_tags_of_type(iname, ConcurrentTag)
                and not kernel.iname_tags_of_type(iname, VectorizeTag)
                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and crosses_barrier)
                and not kernel.iname_tags_of_type(iname, IlpBaseTag)
                ):
            result.add(iname)

    return frozenset(result)

def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
                                   IlpBaseTag)

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel. Grab inames for all insns from there.
    within_subkernel = False

    for sched_item_index, sched_item in enumerate(
            kernel.schedule[:sched_index+1]):
        from loopy.schedule import CallKernel, ReturnFromKernel
        if isinstance(sched_item, CallKernel):
            within_subkernel = True
            subkernel_index = sched_item_index
        elif isinstance(sched_item, ReturnFromKernel):
            within_subkernel = False

    if not within_subkernel:
        # Outside all subkernels - use only inames available to host.
        return frozenset(result)

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    inames_for_subkernel = (
        iname
        for insn in insn_ids_for_subkernel
        for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices are not available in loop bounds, they only get
        #   defined at the innermost level of nesting.

        if (
                kernel.iname_tags_of_type(iname, ConcurrentTag)
                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and crosses_barrier)
                and not kernel.iname_tags_of_type(iname, IlpBaseTag)
                ):
            result.add(iname)

    return frozenset(result)

def get_function_declaration(self, codegen_state, codegen_result,
        schedule_index):
    fdecl = super(OpenCLCASTBuilder, self).get_function_declaration(
            codegen_state, codegen_result, schedule_index)

    from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
    fdecl = CLKernel(fdecl)

    from loopy.schedule import get_insn_ids_for_block_at
    _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(
                codegen_state.kernel.schedule, schedule_index))

    from loopy.symbolic import get_dependencies
    if not get_dependencies(local_sizes):
        # sizes can't have parameter dependencies if they are
        # to be used in static WG size.
        fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)

    return fdecl

def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (find_active_inames_at,
            get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel, grab inames for all insns from there.
    subkernel_index = sched_index
    from loopy.schedule import CallKernel
    while not isinstance(kernel.schedule[subkernel_index], CallKernel):
        subkernel_index -= 1

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    inames_for_subkernel = (iname
        for insn in insn_ids_for_subkernel
        for iname in kernel.insn_inames(insn))

    for iname in inames_for_subkernel:
        tag = kernel.iname_to_tag.get(iname)

        # Parallel inames are defined within a subkernel, BUT:
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
        # - ILP indices are not available in loop bounds, they only get
        #   defined at the innermost level of nesting.

        if (isinstance(tag, ParallelTag)
                and not (isinstance(tag, LocalIndexTagBase)
                    and crosses_barrier)
                and not isinstance(tag, IlpBaseTag)):
            result.add(iname)

    return frozenset(result)

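# A toy, self-contained illustration of the filtering rule shared by the
# variants above: a concurrent iname is usable in a conditional unless it is
# a local index in a region that crosses a barrier, or an ILP/vector-lane
# index. Tags are plain strings here, standing in for loopy's tag classes.
def _sketch_usable_inames(iname_tags, crosses_barrier):
    usable = set()
    for iname, tags in iname_tags.items():
        if "concurrent" not in tags:
            continue
        if "local" in tags and crosses_barrier:
            continue
        if "ilp" in tags or "vec" in tags:
            continue
        usable.add(iname)
    return frozenset(usable)

assert _sketch_usable_inames(
        {"g0": {"concurrent"},
         "l0": {"concurrent", "local"},
         "i_ilp": {"concurrent", "ilp"}},
        crosses_barrier=True) == {"g0"}
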
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop,
            LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at,
            gather_schedule_block)

    boostable_insn_ids = _find_boostable_insn_ids(kernel)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = {ax for ax, length in enumerate(group_size)}
        local_axes = {ax for ax, length in enumerate(local_size)}

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
            GroupIndexTag)

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            group_axes_used = set()
            local_axes_used = set()

            for iname in insn.within_inames:
                ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
                gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
                altags = kernel.iname_tags_of_type(
                        iname, AutoLocalIndexTagBase, max_num=1)

                if ltags:
                    tag, = ltags
                    local_axes_used.add(tag.axis)
                elif gtags:
                    tag, = gtags
                    group_axes_used.add(tag.axis)
                elif altags:
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                if insn.id in boostable_insn_ids:
                    warn("instruction '%s' does not use all group hw axes"
                            " (available: %s used:%s). Loopy will generate code"
                            " with the instruction executed along all the"
                            " missing hw axes. This will result in an"
                            " error from 2021.x onwards, calling"
                            " loopy.add_inames_for_unused_hw_axes(...)"
                            " might help in the transition."
                            % (insn.id,
                                ",".join(str(i) for i in group_axes),
                                ",".join(str(i) for i in group_axes_used)),
                            DeprecationWarning, stacklevel=2)
                else:
                    raise LoopyError(
                            "instruction '%s' does not use all group"
                            " hw axes (available: %s used:%s)"
                            % (insn.id,
                                ",".join(str(i) for i in group_axes),
                                ",".join(str(i) for i in group_axes_used)))

            if local_axes != local_axes_used:
                if insn.id in boostable_insn_ids:
                    warn("instruction '%s' does not use all local hw axes"
                            " (available: %s used:%s). Loopy will generate code"
                            " with the instruction executed along all the"
                            " missing hw axes. This will result in an"
                            " error from 2021.x onwards, calling"
                            " loopy.add_inames_for_unused_hw_axes(...)"
                            " might help in the transition."
                            % (insn.id,
                                ",".join(str(i) for i in local_axes),
                                ",".join(str(i) for i in local_axes_used)),
                            DeprecationWarning, stacklevel=2)
                else:
                    raise LoopyError(
                            "instruction '%s' does not use all local"
                            " hw axes (available: %s used:%s)"
                            % (insn.id,
                                ",".join(str(i) for i in local_axes),
                                ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError("schedule item not understood: %s"
                    % type(sched_item).__name__)

    return past_end_i

def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i - 1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
                generate_vectorize_loop, generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                ForceSequentialTag, LoopedIlpTag, VectorizeTag,
                InOrderSequentialSequentialTag)

        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag,
                InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'"
                    % (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                    sched_item.synchronization_kind, sched_item.mem_kind,
                    sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                        codegen_state,
                        sched_item.originating_insn_id,
                        barrier_ast,
                        codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                        host_program=None,
                        device_programs=[],
                        implemented_domains={},
                        implemented_data_info=codegen_state.implemented_data_info)
            else:
                raise LoopyError("do not know how to emit code for barrier "
                        "synchronization kind '%s' in host code"
                        % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop,
            LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at,
            gather_schedule_block)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, length in enumerate(group_size))
        local_axes = set(ax for ax, length in enumerate(local_size))

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
            GroupIndexTag)

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                tag = kernel.iname_to_tag.get(iname)

                if isinstance(tag, LocalIndexTag):
                    local_axes_used.add(tag.axis)
                elif isinstance(tag, GroupIndexTag):
                    group_axes_used.add(tag.axis)
                elif isinstance(tag, AutoLocalIndexTagBase):
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in group_axes),
                            ",".join(str(i) for i in group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in local_axes),
                            ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i

def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop,
            LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at,
            gather_schedule_block)

    if sched_index is None:
        group_axes = set()
        local_axes = set()

        i = 0
        loop_end_i = past_end_i = len(kernel.schedule)
    else:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, length in enumerate(group_size))
        local_axes = set(ax for ax, length in enumerate(local_size))

        i = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        loop_end_i = past_end_i - 1

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
            GroupIndexTag)

    while i < loop_end_i:
        sched_item = kernel.schedule[i]
        if isinstance(sched_item, CallKernel):
            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)

        elif isinstance(sched_item, RunInstruction):
            insn = kernel.id_to_insn[sched_item.insn_id]
            i += 1

            if insn.boostable:
                continue

            group_axes_used = set()
            local_axes_used = set()

            for iname in kernel.insn_inames(insn):
                ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
                gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
                altags = kernel.iname_tags_of_type(
                        iname, AutoLocalIndexTagBase, max_num=1)

                if ltags:
                    tag, = ltags
                    local_axes_used.add(tag.axis)
                elif gtags:
                    tag, = gtags
                    group_axes_used.add(tag.axis)
                elif altags:
                    raise LoopyError("auto local tag encountered")

            if group_axes != group_axes_used:
                raise LoopyError("instruction '%s' does not use all group hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in group_axes),
                            ",".join(str(i) for i in group_axes_used)))
            if local_axes != local_axes_used:
                raise LoopyError("instruction '%s' does not use all local hw axes "
                        "(available: %s used:%s)"
                        % (insn.id,
                            ",".join(str(i) for i in local_axes),
                            ",".join(str(i) for i in local_axes_used)))

        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
            i += 1
            continue

        else:
            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

    return past_end_i

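# The core consistency check in the variants above, reduced to a toy: the
# set of hardware axes made available by the launch grid must match the set
# of axes each instruction actually indexes into. Names and data here are
# made up for illustration.
def _sketch_check_hw_axes(available_axes, axes_used_by_insn):
    for insn_id, used in axes_used_by_insn.items():
        if used != available_axes:
            raise ValueError(
                    "instruction '%s' does not use all hw axes "
                    "(available: %s used: %s)"
                    % (insn_id,
                        ",".join(str(ax) for ax in sorted(available_axes)),
                        ",".join(str(ax) for ax in sorted(used))))

_sketch_check_hw_axes({0, 1}, {"write_a": {0, 1}})   # passes
try:
    _sketch_check_hw_axes({0, 1}, {"write_b": {0}})  # axis 1 unused
except ValueError as e:
    print(e)
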
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i - 1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                ForceSequentialTag, LoopedIlpTag, VectorizeTag)

        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'"
                    % (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        return codegen_state.ast_builder.emit_barrier(
                sched_item.kind, sched_item.comment)

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
            LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
            schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
                iname for iname in all_inames_by_insns
                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
                and not kernel.iname_tags_of_type(iname, VectorizeTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                    codegen_state.ast_builder.emit_comment(
                        "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
                .copy_and_assign(iname, hw_axis_expr)
                .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
                new_codegen_state, schedule_index, next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)

def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
                get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
                is_generating_device_code=True,
                gen_program_name=sched_item.kernel_name,
                schedule_index_end=past_end_i - 1,
                implemented_data_info=(codegen_state.implemented_data_info
                    + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
                new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,

            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name, glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tags = kernel.iname_tags(sched_item.iname)
        tags = tuple(tag for tag in tags if tag)

        from loopy.codegen.loop import (
                generate_unroll_loop,
                generate_vectorize_loop,
                generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                ForceSequentialTag, LoopedIlpTag, VectorizeTag,
                InOrderSequentialSequentialTag, filter_iname_tags_by_type)

        if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif filter_iname_tags_by_type(tags, VectorizeTag):
            func = generate_vectorize_loop
        elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                ForceSequentialTag, InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                    "for '%s', tagged '%s'"
                    % (sched_item.iname, ", ".join(str(tag) for tag in tags)))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                    sched_item.synchronization_kind, sched_item.mem_kind,
                    sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                        codegen_state,
                        sched_item.originating_insn_id,
                        barrier_ast,
                        codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                        host_program=None,
                        device_programs=[],
                        implemented_domains={},
                        implemented_data_info=codegen_state.implemented_data_info)
            else:
                raise LoopyError("do not know how to emit code for barrier "
                        "synchronization kind '%s' in host code"
                        % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

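# The shape of the dispatch in generate_code_for_sched_index, as a toy: one
# isinstance() branch per schedule item type. The dataclasses below are
# hypothetical stand-ins for loopy.schedule's item classes, and the returned
# strings stand in for generated AST nodes.
from dataclasses import dataclass

@dataclass
class EnterLoop:
    iname: str

@dataclass
class RunInstruction:
    insn_id: str

@dataclass
class Barrier:
    comment: str

def _sketch_generate(sched_item):
    if isinstance(sched_item, EnterLoop):
        return "for (%s)" % sched_item.iname
    elif isinstance(sched_item, Barrier):
        return "barrier(); /* %s */" % sched_item.comment
    elif isinstance(sched_item, RunInstruction):
        return "run(%s);" % sched_item.insn_id
    else:
        raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

assert _sketch_generate(RunInstruction("a")) == "run(a);"
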
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
            LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
            schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [iname
                for iname in all_inames_by_insns
                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                    codegen_state.ast_builder.emit_comment(
                        "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
                .copy_and_assign(iname, hw_axis_expr)
                .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
                new_codegen_state, schedule_index, next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)

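# The recursive structure of set_up_hw_parallel_loops in miniature: peel off
# one hardware iname per call, bind it to its axis expression, and recurse
# until none remain, at which point the innermost generator (next_func) runs.
# Axis expressions are plain strings here instead of loopy's symbolic nodes,
# and slab decomposition is omitted; this is a hypothetical sketch only.
def _sketch_set_up_hw_loops(bindings, hw_inames_left, next_func):
    if not hw_inames_left:
        return next_func(bindings)

    hw_inames_left = hw_inames_left[:]   # don't mutate the caller's list
    iname, axis_expr = hw_inames_left.pop()

    new_bindings = dict(bindings)
    new_bindings[iname] = axis_expr

    return _sketch_set_up_hw_loops(new_bindings, hw_inames_left, next_func)

result = _sketch_set_up_hw_loops(
        {}, [("i_outer", "gid(0)"), ("i_inner", "lid(0)")],
        lambda b: sorted(b.items()))
assert result == [("i_inner", "lid(0)"), ("i_outer", "gid(0)")]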