def generate_unroll_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname
    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
        static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        raise LoopyError(
            "length of unrolled loop '%s' is not a constant, "
            "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
            bounds.lower_bound_pw_aff.coalesce(),
            constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    result = []

    # Emit one copy of the loop body per trip, with the iname pinned to
    # lower_bound + i.
    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
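# A minimal pure-Python sketch of the unrolling strategy above, under the
# assumption that the body emitter is just a function of the fixed index.
# (loopy itself pins the iname via codegen_state.fix and emits AST nodes;
# the emit_body callback and string bodies here are hypothetical.)
def _unroll_sketch(lower_bound, length, emit_body):
    """Emit `length` copies of the loop body, one per fixed index value."""
    return [emit_body(lower_bound + i) for i in range(length)]

# Example: _unroll_sketch(0, 4, lambda i: "a[%d] = b[%d];" % (i, i))
# yields ['a[0] = b[0];', 'a[1] = b[1];', 'a[2] = b[2];', 'a[3] = b[3];']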
def gen_code(inner_codegen_state):
    from pymbolic.primitives import Variable
    condition_exprs = [
        constraint_to_expr(cns) for cns in bounds_checks
        ] + [Variable(pred_chk) for pred_chk in pred_checks]

    prev_result = prev_gen_code(inner_codegen_state)

    return [wrap_in_if(
        inner_codegen_state,
        condition_exprs,
        merge_codegen_results(codegen_state, prev_result))]
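# This revision wraps each predicate name in pymbolic's Variable so that
# condition_exprs holds expression nodes rather than bare strings, which is
# what the downstream expression mappers operate on. A quick standalone
# illustration (pymbolic is an actual loopy dependency; the predicate names
# below are made up):
from pymbolic.primitives import LogicalAnd, Variable

preds = ["pred_0", "pred_1"]
cond = LogicalAnd(tuple(Variable(p) for p in preds))
# str(cond) renders as a conjunction of the two predicate variables.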
def unvectorize(self, func):
    vinf = self.vectorization_info

    result = []
    # Passing False (rather than None) is the sentinel that tells copy()
    # to actually clear vectorization_info on the new state.
    novec_self = self.copy(vectorization_info=False)

    for i in range(vinf.length):
        idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
        new_codegen_state = novec_self.fix(vinf.iname, idx_aff)
        generated = func(new_codegen_state)

        if isinstance(generated, list):
            result.extend(generated)
        else:
            result.append(generated)

    from loopy.codegen.result import merge_codegen_results
    return merge_codegen_results(self, result)
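# The unvectorize contract in miniature, with stand-in types: the callback is
# invoked once per vector lane with the lane index pinned, and whatever it
# returns (a single result or a list) is flattened into one result list that
# the caller then merges. A sketch of the control flow above, not a drop-in
# replacement.
def _unvectorize_sketch(length, func):
    result = []
    for lane in range(length):
        generated = func(lane)  # codegen with the vector iname fixed to lane
        if isinstance(generated, list):
            result.extend(generated)
        else:
            result.append(generated)
    return result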
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
        LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(
        kernel.schedule, schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            iname for iname in all_inames_by_insns
            if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
            and not kernel.iname_tags_of_type(iname, VectorizeTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname
            and any(_tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname)
                if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
        constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
        lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
            "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                codegen_state.ast_builder.emit_comment(
                    "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
            .copy_and_assign(iname, hw_axis_expr)
            .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(new_codegen_state, schedule_index,
            next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
        LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(
        kernel.schedule, schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            iname for iname in all_inames_by_insns
            if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
        insn_ids_for_block)

    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
            and other_iname != iname
            and any(_tag.key == tag.key
                for _tag in kernel.iname_tags(other_iname)
                if _tag))]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    result = []

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
        constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
        lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
            "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            result.append(
                codegen_state.ast_builder.emit_comment(
                    "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = (codegen_state
            .copy_and_assign(iname, hw_axis_expr)
            .copy(kernel=slabbed_kernel))

        inner = set_up_hw_parallel_loops(
            new_codegen_state, schedule_index, next_func, hw_inames_left)

        result.append(inner)

    return merge_codegen_results(codegen_state, result)
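# For orientation: GroupHardwareAxisIndex / LocalHardwareAxisIndex are
# placeholder expressions that the target's AST builder later spells out.
# A rough, illustrative mapping (not loopy's actual lowering code):
_HW_AXIS_SPELLING = {
    ("group", "opencl"): "get_group_id({axis})",
    ("local", "opencl"): "get_local_id({axis})",
    ("group", "cuda"): "blockIdx.{axis_letter}",
    ("local", "cuda"): "threadIdx.{axis_letter}",
}
# The pw_aff_to_expr(lower_bound) offset added above shifts the hardware
# index so that the iname starts at its true lower bound.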
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
            get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info
                + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (generate_unroll_loop,
            generate_vectorize_loop, generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
            ForceSequentialTag, LoopedIlpTag, VectorizeTag,
            InOrderSequentialSequentialTag)

        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag,
                InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                "for '%s', tagged '%s'" % (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                sched_item.synchronization_kind, sched_item.mem_kind,
                sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                    codegen_state,
                    sched_item.originating_insn_id,
                    barrier_ast,
                    codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                    host_program=None,
                    device_programs=[],
                    implemented_domains={},
                    implemented_data_info=codegen_state.implemented_data_info)
            else:
                raise LoopyError("do not know how to emit code for barrier "
                    "synchronization kind '%s' in host code"
                    % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
            % type(sched_item))
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)

    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True).params()

        lbound = (
            kernel.cache_manager.dim_min(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
            dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1)

        new_codegen_state = (
            codegen_state
            .intersect(impl_loop)
            .copy(kernel=intersect_kernel_with_slab(
                kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.ast_block_scope_class(
                        inner.current_ast(codegen_state))))
        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(
                            lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(
                            ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
def build_loop_nest(codegen_state, schedule_index):
    # Most of the complexity of this function goes towards finding groups of
    # instructions that can be nested inside a shared conditional.

    kernel = codegen_state.kernel

    # If the AST builder does not implement conditionals, we can save us
    # some work about hoisting conditionals and directly go into recursion.
    if not codegen_state.ast_builder.can_implement_conditionals:
        result = []
        inner = generate_code_for_sched_index(codegen_state, schedule_index)
        if inner is not None:
            result.append(inner)
        return merge_codegen_results(codegen_state, result)

    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices

    # i.e. go up to the next LeaveLoop, and skip over inner loops.

    my_sched_indices = []

    i = schedule_index
    while i < codegen_state.schedule_index_end:
        sched_item = kernel.schedule[i]

        if isinstance(sched_item, LeaveLoop):
            break

        my_sched_indices.append(i)

        if isinstance(sched_item, (EnterLoop, CallKernel)):
            _, i = gather_schedule_block(kernel.schedule, i)
            assert i <= codegen_state.schedule_index_end, \
                "schedule block extends beyond schedule_index_end"
        elif isinstance(sched_item, Barrier):
            i += 1
        elif isinstance(sched_item, RunInstruction):
            i += 1
        else:
            raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

    del i

    # }}}

    # {{{ pass 2: find admissible conditional inames for each sibling schedule item

    from pytools import ImmutableRecord

    class ScheduleIndexInfo(ImmutableRecord):
        """
        .. attribute:: schedule_indices
        .. attribute:: admissible_cond_inames
        .. attribute:: required_predicates
        .. attribute:: used_inames_within
        """

    from loopy.schedule import find_used_inames_within

    sched_index_info_entries = [
        ScheduleIndexInfo(
            schedule_indices=[i],
            admissible_cond_inames=(
                get_admissible_conditional_inames_for(codegen_state, i)),
            required_predicates=get_required_predicates(kernel, i),
            used_inames_within=find_used_inames_within(kernel, i))
        for i in my_sched_indices]

    sched_index_info_entries = group_by(
        sched_index_info_entries,
        key=lambda sii: (
            sii.admissible_cond_inames,
            sii.required_predicates,
            sii.used_inames_within),
        merge=lambda sii1, sii2: sii1.copy(
            schedule_indices=(
                sii1.schedule_indices + sii2.schedule_indices)))

    # }}}

    # {{{ pass 3: greedily group schedule items that share admissible inames

    from pytools import memoize_method

    class BoundsCheckCache:
        def __init__(self, kernel, impl_domain):
            self.kernel = kernel
            self.impl_domain = impl_domain

        @memoize_method
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                self.kernel.get_inames_domain(check_inames),
                self.impl_domain, obj_bigger_ok=True)

            from loopy.codegen.bounds import get_approximate_convex_bounds_checks
            # Each instruction individually gets its bounds checks,
            # so we can safely overapproximate here.
            return get_approximate_convex_bounds_checks(
                domain, check_inames, self.impl_domain)

    def build_insn_group(sched_index_info_entries, codegen_state,
            done_group_lengths=set()):
        """
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with
            every recursive call. It serves to prevent infinite recursion by
            preventing recursive calls from doing anything about groups that
            are too small.
        """

        from loopy.symbolic import get_dependencies

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive
        # instances to ignore the hoist group it just found by adding that
        # group length to done_group_lengths. (It'll also chop the set of
        # schedule indices considered down so that a callee cannot find a
        # *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        current_pred_set = (origin_si_entry.required_predicates
            - codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(
            kernel, codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                current_iname_set
                & sched_index_info_entries[candidate_group_length-1]
                .admissible_cond_inames)
            current_pred_set = (
                current_pred_set
                & sched_index_info_entries[candidate_group_length-1]
                .required_predicates)

            current_pred_set = frozenset(
                pred for pred in current_pred_set
                if get_dependencies(pred) & kernel.all_inames()
                <= current_iname_set)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel._remove_inames_for_shared_hw_axes(
                current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set
                    or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion
                # base case below
                found_hoists.append((candidate_group_length,
                    bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        group_length, bounds_checks, pred_checks = max(found_hoists)

        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(cns.get_space())
                .add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                implemented_predicates=(
                    new_codegen_state.implemented_predicates | pred_checks))

        if is_empty:
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            inner_codegen_state, i)
                        if inner is not None:
                            result.append(inner)
                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                        sched_index_info_entries[0:group_length],
                        inner_codegen_state,
                        done_group_lengths=(
                            done_group_lengths | set([group_length])))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.symbolic import constraint_to_cond_expr

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):  # noqa pylint:disable=function-redefined
                    condition_exprs = [
                        constraint_to_cond_expr(cns)
                        for cns in bounds_checks] + [
                        pred_chk for pred_chk in pred_checks]

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [wrap_in_if(
                        inner_codegen_state,
                        condition_exprs,
                        merge_codegen_results(codegen_state, prev_result))]

                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.

                        return gen_code(inner_codegen_state)

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        return result + build_insn_group(
            sched_index_info_entries[group_length:], codegen_state)

    # }}}

    insn_group = build_insn_group(sched_index_info_entries, codegen_state)
    return merge_codegen_results(codegen_state, insn_group)
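# The group_by helper used in pass 2 above merges *adjacent* entries with
# equal keys, preserving schedule order. A minimal reference implementation
# of that assumed contract (the real helper ships alongside this module):
def _group_by_sketch(entries, key, merge):
    result = []
    for entry in entries:
        if result and key(result[-1]) == key(entry):
            result[-1] = merge(result[-1], entry)
        else:
            result.append(entry)
    return result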
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
            get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info
                + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tag = kernel.iname_to_tag.get(sched_item.iname)

        from loopy.codegen.loop import (
            generate_unroll_loop,
            generate_vectorize_loop,
            generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
            ForceSequentialTag, LoopedIlpTag, VectorizeTag)

        if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif isinstance(tag, VectorizeTag):
            func = generate_vectorize_loop
        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                "for '%s', tagged '%s'" % (sched_item.iname, tag))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        return codegen_state.ast_builder.emit_barrier(
            sched_item.kind, sched_item.comment)

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
            % type(sched_item))
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)

    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
            obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in dom_and_slab.get_var_names(dim_type.set):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
            static_min_of_pw_aff, static_max_of_pw_aff)

        lbound = (
            kernel.cache_manager.dim_min(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set), dt, idx, 1)

        # Note: the slab must be intersected with the kernel under the loop
        # iname itself; the original code here reused the leaked loop
        # variable `iname` from the moved-inames loop above, which was a bug
        # (fixed in the later revision below).
        new_codegen_state = (
            codegen_state
            .intersect(impl_slab)
            .copy(kernel=intersect_kernel_with_slab(
                kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        from loopy.symbolic import aff_to_expr

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ]))
        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        static_lbound, static_ubound, inner_ast)))

    return merge_codegen_results(codegen_state, result)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)

    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
            obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True,
            across_dim_types=True).params()

        lbound = (
            kernel.cache_manager.dim_min(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
            dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1)

        new_codegen_state = (
            codegen_state
            .intersect(impl_loop)
            .copy(kernel=intersect_kernel_with_slab(
                kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.ast_block_scope_class(
                        inner.current_ast(codegen_state))))
        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(
                            lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(
                            ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
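# The single-trip special case above in miniature: when the implemented lower
# and upper bounds coincide, the loop would run exactly once, so a const
# initializer replaces the loop header. Illustrative strings only; loopy
# emits target AST nodes through astb instead.
def _emit_loop_or_assignment(iname, lo, hi, body):
    if lo == hi:
        return ["const int %s = %s;" % (iname, lo), body]
    return ["for (int %s = %s; %s <= %s; ++%s) {"
            % (iname, lo, iname, hi, iname),
            body, "}"]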
def build_loop_nest(codegen_state, schedule_index):
    # Most of the complexity of this function goes towards finding groups of
    # instructions that can be nested inside a shared conditional.

    kernel = codegen_state.kernel

    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices

    # i.e. go up to the next LeaveLoop, and skip over inner loops.

    my_sched_indices = []

    i = schedule_index
    while i < codegen_state.schedule_index_end:
        sched_item = kernel.schedule[i]

        if isinstance(sched_item, LeaveLoop):
            break

        my_sched_indices.append(i)

        if isinstance(sched_item, (EnterLoop, CallKernel)):
            _, i = gather_schedule_block(kernel.schedule, i)
            assert i <= codegen_state.schedule_index_end, \
                "schedule block extends beyond schedule_index_end"
        elif isinstance(sched_item, Barrier):
            i += 1
        elif isinstance(sched_item, RunInstruction):
            i += 1
        else:
            raise RuntimeError("unexpected schedule item type: %s"
                % type(sched_item))

    del i

    # }}}

    # {{{ pass 2: find admissible conditional inames for each sibling schedule item

    from pytools import Record

    class ScheduleIndexInfo(Record):
        """
        .. attribute:: schedule_indices
        .. attribute:: admissible_cond_inames
        .. attribute:: required_predicates
        .. attribute:: used_inames_within
        """

    from loopy.schedule import find_used_inames_within

    sched_index_info_entries = [
        ScheduleIndexInfo(
            schedule_indices=[i],
            admissible_cond_inames=(
                get_admissible_conditional_inames_for(codegen_state, i)),
            required_predicates=get_required_predicates(kernel, i),
            used_inames_within=find_used_inames_within(kernel, i))
        for i in my_sched_indices]

    sched_index_info_entries = group_by(
        sched_index_info_entries,
        key=lambda sii: (
            sii.admissible_cond_inames,
            sii.required_predicates,
            sii.used_inames_within),
        merge=lambda sii1, sii2: sii1.copy(
            schedule_indices=(
                sii1.schedule_indices + sii2.schedule_indices)))

    # }}}

    # {{{ pass 3: greedily group schedule items that share admissible inames

    from pytools import memoize_method

    class BoundsCheckCache:
        def __init__(self, kernel, impl_domain):
            self.kernel = kernel
            self.impl_domain = impl_domain

        @memoize_method
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                self.kernel.get_inames_domain(check_inames),
                self.impl_domain, obj_bigger_ok=True)

            from loopy.codegen.bounds import get_bounds_checks
            return get_bounds_checks(
                domain, check_inames, self.impl_domain,
                # Each instruction individually gets its bounds checks,
                # so we can safely overapproximate here.
                overapproximate=True)

    def build_insn_group(sched_index_info_entries, codegen_state,
            done_group_lengths=set()):
        """
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with
            every recursive call. It serves to prevent infinite recursion by
            preventing recursive calls from doing anything about groups that
            are too small.
        """

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive
        # instances to ignore the hoist group it just found by adding that
        # group length to done_group_lengths. (It'll also chop the set of
        # schedule indices considered down so that a callee cannot find a
        # *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        current_pred_set = (origin_si_entry.required_predicates
            - codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(
            kernel, codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                current_iname_set
                & sched_index_info_entries[candidate_group_length - 1]
                .admissible_cond_inames)
            current_pred_set = (
                current_pred_set
                & sched_index_info_entries[candidate_group_length - 1]
                .required_predicates)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set
                    or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion
                # base case below
                found_hoists.append(
                    (candidate_group_length, bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group
        group_length, bounds_checks, pred_checks = max(found_hoists)

        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(cns.get_space())
                .add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                implemented_predicates=(
                    new_codegen_state.implemented_predicates | pred_checks))

        if is_empty:
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    result = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            inner_codegen_state, i)
                        if inner is not None:
                            result.append(inner)
                    return result

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                        sched_index_info_entries[0:group_length],
                        inner_codegen_state,
                        done_group_lengths=(
                            done_group_lengths | set([group_length])))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.symbolic import constraint_to_expr

                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):  # noqa pylint:disable=function-redefined
                    condition_exprs = [
                        constraint_to_expr(cns) for cns in bounds_checks
                        ] + [pred_chk for pred_chk in pred_checks]

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [wrap_in_if(
                        inner_codegen_state,
                        condition_exprs,
                        merge_codegen_results(codegen_state, prev_result))]

                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.

                        return gen_code(inner_codegen_state)

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        return result + build_insn_group(
            sched_index_info_entries[group_length:], codegen_state)

    # }}}

    insn_group = build_insn_group(sched_index_info_entries, codegen_state)
    return merge_codegen_results(codegen_state, insn_group)
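# Why done_group_lengths terminates the recursion, in a toy trace: suppose
# build_insn_group sees 3 entries and the best hoist has length 3. It then
# recurses on those same 3 entries with done_group_lengths={3}, so the callee
# may pick at most length 2 (or the always-available length 1), and each
# further recursion strictly shrinks the set of admissible group lengths
# until the length-1 base case emits code directly.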
def generate_code_for_sched_index(codegen_state, sched_index):
    kernel = codegen_state.kernel
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, CallKernel):
        assert not codegen_state.is_generating_device_code

        from loopy.schedule import (gather_schedule_block,
            get_insn_ids_for_block_at)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        assert past_end_i <= codegen_state.schedule_index_end

        extra_args = synthesize_idis_for_extra_args(kernel, sched_index)

        new_codegen_state = codegen_state.copy(
            is_generating_device_code=True,
            gen_program_name=sched_item.kernel_name,
            schedule_index_end=past_end_i - 1,
            implemented_data_info=(codegen_state.implemented_data_info
                + extra_args))

        from loopy.codegen.result import generate_host_or_device_program
        codegen_result = generate_host_or_device_program(
            new_codegen_state, sched_index)

        glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
            get_insn_ids_for_block_at(kernel.schedule, sched_index))

        return merge_codegen_results(codegen_state, [
            codegen_result,
            codegen_state.ast_builder.get_kernel_call(
                codegen_state,
                sched_item.kernel_name,
                glob_grid, loc_grid,
                extra_args),
            ])

    elif isinstance(sched_item, EnterLoop):
        tags = kernel.iname_tags(sched_item.iname)
        tags = tuple(tag for tag in tags if tag)

        from loopy.codegen.loop import (
            generate_unroll_loop,
            generate_vectorize_loop,
            generate_sequential_loop_dim_code)
        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
            ForceSequentialTag, LoopedIlpTag, VectorizeTag,
            InOrderSequentialSequentialTag, filter_iname_tags_by_type)

        if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)):
            func = generate_unroll_loop
        elif filter_iname_tags_by_type(tags, VectorizeTag):
            func = generate_vectorize_loop
        elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                ForceSequentialTag, InOrderSequentialSequentialTag)):
            func = generate_sequential_loop_dim_code
        else:
            raise RuntimeError("encountered (invalid) EnterLoop "
                "for '%s', tagged '%s'"
                % (sched_item.iname, ", ".join(str(tag) for tag in tags)))

        return func(codegen_state, sched_index)

    elif isinstance(sched_item, Barrier):
        # {{{ emit barrier code

        from loopy.codegen.result import CodeGenerationResult

        if codegen_state.is_generating_device_code:
            barrier_ast = codegen_state.ast_builder.emit_barrier(
                sched_item.synchronization_kind, sched_item.mem_kind,
                sched_item.comment)
            if sched_item.originating_insn_id:
                return CodeGenerationResult.new(
                    codegen_state,
                    sched_item.originating_insn_id,
                    barrier_ast,
                    codegen_state.implemented_domain)
            else:
                return barrier_ast
        else:
            # host code
            if sched_item.synchronization_kind in ["global", "local"]:
                # host code is assumed globally and locally synchronous
                return CodeGenerationResult(
                    host_program=None,
                    device_programs=[],
                    implemented_domains={},
                    implemented_data_info=codegen_state.implemented_data_info)
            else:
                raise LoopyError("do not know how to emit code for barrier "
                    "synchronization kind '%s' in host code"
                    % sched_item.synchronization_kind)

        # }}}

    elif isinstance(sched_item, RunInstruction):
        insn = kernel.id_to_insn[sched_item.insn_id]

        from loopy.codegen.instruction import generate_instruction_code
        return codegen_state.try_vectorized(
            "instruction %s" % insn.id,
            lambda inner_cgs: generate_instruction_code(inner_cgs, insn))

    else:
        raise RuntimeError("unexpected schedule item type: %s"
            % type(sched_item))
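# filter_iname_tags_by_type, as used in the dispatch above, narrows a tag
# collection by class. A minimal stand-in consistent with that usage (the
# real helper lives in loopy.kernel.data and has a richer signature):
def _filter_iname_tags_by_type_sketch(tags, tag_type):
    return {tag for tag in tags if isinstance(tag, tag_type)}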