def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for an unrolled loop, one body copy per iteration.

    The iname scheduled at *sched_index* is fixed, in turn, to each of the
    values ``lower_bound + i`` for ``i`` in ``range(length)``, and the inner
    loop nest is built once for each of them.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :returns: the per-iteration results combined by
        :func:`merge_codegen_results`.
    :raises LoopyError: if the loop length is not a constant.
    """
    kernel = codegen_state.kernel
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, but keep the original
        # message so the underlying cause is not lost.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for an unrolled loop, one body copy per iteration.

    The iname scheduled at *sched_index* is fixed, in turn, to each of the
    values ``lower_bound + i`` for ``i`` in ``range(length)``, and the inner
    loop nest is built once for each of them.

    :arg kernel: the kernel whose schedule and iname bounds are consulted.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :arg codegen_state: the current code generation state.
    :returns: the per-iteration code combined by :func:`gen_code_block`.
    :raises LoopyError: if the loop length is not a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, but keep the original
        # message so the underlying cause is not lost.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index+1, new_codegen_state))

    return gen_code_block(result)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for a vectorized loop, falling back to unrolling.

    If the loop length is not a constant, or the loop's lower bound is not
    zero, a warning is issued and the loop is unrolled instead. Otherwise
    the loop's bounds are recorded as implemented in the code generation
    state and the inner loop nest is built with vectorization information
    attached.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop to
        vectorize.
    :returns: the result of building the inner loop nest, or of
        :func:`generate_unroll_loop` when falling back.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        # NOTE(review): this call passes (kernel, sched_index, codegen_state)
        # although this function itself takes (codegen_state, sched_index);
        # confirm it matches the generate_unroll_loop signature in scope.
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, but keep the original
        # message so the underlying cause is not lost.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        # NOTE(review): same argument-order question as the call above.
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for a vectorized loop, falling back to unrolling.

    If the loop length is not a constant, or the loop's lower bound is not
    zero, a warning is issued and the loop is unrolled instead. Otherwise
    the loop's bounds are recorded as implemented in the code generation
    state and the inner loop nest is built with vectorization information
    attached.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop to
        vectorize.
    :returns: the result of building the inner loop nest, or of
        :func:`generate_unroll_loop` when falling back.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, but keep the original
        # message so the underlying cause is not lost.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_host_or_device_program(codegen_state, schedule_index):
    """Build the code generation result for one host or device program.

    For device code, the schedule item at *schedule_index* must be a
    ``CallKernel`` and the hardware-parallel loops are set up around the
    loop nest that starts at the following schedule item. For host code,
    the loop nest is built starting at *schedule_index* itself.

    When generating device code or an entrypoint, the body is prefixed
    with the AST builder's top-of-body code and temporary declarations and
    then wrapped into a function definition.

    :arg codegen_state: the current code generation state.
    :arg schedule_index: index into ``kernel.linearization`` at which
        generation starts.
    :returns: the resulting code generation result.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    ast_builder = codegen_state.ast_builder
    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        codegen_result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        sched_item = codegen_state.kernel.linearization[schedule_index]
        assert isinstance(sched_item, CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops
        build_inner = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        codegen_result = set_up_hw_parallel_loops(
                codegen_state, schedule_index, next_func=build_inner)

    # NOTE(review): the nesting below was reconstructed from a flattened
    # source; confirm that the function wrapping is meant to be skipped
    # together with the top-of-body merge.
    if (not codegen_state.is_generating_device_code
            and not codegen_state.is_entrypoint):
        return codegen_result

    body_parts = (
            ast_builder.generate_top_of_body(codegen_state)
            + temp_decls
            + [codegen_result])
    codegen_result = merge_codegen_results(
            codegen_state, body_parts, collapse=False)

    cur_prog = codegen_result.current_program(codegen_state)
    body_ast = cur_prog.ast

    fdecl_ast = ast_builder.get_function_declaration(
            codegen_state, codegen_result, schedule_index)
    fdef_ast = ast_builder.get_function_definition(
            codegen_state, codegen_result,
            schedule_index, fdecl_ast, body_ast)

    wrapped_prog = cur_prog.copy(
            ast=ast_builder.process_ast(fdef_ast),
            body_ast=ast_builder.process_ast(body_ast))

    return codegen_result.with_new_program(codegen_state, wrapped_prog)
def generate_host_or_device_program(codegen_state, schedule_index):
    """Build the code generation result for one host or device program.

    For device code, the schedule item at *schedule_index* must be a
    ``CallKernel`` and the hardware-parallel loops are set up around the
    loop nest that starts at the following schedule item. For host code,
    the loop nest is built starting at *schedule_index* itself.

    The body is then prefixed with the AST builder's top-of-body code and
    temporary declarations and wrapped into a function definition.

    :arg codegen_state: the current code generation state.
    :arg schedule_index: index into ``kernel.schedule`` at which
        generation starts.
    :returns: the resulting code generation result.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    ast_builder = codegen_state.ast_builder
    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        codegen_result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        sched_item = codegen_state.kernel.schedule[schedule_index]
        assert isinstance(sched_item, CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops
        build_inner = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        codegen_result = set_up_hw_parallel_loops(
                codegen_state, schedule_index, next_func=build_inner)

    body_parts = (
            ast_builder.generate_top_of_body(codegen_state)
            + temp_decls
            + [codegen_result])
    codegen_result = merge_codegen_results(
            codegen_state, body_parts, collapse=False)

    cur_prog = codegen_result.current_program(codegen_state)
    body_ast = cur_prog.ast

    fdecl_ast = ast_builder.get_function_declaration(
            codegen_state, codegen_result, schedule_index)
    fdef_ast = ast_builder.get_function_definition(
            codegen_state, codegen_result,
            schedule_index, fdecl_ast, body_ast)

    wrapped_prog = cur_prog.copy(ast=fdef_ast, body_ast=body_ast)

    return codegen_result.with_new_program(codegen_state, wrapped_prog)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab of the loop's slab decomposition, the loop bounds are
    computed, the inner loop nest is built with those bounds recorded as
    implemented, and either a single initializer (when lower and upper
    bound coincide) or a sequential loop is emitted around it.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the per-slab results combined by
        :func:`merge_codegen_results`.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import make_loop_bounds_from_pwaffs
    from loopy.symbolic import (
            pw_aff_to_expr, pw_aff_to_pw_aff_implemented_by_expr)

    kernel = codegen_state.kernel
    astb = codegen_state.ast_builder
    ecm = codegen_state.expression_to_code_mapper

    loop_iname = kernel.schedule[sched_index].iname
    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    # A lone slab needs no identifying comment.
    single_slab = len(slabs) == 1

    pieces = []

    for slab_name, slab in slabs:
        if single_slab:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, loop_iname)

        # {{{ find bounds

        dom_and_slab = (
                isl.align_spaces(domain, slab, obj_bigger_ok=True) & slab)

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname not in usable_inames:
                continue

            moved_inames.append(das_iname)
            dt, idx = dom_and_slab.get_var_dict()[das_iname]
            dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
                codegen_state.implemented_domain, dom_and_slab,
                obj_bigger_ok=True).params()

        lbound = kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
        lbound = lbound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        ubound = kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
        ubound = ubound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        # }}}

        # {{{ find implemented loop, build inner code

        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        # move the previously moved inames back to the 'set' dim_type
        for moved_iname in moved_inames:
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, loop_iname)
        new_codegen_state = (
                codegen_state.intersect(impl_loop).copy(kernel=slabbed_kernel))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            pieces.append(astb.emit_comment(cmt))

        if not impl_ubound.is_equal(impl_lbound):
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff
            loop_ast = astb.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    pw_aff_to_expr(
                        simplify_pw_aff(lbound, kernel.assumptions)),
                    pw_aff_to_expr(
                        simplify_pw_aff(ubound, kernel.assumptions)),
                    inner_ast)

            pieces.append(inner.with_new_ast(codegen_state, loop_ast))
            continue

        # single-trip, generate just a variable assignment, not a loop
        iname_init = astb.emit_initializer(
                codegen_state, kernel.index_dtype, loop_iname,
                ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                is_const=True)
        inner = merge_codegen_results(
                codegen_state,
                [iname_init, astb.emit_blank_line(), inner])

        scoped_ast = astb.ast_block_scope_class(
                inner.current_ast(codegen_state))
        pieces.append(inner.with_new_ast(codegen_state, scoped_ast))

    return merge_codegen_results(codegen_state, pieces)
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab of the loop's slab decomposition, static lower and
    upper loop bounds are computed, the inner loop nest is built with
    those bounds recorded as implemented, and either a single initializer
    (when both bounds coincide) or a sequential loop is emitted around it.

    :arg kernel: the kernel whose schedule, domains and assumptions are
        consulted.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :arg codegen_state: the current code generation state.
    :returns: the per-slab code combined by :func:`gen_code_block`.
    """
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in dom_and_slab.get_var_names(dim_type.set):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
                kernel.cache_manager.dim_max(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        # move the previously moved inames back to the 'set' dim_type
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab restricts loop_iname, so that is the iname the kernel
        # must be intersected on (not a leftover helper-loop variable).
        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, loop_iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    static_lbound, static_ubound, inner))

    return gen_code_block(result)
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Recursively bind hardware-parallel inames to hardware axis indices.

    One iname is taken off *hw_inames_left* per call, its bounds are
    recorded as implemented in the code generation state, and the function
    recurses for every slab of that iname's slab decomposition. Once no
    hardware-parallel inames remain, the loop nest is built.

    :arg kernel: the kernel being generated.
    :arg sched_index: schedule index passed through to
        :func:`build_loop_nest`.
    :arg codegen_state: the current code generation state.
    :arg hw_inames_left: list of hardware-parallel inames still to be
        processed, or *None* to start with all inames of *kernel* that
        carry a ``HardwareParallelTag``. The passed list is not modified.
    :returns: the per-slab code combined by ``gen_code_block``.
    :raises RuntimeError: if the iname's tag is neither a group nor a
        local index tag, or if slab decomposition is requested for an
        iname that shares its tag with other inames.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [
                candidate for candidate in kernel.all_inames()
                if isinstance(
                    kernel.iname_to_tag.get(candidate), HardwareParallelTag)]

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Work on a copy so that the caller's list stays untouched.
    remaining_inames = hw_inames_left[:]
    iname = remaining_inames.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = []
    for other_iname in kernel.all_inames():
        other_tag = kernel.iname_to_tag.get(other_iname)
        if (isinstance(other_tag, UniqueTag)
                and other_tag.key == tag.key
                and other_iname != iname):
            other_inames_with_same_tag.append(other_iname)

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(
            bounds.lower_bound_pw_aff, constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(
            domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    from loopy.codegen import add_comment

    # A lone slab needs no identifying comment.
    single_slab = len(slabs) == 1

    result = []

    for slab_name, slab in slabs:
        if single_slab:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, iname)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, remaining_inames)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab of the loop's slab decomposition, static lower and
    upper loop bounds are computed, the inner loop nest is built with
    those bounds recorded as implemented, and either a single initializer
    (when both bounds coincide) or a sequential loop is emitted around it.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the per-slab results combined by
        :func:`merge_codegen_results`.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in dom_and_slab.get_var_names(dim_type.set):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
                kernel.cache_manager.dim_max(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        # move the previously moved inames back to the 'set' dim_type
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        # The slab restricts loop_iname, so that is the iname the kernel
        # must be intersected on (not a leftover helper-loop variable).
        new_codegen_state = (
                codegen_state
                .intersect(impl_slab)
                .copy(kernel=intersect_kernel_with_slab(
                    kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        from loopy.symbolic import aff_to_expr

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ]))

        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        static_lbound, static_ubound, inner_ast)))

    return merge_codegen_results(codegen_state, result)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab of the loop's slab decomposition, the loop bounds are
    computed, the inner loop nest is built with those bounds recorded as
    implemented, and either a single initializer (when lower and upper
    bound coincide) or a sequential loop is emitted around it.

    :arg codegen_state: the current code generation state; its ``kernel``
        attribute supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the per-slab results combined by
        :func:`merge_codegen_results`.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import make_loop_bounds_from_pwaffs
    from loopy.symbolic import (
            pw_aff_to_expr, pw_aff_to_pw_aff_implemented_by_expr)

    kernel = codegen_state.kernel
    astb = codegen_state.ast_builder
    ecm = codegen_state.expression_to_code_mapper

    loop_iname = kernel.schedule[sched_index].iname
    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    # A lone slab needs no identifying comment.
    single_slab = len(slabs) == 1

    pieces = []

    for slab_name, slab in slabs:
        if single_slab:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, loop_iname)

        # {{{ find bounds

        dom_and_slab = isl.align_spaces(
                domain, slab, across_dim_types=True,
                obj_bigger_ok=True) & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname not in usable_inames:
                continue

            moved_inames.append(das_iname)
            dt, idx = dom_and_slab.get_var_dict()[das_iname]
            dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
                codegen_state.implemented_domain, dom_and_slab,
                obj_bigger_ok=True, across_dim_types=True).params()

        lbound = kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
        lbound = lbound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        ubound = kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
        ubound = ubound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        # }}}

        # {{{ find implemented loop, build inner code

        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        # move the previously moved inames back to the 'set' dim_type
        for moved_iname in moved_inames:
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, loop_iname)
        new_codegen_state = (
                codegen_state.intersect(impl_loop).copy(kernel=slabbed_kernel))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            pieces.append(astb.emit_comment(cmt))

        if not impl_ubound.is_equal(impl_lbound):
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff
            loop_ast = astb.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    pw_aff_to_expr(
                        simplify_pw_aff(lbound, kernel.assumptions)),
                    pw_aff_to_expr(
                        simplify_pw_aff(ubound, kernel.assumptions)),
                    inner_ast)

            pieces.append(inner.with_new_ast(codegen_state, loop_ast))
            continue

        # single-trip, generate just a variable assignment, not a loop
        iname_init = astb.emit_initializer(
                codegen_state, kernel.index_dtype, loop_iname,
                ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                is_const=True)
        inner = merge_codegen_results(
                codegen_state,
                [iname_init, astb.emit_blank_line(), inner])

        scoped_ast = astb.ast_block_scope_class(
                inner.current_ast(codegen_state))
        pieces.append(inner.with_new_ast(codegen_state, scoped_ast))

    return merge_codegen_results(codegen_state, pieces)