def _add_kernel_axis(kernel, axis_name, start, stop, base_inames):
    """Return a copy of *kernel* whose domain gained the axis *axis_name*.

    The domain to modify is the one :class:`DomainChanger` selects for
    *base_inames*. The new axis is appended as the last set dimension and
    constrained by ``make_slab(..., axis_name, start, stop)``.

    :arg start: lower-bound expression passed to ``make_slab``.
    :arg stop: upper-bound expression passed to ``make_slab``.

    Every variable that *start* or *stop* depends on must already be a
    parameter of *kernel*; those missing from the chosen domain are added
    to it as parameter dimensions.
    """
    from loopy.kernel.tools import DomainChanger
    changer = DomainChanger(kernel, base_inames)

    dom = changer.domain

    # Append the new axis behind all existing set dimensions.
    axis_idx = dom.dim(dim_type.set)
    dom = dom.insert_dims(dim_type.set, axis_idx, 1)
    dom = dom.set_dim_name(dim_type.set, axis_idx, axis_name)

    from loopy.symbolic import get_dependencies
    bound_deps = get_dependencies(start) | get_dependencies(stop)
    assert bound_deps <= kernel.all_params()

    # Make sure every parameter the bounds refer to exists in this domain.
    known_params = dom.get_var_names(dim_type.param)
    for dep in bound_deps:
        if dep in known_params:
            continue
        param_idx = dom.dim(dim_type.param)
        dom = dom.insert_dims(dim_type.param, param_idx, 1)
        dom = dom.set_dim_name(dim_type.param, param_idx, dep)

    from loopy.isl_helpers import make_slab
    dom = dom & make_slab(dom.get_space(), axis_name, start, stop)

    return kernel.copy(domains=changer.get_domains_with(dom))
def process_set(s):
    """Express ``split_iname`` in *s* through ``outer_iname``/``inner_iname``.

    Sets that do not mention ``split_iname`` are returned unchanged.
    Otherwise two dimensions of the same dim type are appended, named
    ``outer_iname`` and ``inner_iname``, and tied to the original by
    ``split_iname == inner_iname + inner_length*outer_iname``. The original
    dimension is projected out only when ``within`` is None.

    ``split_iname``, ``outer_iname``, ``inner_iname``, ``inner_length`` and
    ``within`` come from the enclosing scope.
    """
    names_to_dims = s.get_var_dict()
    if split_iname not in names_to_dims:
        return s

    split_dim_type, _ = names_to_dims[split_iname]

    # The two new dimensions go right behind the existing ones.
    first_new_idx = s.dim(split_dim_type)
    s = s.add_dims(split_dim_type, 2)
    for offset, new_name in enumerate((outer_iname, inner_iname)):
        s = s.set_dim_name(split_dim_type, first_new_idx + offset, new_name)

    from loopy.isl_helpers import make_slab

    space = s.get_space()
    inner_slab = make_slab(space, inner_iname, 0, inner_length)
    # name = inner + length*outer
    tie_constraint = isl.Constraint.eq_from_names(
        space,
        {
            split_iname: 1,
            inner_iname: -1,
            outer_iname: -inner_length,
        })
    inner_constraint_set = inner_slab.add_constraint(tie_constraint)

    # Look the old dimension up before intersecting; it is needed for the
    # optional projection below.
    old_dim_type, old_dim_idx = space.get_var_dict()[split_iname]
    s = s.intersect(inner_constraint_set)

    if within is not None:
        return s
    return s.project_out(old_dim_type, old_dim_idx, 1)
def cumsum(self, arg):
    """
    Registers a substitution rule in order to cumulatively sum the elements
    of the one-dimensional array ``arg``. Mimics :func:`numpy.cumsum`:
    entry ``i`` of the result is ``arg[0] + ... + arg[i]`` (inclusive).

    :arg arg: The one-dimensional array symbol to be summed; only its
        ``shape``, ``dtype`` and ``name`` are read.

    :return: An instance of :class:`numloopy.ArraySymbol` which is
        registered as the cumulative summed-substitution rule.
    """
    # Note: this can remain as a substitution but loopy does not have
    # support for translating inames for substitutions to the kernel
    # domains
    assert len(arg.shape) == 1

    i_iname = self.name_generator(based_on="i")
    j_iname = self.name_generator(based_on="i")

    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
                                        [i_iname, j_iname])
    domain = isl.BasicSet.universe(space)
    arg_name = self.name_generator(based_on="arr")
    subst_name = self.name_generator(based_on="subst")

    # i runs over the whole array ...
    domain = domain & make_slab(space, i_iname, 0, arg.shape[0])
    # ... and j over 0 <= j <= i.
    domain = domain.add_constraint(
        isl.Constraint.ineq_from_names(space, {j_iname: 1}))
    # i - j >= 0. The previous constraint carried an extra constant of -1
    # (i - j - 1 >= 0), which excluded arg[i] from the i-th partial sum and
    # therefore did not match numpy.cumsum.
    domain = domain.add_constraint(
        isl.Constraint.ineq_from_names(space, {j_iname: -1, i_iname: 1}))

    cumsummed_arg = ArraySymbol(stack=self,
                                name=arg_name,
                                shape=arg.shape,
                                dtype=arg.dtype)
    cumsummed_subst = ArraySymbol(stack=self,
                                  name=subst_name,
                                  shape=arg.shape,
                                  dtype=arg.dtype)

    # The substitution simply reads the array that the implicit assignment
    # below fills.
    subst_iname = self.name_generator(based_on="i")
    rule = lp.SubstitutionRule(
        subst_name, (subst_iname, ),
        Subscript(Variable(arg_name), (Variable(subst_iname), )))

    from loopy.library.reduction import SumReductionOperation

    insn = lp.Assignment(
        assignee=Subscript(Variable(arg_name), (Variable(i_iname), )),
        expression=lp.Reduction(
            SumReductionOperation(), (j_iname, ),
            parse('{}({})'.format(arg.name, j_iname))))

    self.data.append(cumsummed_arg)
    self.substs_to_arrays[subst_name] = arg_name
    self.register_implicit_assignment(insn)
    self.domains.append(domain)
    self.register_substitution(rule)

    return cumsummed_subst
def sum(self, arg, axis=None):
    """
    Registers a substitution rule in order to sum the elements of array
    ``arg`` along ``axis``.

    :arg arg: The array symbol to be summed; only its ``shape``, ``dtype``
        and ``name`` are read.
    :arg axis: An :class:`int` or an iterable of :class:`int` naming the
        axes to reduce over. Negative values count from the last axis. If
        *None* (or empty), all axes are reduced.

    :return: An instance of :class:`numloopy.ArraySymbol` which is
        registered as the sum-substitution rule. Its shape consists of the
        axes that were not reduced, or ``(1, )`` if none remain.

    :raises ValueError: if an entry of ``axis`` is out of range for ``arg``.
    """
    ndim = len(arg.shape)

    if isinstance(axis, int):
        axis = (axis, )
    if not axis:
        axis = tuple(range(ndim))

    # Validate and normalize before any state of the stack is touched, so
    # that a bad axis leaves the stack unchanged. Without normalization a
    # negative axis never matched an iname index and was silently ignored.
    normalized_axes = []
    for ax in axis:
        if not -ndim <= ax < ndim:
            raise ValueError(
                "axis %s is out of bounds for array of dimension %d"
                % (ax, ndim))
        normalized_axes.append(ax + ndim if ax < 0 else ax)
    axis = frozenset(normalized_axes)

    inames = [self.name_generator(based_on="i") for _ in arg.shape]

    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, inames)
    domain = isl.BasicSet.universe(space)
    for axis_len, iname in zip(arg.shape, inames):
        domain &= make_slab(space, iname, 0, axis_len)

    self.domains.append(domain)

    reduction_inames = tuple(
        iname for i, iname in enumerate(inames) if i in axis)
    left_inames = tuple(
        iname for i, iname in enumerate(inames) if i not in axis)

    # A full reduction still yields an array symbol of shape (1, ).
    remaining_shape = tuple(
        axis_len for i, axis_len in enumerate(arg.shape) if i not in axis)
    if not remaining_shape:
        remaining_shape = (1, )

    subst_name = self.name_generator(based_on="subst")

    summed_arg = ArraySymbol(stack=self,
                             name=subst_name,
                             shape=remaining_shape,
                             dtype=arg.dtype)

    from loopy.library.reduction import SumReductionOperation

    rule = lp.SubstitutionRule(
        subst_name, left_inames,
        lp.Reduction(SumReductionOperation(), reduction_inames,
                     parse('{}({})'.format(arg.name, ', '.join(inames)))))
    self.register_substitution(rule)

    return summed_arg
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the vectorized loop scheduled at *sched_index*.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    if the loop length is not a constant or its lower bound is not zero.
    Otherwise the loop bounds are intersected into *codegen_state*, a
    ``VectorizationInfo`` is attached, and code generation continues with
    the next schedule item.

    :returns: the result of ``generate_unroll_loop`` or ``build_loop_nest``.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message used to be emitted with a literal '%s' because the
        # iname was never interpolated.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Keep the exception type but add context; the underlying message
        # is appended so it is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the vectorized loop scheduled at *sched_index*.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    if the loop length is not a constant or its lower bound is not zero.
    Otherwise the loop bounds are intersected into *codegen_state*, a
    ``VectorizationInfo`` is attached, and code generation continues with
    the next schedule item.

    :returns: the result of ``generate_unroll_loop`` or ``build_loop_nest``.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The message used to be emitted with a literal '%s' because the
        # iname was never interpolated.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Keep the exception type but add context; the underlying message
        # is appended so it is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def process_set(s):
    """Express ``split_iname`` in *s* through ``outer_iname``/``inner_iname``.

    Sets that do not mention ``split_iname`` are returned unchanged.
    Otherwise two dimensions of the same dim type are appended and named
    ``outer_iname`` and ``inner_iname``. Whichever of them has the fixed
    length (selected by ``fixed_length_is_inner``) is bounded by
    ``fixed_length`` and tied to the original through
    ``split_iname == fixed + fixed_length*variable``. The original
    dimension is projected out only when ``within`` is None.

    All names other than *s* come from the enclosing scope.
    """
    names_to_dims = s.get_var_dict()
    if split_iname not in names_to_dims:
        return s

    split_dim_type, _ = names_to_dims[split_iname]

    # The two new dimensions go right behind the existing ones.
    first_new_idx = s.dim(split_dim_type)
    s = s.add_dims(split_dim_type, 2)
    for offset, new_name in enumerate((outer_iname, inner_iname)):
        s = s.set_dim_name(split_dim_type, first_new_idx + offset, new_name)

    from loopy.isl_helpers import make_slab

    fixed_iname = inner_iname if fixed_length_is_inner else outer_iname
    var_length_iname = outer_iname if fixed_length_is_inner else inner_iname

    space = s.get_space()
    fixed_slab = make_slab(space, fixed_iname, 0, fixed_length)
    # name = fixed_iname + fixed_length*var_length_iname
    tie_constraint = isl.Constraint.eq_from_names(
        space,
        {
            split_iname: 1,
            fixed_iname: -1,
            var_length_iname: -fixed_length,
        })
    fixed_constraint_set = fixed_slab.add_constraint(tie_constraint)

    # Look the old dimension up before intersecting; it is needed for the
    # optional projection below.
    old_dim_type, old_dim_idx = space.get_var_dict()[split_iname]
    s = s.intersect(fixed_constraint_set)

    if within is not None:
        return s
    return s.project_out(old_dim_type, old_dim_idx, 1)
def add_diff_inames(self):
    """Create one iname per entry of ``self.additional_shape``.

    A new domain bounding each iname by ``0`` and the matching shape entry
    (via ``make_slab``) is appended to ``self.new_domains``.

    :returns: the tuple of newly created iname names.
    """
    diff_inames = tuple(
            self.rule_mapping_context.make_unique_var_name(
                self.diff_iname_prefix + str(i))
            for i in range(len(self.additional_shape)))

    # Every variable a shape entry depends on becomes a domain parameter.
    diff_parameters = set()
    from loopy.symbolic import get_dependencies
    for s in self.additional_shape:
        diff_parameters.update(get_dependencies(s))

    # Sort the parameters: joining the set directly made the parameter
    # order of the domain vary from run to run.
    diff_domain = isl.BasicSet(
            "[%s] -> {[%s]}"
            % (", ".join(sorted(diff_parameters)), ", ".join(diff_inames)))

    for diff_iname, axis_length in zip(diff_inames, self.additional_shape):
        diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, axis_length)

    self.new_domains.append(diff_domain)

    return diff_inames
def add_diff_inames(self):
    """Create one iname per entry of ``self.additional_shape``.

    A new domain bounding each iname by ``0`` and the matching shape entry
    (via ``make_slab``) is appended to ``self.new_domains``.

    :returns: the tuple of newly created iname names.
    """
    diff_inames = tuple(
            self.rule_mapping_context.make_unique_var_name(
                self.diff_iname_prefix+str(i))
            for i in range(len(self.additional_shape)))

    # Every variable a shape entry depends on becomes a domain parameter.
    diff_parameters = set()
    from loopy.symbolic import get_dependencies
    for s in self.additional_shape:
        diff_parameters.update(get_dependencies(s))

    # Sort the parameters: joining the set directly made the parameter
    # order of the domain vary from run to run.
    diff_domain = isl.BasicSet(
            "[%s] -> {[%s]}"
            % (", ".join(sorted(diff_parameters)), ", ".join(diff_inames)))

    for diff_iname, axis_length in zip(diff_inames, self.additional_shape):
        diff_domain = diff_domain & make_slab(
                diff_domain.space, diff_iname, 0, axis_length)

    self.new_domains.append(diff_domain)

    return diff_inames
def map_subscript(self, expr):
    """Check that the array access *expr* stays within the array's shape.

    The shape is looked up among the kernel's arguments and temporary
    variables; accesses to anything else are not checked. The check is
    also skipped when the subscript or the shape depends on variables not
    present in ``self.domain``, or when the access range cannot be
    determined.

    :raises LoopyError: if the number of indices does not match the shape,
        or if the access range could not be shown to lie within the shape.
    """
    WalkMapper.map_subscript(self, expr)

    from pymbolic.primitives import Variable
    assert isinstance(expr.aggregate, Variable)

    var_name = expr.aggregate.name
    if var_name in self.kernel.arg_dict:
        shape = self.kernel.arg_dict[var_name].shape
    elif var_name in self.kernel.temporary_variables:
        shape = self.kernel.temporary_variables[var_name].shape
    else:
        shape = None

    if shape is None:
        return

    subscript = expr.index
    if not isinstance(subscript, tuple):
        subscript = (subscript,)

    from loopy.symbolic import (get_dependencies, get_access_range,
            UnableToDetermineAccessRange)

    available_vars = set(self.domain.get_var_dict())

    shape_deps = set()
    for axis_len in shape:
        if axis_len is None:
            continue
        shape_deps.update(get_dependencies(axis_len))

    # Only accesses fully expressed in variables of this domain are checked.
    if not get_dependencies(subscript) <= available_vars:
        return
    if not shape_deps <= available_vars:
        return

    if len(subscript) != len(shape):
        raise LoopyError("subscript to '%s' in '%s' has the wrong "
                "number of indices (got: %d, expected: %d)" % (
                    expr.aggregate.name, expr,
                    len(subscript), len(shape)))

    try:
        access_range = get_access_range(self.domain, subscript,
                self.kernel.assumptions)
    except UnableToDetermineAccessRange:
        # Likely: index was non-affine, nothing we can do.
        return

    # Build the set of in-bounds indices, one slab per axis of known length.
    shape_domain = isl.BasicSet.universe(access_range.get_space())
    for idim, axis_len in enumerate(shape):
        if axis_len is None:
            continue

        from loopy.isl_helpers import make_slab
        shape_domain = shape_domain.intersect(
                make_slab(shape_domain.get_space(),
                    (dim_type.in_, idim), 0, axis_len))

    if access_range.is_subset(shape_domain):
        return

    raise LoopyError("'%s' in instruction '%s' "
            "accesses out-of-bounds array element (could not"
            " establish '%s' is a subset of '%s')."
            % (expr, self.insn_id, access_range, shape_domain))
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    """Assign hardware axis indices to the hardware-parallel inames.

    One iname is handled per call: it is replaced by the matching
    group/local hardware axis index (offset by the iname's static lower
    bound), and the function recurses for the remaining inames once per
    slab of the iname's slab decomposition. When no inames are left,
    ``next_func(codegen_state)`` is returned.

    :arg hw_inames_left: inames still to be processed. If *None*, the
        inames used by the instructions of the block at *schedule_index*
        that carry a ``HardwareConcurrentTag`` but no ``VectorizeTag``.
    :raises RuntimeError: for an unexpected tag type, or when a slab
        decomposition is requested for an iname sharing its tag.
    """
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
            LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    block_insn_ids = get_insn_ids_for_block_at(
            kernel.schedule, schedule_index)

    if hw_inames_left is None:
        used_inames = set()
        for insn_id in block_insn_ids:
            used_inames |= kernel.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in used_inames:
            if not kernel.iname_tags_of_type(candidate, HardwareConcurrentTag):
                continue
            if kernel.iname_tags_of_type(candidate, VectorizeTag):
                continue
            hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            block_insn_ids)

    # Copy before popping so the caller's list stays intact.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def _shares_tag(other_iname):
        return (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname) if _tag))

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if _shares_tag(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)
    has_several_slabs = len(slabs) > 1

    if other_inames_with_same_tag and has_several_slabs:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if has_several_slabs:
            comment = codegen_state.ast_builder.emit_comment(
                    "%s slab for '%s'" % (slab_name, iname))
            result.append(comment)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
        new_codegen_state = assigned_state.copy(kernel=slabbed_kernel)

        result.append(
                set_up_hw_parallel_loops(
                    new_codegen_state, schedule_index, next_func,
                    hw_inames_left))

    return merge_codegen_results(codegen_state, result)
def end_computation_stack(self, evaluate=(), transform=False):
    """
    Returns an instance :class:`loopy.LoopKernel` corresponding to the
    computations pushed in the computation stack.

    :arg evaluate: An instance of :class:`tuple` of the array symbols that
        must be computed. Symbols whose name is already a key of
        ``self.substs_to_arrays`` are not evaluated again.
    :arg transform: If *True*, the transformation data is returned
        alongside the kernel.

    :return: An instance of :class:`loopy.LoopKernel` for the computations
        registered on the stack. If ``transform=True`` the transformation
        data is also returned through the tuple ``knl, tf_data``. The
        transformation data ``tf_data`` is a mapping from names of the
        variables which are to be evaluated to the tuple of inames which
        are involved in their respective assignments.

    .. note::

        The entries of ``self.implicit_assignments`` that are consumed are
        removed from it.
    """
    statements = []
    tf_data = {}
    domains = self.domains[:]
    data = self.data[:]
    substitutions = {}

    # Map each requested substitution name to its array symbol. Looking the
    # symbol up through its position in a *filtered* name list (as was done
    # before) returned the wrong entry of ``evaluate`` whenever an earlier
    # entry had been filtered out.
    symbols_needed = {}
    for array_sym in evaluate:
        if array_sym.name not in self.substs_to_arrays:
            symbols_needed.setdefault(array_sym.name, array_sym)

    substs_to_arrays = self.substs_to_arrays.copy()

    # Key of the implicit assignments registered after the last
    # substitution. Tracked explicitly so that it is also defined when no
    # substitution is registered (the loop variable would be unbound then).
    n_rules_seen = 0

    for i, rule in enumerate(self.registered_substitutions):
        substs_to_arg_mapper = SubstToArrayExapander(
            substs_to_arrays.copy())

        # Implicit assignments keyed by ``i`` are emitted before rule ``i``.
        statements.extend([
            insn.with_transformed_expressions(substs_to_arg_mapper)
            for insn in self.implicit_assignments.pop(i, [])
        ])

        if rule.name in symbols_needed:
            rule = rule.copy(
                expression=substs_to_arg_mapper(rule.expression))
            arg_name = self.name_generator(based_on="arr")
            arg = symbols_needed[rule.name]
            data.append(arg.copy(name=arg_name))
            substs_to_arrays[arg.name] = arg_name

            if arg.shape != (1, ) and arg.shape != (1):
                inames = tuple(
                    self.name_generator(based_on='i') for _ in arg.shape)
                space = isl.Space.create_from_names(
                    isl.DEFAULT_CONTEXT, inames)
                domain = isl.BasicSet.universe(space)
                for iname_name, axis_length in zip(inames, arg.shape):
                    domain &= make_slab(space, iname_name, 0, axis_length)

                assignee = substs_to_arg_mapper(
                    parse('{}[{}]'.format(arg_name, ', '.join(inames))))
                stmnt = lp.Assignment(
                    assignee=assignee,
                    expression=parse('{}({})'.format(
                        arg.name, ', '.join(inames))))
                domains.append(domain)
                tf_data[arg.name] = inames
            else:
                assignee = parse('{}[0]'.format(arg_name))
                stmnt = lp.Assignment(
                    assignee=assignee,
                    expression=parse('{}()'.format(arg.name)))
                tf_data[arg.name] = ()

            statements.append(
                stmnt.with_transformed_expressions(substs_to_arg_mapper))

        substitutions[rule.name] = rule.copy(
            expression=substs_to_arg_mapper(rule.expression))
        n_rules_seen = i + 1

    # Implicit assignments registered after the last substitution.
    substs_to_arg_mapper = SubstToArrayExapander(substs_to_arrays.copy())
    statements.extend([
        insn.with_transformed_expressions(substs_to_arg_mapper)
        for insn in self.implicit_assignments.pop(n_rules_seen, [])
    ])

    knl = lp.make_kernel(domains=domains,
                         instructions=statements,
                         kernel_data=data,
                         seq_dependencies=True,
                         lang_version=(2018, 2))
    knl = knl.copy(substitutions=substitutions)

    if transform:
        return knl, tf_data
    else:
        return knl
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Assign hardware axis indices to the hardware-parallel inames.

    One iname is handled per call: it is replaced by the matching
    group/local hardware axis index (offset by the iname's static lower
    bound), and the function recurses for the remaining inames once per
    slab of the iname's slab decomposition. When no inames are left, the
    result of ``build_loop_nest`` is returned.

    :arg hw_inames_left: inames still to be processed. If *None*, all
        inames of *kernel* tagged with a ``HardwareParallelTag``.
    :raises RuntimeError: for an unexpected tag type, or when a slab
        decomposition is requested for an iname sharing its tag.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = []
        for candidate in kernel.all_inames():
            if isinstance(
                    kernel.iname_to_tag.get(candidate), HardwareParallelTag):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Copy before popping so the caller's list stays intact.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def _shares_tag(other_iname):
        return (isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
                and kernel.iname_to_tag.get(other_iname).key == tag.key
                and other_iname != iname)

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if _shares_tag(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    from loopy.codegen import add_comment

    # A lone slab gets no comment.
    only_one_slab = len(slabs) == 1

    for slab_name, slab in slabs:
        if only_one_slab:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, iname)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, hw_inames_left)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)
def pack_and_unpack_args_for_call_for_single_kernel(kernel,
        callables_table, call_name, args_to_pack=None,
        args_to_unpack=None):
    """
    Returns a copy of *kernel* with instructions appended to copy the
    arguments in *args* to match the alignment expected by the *call_name*
    in the kernel. The arguments are copied back to *args* with the
    appropriate data layout.

    :arg call_name: An instance of :class:`str` denoting the function call
        in the *kernel*.
    :arg args_to_pack: A list of the arguments as instances of :class:`str`
        which must be packed. If set *None*, it is interpreted that all the
        array arguments would be packed.
    :arg args_to_unpack: A list of the arguments as instances of
        :class:`str` which must be unpacked. If set *None*, it is
        interpreted that all the array arguments should be unpacked.

    :raises LoopyError: if an entry of *args_to_pack* is not a sub-array
        reference of a matching call, or if an entry of *args_to_unpack* is
        not packed.

    If no call to *call_name* needs packing, *kernel* itself is returned.
    """
    assert isinstance(kernel, LoopKernel)
    new_domains = []
    new_tmps = kernel.temporary_variables.copy()
    old_insn_to_new_insns = {}

    # One generator of each kind is shared by all call sites so that names
    # handed out for one call site cannot be handed out again for another.
    # NOTE(review): presumably each get_*_generator() call returns a fresh
    # generator; sharing one is correct either way.
    vng = kernel.get_var_name_generator()
    ing = kernel.get_instruction_id_generator()

    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            # pack and unpack call only be done for CallInstructions.
            continue
        if insn.expression.function.name not in callables_table:
            continue

        in_knl_callable = callables_table[insn.expression.function.name]

        if in_knl_callable.name != call_name:
            # not the function we're looking for.
            continue
        in_knl_callable = in_knl_callable.with_packing_for_args()

        parameters = insn.expression.parameters

        def _all_swept_array_names():
            # every array passed to this call as a swept sub-array reference
            return [
                par.subscript.aggregate.name
                for par in parameters + insn.assignees
                if isinstance(par, SubArrayRef) and (par.swept_inames)
            ]

        # The "pack everything" default is resolved per call site. It must
        # not overwrite the function arguments: otherwise the arrays of the
        # first matching call would be demanded of every later one.
        if args_to_pack is None:
            insn_args_to_pack = _all_swept_array_names()
        else:
            insn_args_to_pack = args_to_pack
        if args_to_unpack is None:
            insn_args_to_unpack = _all_swept_array_names()
        else:
            insn_args_to_unpack = args_to_unpack

        # {{{ sanity checks for args

        assert isinstance(insn_args_to_pack, list)
        assert isinstance(insn_args_to_unpack, list)

        for arg in insn_args_to_pack:
            found_sub_array_ref = False

            for par in parameters + insn.assignees:
                # checking that the given args is a sub array ref
                if isinstance(par, SubArrayRef) and (
                        par.subscript.aggregate.name == arg):
                    found_sub_array_ref = True
                    break
            if not found_sub_array_ref:
                raise LoopyError(
                    "No match found for packing arg '%s' of call '%s' "
                    "at insn '%s'." % (arg, call_name, insn.id))
        for arg in insn_args_to_unpack:
            if arg not in insn_args_to_pack:
                raise LoopyError("Argument %s should be packed in order to be "
                        "unpacked." % arg)

        # }}}

        packing_insns = []
        unpacking_insns = []

        # {{{ handling ilp tags

        from loopy.kernel.data import IlpBaseTag, VectorizeTag
        import islpy as isl
        from pymbolic import var

        dim_type = isl.dim_type.set
        ilp_inames = {
            iname for iname in insn.within_inames
            if all(
                isinstance(tag, (IlpBaseTag, VectorizeTag))
                for tag in kernel.iname_to_tags.get(iname, []))
        }
        new_ilp_inames = set()
        ilp_inames_map = {}
        for iname in ilp_inames:
            new_iname_name = vng(iname + "_ilp")
            ilp_inames_map[var(iname)] = var(new_iname_name)
            new_ilp_inames.add(new_iname_name)
        for iname in ilp_inames:
            # duplicate the domain under the renamed inames
            new_domain = kernel.get_inames_domain(iname).copy()
            for i in range(new_domain.n_dim()):
                old_iname = new_domain.get_dim_name(dim_type, i)
                if old_iname in ilp_inames:
                    new_domain = new_domain.set_dim_name(
                        dim_type, i, ilp_inames_map[var(old_iname)].name)
            new_domains.append(new_domain)

        # }}}

        from pymbolic.mapper.substitutor import make_subst_func
        from loopy.symbolic import SubstitutionMapper

        # dict to store the new assignees and parameters, the mapping pattern
        # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype
        id_to_parameters = tuple(enumerate(parameters)) + tuple(
            (-i - 1, assignee) for i, assignee in enumerate(insn.assignees))
        new_id_to_parameters = {}

        for arg_id, p in id_to_parameters:
            if isinstance(p, SubArrayRef) and (
                    p.subscript.aggregate.name in insn_args_to_pack):
                # packing-specific inames
                new_pack_inames = {
                    iname: var(vng(iname.name + "_pack"))
                    for iname in p.swept_inames
                }
                # unpacking-specific inames
                new_unpack_inames = {
                    iname: var(vng(iname.name + "_unpack"))
                    for iname in p.swept_inames
                }

                # Updating the domains corresponding to the new inames.
                for iname in p.swept_inames:
                    new_domain_pack = kernel.get_inames_domain(
                        iname.name).copy()
                    new_domain_unpack = kernel.get_inames_domain(
                        iname.name).copy()
                    for i in range(new_domain_pack.n_dim()):
                        old_iname = new_domain_pack.get_dim_name(dim_type, i)
                        if var(old_iname) in new_pack_inames:
                            new_domain_pack = new_domain_pack.set_dim_name(
                                dim_type, i,
                                new_pack_inames[var(old_iname)].name)
                            new_domain_unpack = new_domain_unpack.set_dim_name(
                                dim_type, i,
                                new_unpack_inames[var(old_iname)].name)
                    new_domains.append(new_domain_pack)
                    new_domains.append(new_domain_unpack)

                arg = p.subscript.aggregate.name
                pack_name = vng(arg + "_pack")

                from loopy.kernel.data import (TemporaryVariable,
                        temp_var_scope)

                if arg in kernel.arg_dict:
                    arg_in_caller = kernel.arg_dict[arg]
                else:
                    arg_in_caller = kernel.temporary_variables[arg]

                # Temporary laid out the way the callee's descriptor asks.
                pack_tmp = TemporaryVariable(
                    name=pack_name,
                    dtype=arg_in_caller.dtype,
                    dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags,
                    shape=in_knl_callable.arg_id_to_descr[arg_id].shape,
                    scope=temp_var_scope.PRIVATE,
                )

                new_tmps[pack_name] = pack_tmp

                from loopy import Assignment
                pack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_pack_inames))
                unpack_subst_mapper = SubstitutionMapper(
                    make_subst_func(new_unpack_inames))

                # {{{ getting the lhs for packing and rhs for unpacking

                from loopy.isl_helpers import simplify_via_aff, make_slab

                # flat offset of the access in the caller's layout ...
                flatten_index = simplify_via_aff(
                    sum(dim_tag.stride * idx for dim_tag, idx in zip(
                        arg_in_caller.dim_tags, p.subscript.index_tuple)))

                # ... split back into indices along the callee's strides
                new_indices = []
                for dim_tag in in_knl_callable.arg_id_to_descr[
                        arg_id].dim_tags:
                    ind = flatten_index // dim_tag.stride
                    flatten_index -= (dim_tag.stride * ind)
                    new_indices.append(ind)

                new_indices = tuple(simplify_via_aff(i) for i in new_indices)

                pack_lhs_assignee = pack_subst_mapper(
                    var(pack_name).index(new_indices))
                unpack_rhs = unpack_subst_mapper(
                    var(pack_name).index(new_indices))

                # }}}

                packing_insns.append(
                    Assignment(
                        assignee=pack_lhs_assignee,
                        expression=pack_subst_mapper.map_subscript(
                            p.subscript),
                        within_inames=insn.within_inames - ilp_inames
                        | {new_pack_inames[i].name for i in p.swept_inames}
                        | (new_ilp_inames),
                        depends_on=insn.depends_on,
                        id=ing(insn.id + "_pack"),
                        depends_on_is_final=True))

                if p.subscript.aggregate.name in insn_args_to_unpack:
                    unpacking_insns.append(
                        Assignment(
                            expression=unpack_rhs,
                            assignee=unpack_subst_mapper.map_subscript(
                                p.subscript),
                            within_inames=insn.within_inames - ilp_inames
                            | {
                                new_unpack_inames[i].name
                                for i in p.swept_inames
                            }
                            | (new_ilp_inames),
                            id=ing(insn.id + "_unpack"),
                            depends_on=frozenset([insn.id]),
                            depends_on_is_final=True))

                # {{{ creating the sweep inames for the new sub array refs

                updated_swept_inames = []

                for _ in in_knl_callable.arg_id_to_descr[arg_id].shape:
                    updated_swept_inames.append(var(vng("i_packsweep_" + arg)))

                ctx = kernel.isl_context
                space = isl.Space.create_from_names(
                    ctx, set=[iname.name for iname in updated_swept_inames])
                iname_set = isl.BasicSet.universe(space)
                for iname, axis_length in zip(
                        updated_swept_inames,
                        in_knl_callable.arg_id_to_descr[arg_id].shape):
                    iname_set = iname_set & make_slab(space, iname.name, 0,
                                                      axis_length)
                new_domains = new_domains + [iname_set]

                # }}}

                new_id_to_parameters[arg_id] = SubArrayRef(
                    tuple(updated_swept_inames),
                    (var(pack_name).index(tuple(updated_swept_inames))))
            else:
                new_id_to_parameters[arg_id] = p

        if packing_insns:
            subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map))
            new_call_insn = insn.with_transformed_expressions(subst_mapper)
            new_params = tuple(
                subst_mapper(new_id_to_parameters[i])
                for i, _ in enumerate(parameters))
            new_assignees = tuple(
                subst_mapper(new_id_to_parameters[-i - 1])
                for i, _ in enumerate(insn.assignees))
            new_call_insn = new_call_insn.copy(
                depends_on=new_call_insn.depends_on
                | {pack.id for pack in packing_insns},
                within_inames=new_call_insn.within_inames - ilp_inames
                | (new_ilp_inames),
                expression=new_call_insn.expression.function(*new_params),
                assignees=new_assignees)
            old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn]
                                              + unpacking_insns)

    if old_insn_to_new_insns:
        new_instructions = []
        for insn in kernel.instructions:
            if insn.id in old_insn_to_new_insns:
                # Replacing the current instruction with the group of
                # instructions including the packing and unpacking instructions
                new_instructions.extend(old_insn_to_new_insns[insn.id])
            else:
                # for the instructions that depend on the call instruction that
                # are to be packed and unpacked, we need to add the complete
                # instruction block as a dependency for them.
                new_depends_on = insn.depends_on
                if insn.depends_on & set(old_insn_to_new_insns):
                    # need to add the unpack instructions on dependencies.
                    for old_insn_id in insn.depends_on & set(
                            old_insn_to_new_insns):
                        new_depends_on |= frozenset(
                            i.id for i in old_insn_to_new_insns[old_insn_id])
                new_instructions.append(insn.copy(depends_on=new_depends_on))
        kernel = kernel.copy(domains=kernel.domains + new_domains,
                             instructions=new_instructions,
                             temporary_variables=new_tmps)

    return kernel
def map_subscript(self, expr):
    """Check that the array access *expr* stays within the array's shape.

    The shape is looked up among the kernel's arguments and temporary
    variables; accesses to anything else are not checked. The check is
    also skipped when the subscript or the shape depends on variables not
    present in ``self.domain``, or when computing the access range raises
    ``isl.Error`` or ``TypeError``.

    :raises LoopyError: if the number of indices does not match the shape,
        or if the access range is not a subset of the in-bounds indices.
    """
    WalkMapper.map_subscript(self, expr)

    from pymbolic.primitives import Variable
    assert isinstance(expr.aggregate, Variable)

    var_name = expr.aggregate.name
    if var_name in self.kernel.arg_dict:
        shape = self.kernel.arg_dict[var_name].shape
    elif var_name in self.kernel.temporary_variables:
        shape = self.kernel.temporary_variables[var_name].shape
    else:
        shape = None

    if shape is None:
        return

    subscript = expr.index
    if not isinstance(subscript, tuple):
        subscript = (subscript,)

    from loopy.symbolic import get_dependencies, get_access_range

    available_vars = set(self.domain.get_var_dict())

    shape_deps = set()
    for axis_len in shape:
        if axis_len is None:
            continue
        shape_deps.update(get_dependencies(axis_len))

    # Only accesses fully expressed in variables of this domain are checked.
    if not get_dependencies(subscript) <= available_vars:
        return
    if not shape_deps <= available_vars:
        return

    if len(subscript) != len(shape):
        raise LoopyError("subscript to '%s' in '%s' has the wrong "
                "number of indices (got: %d, expected: %d)" % (
                    expr.aggregate.name, expr,
                    len(subscript), len(shape)))

    try:
        access_range = get_access_range(self.domain, subscript,
                self.kernel.assumptions)
    except (isl.Error, TypeError):
        # Likely: index was non-linear, nothing we can do.
        return

    # Build the set of in-bounds indices, one slab per axis of known length.
    shape_domain = isl.BasicSet.universe(access_range.get_space())
    for idim, axis_len in enumerate(shape):
        if axis_len is None:
            continue

        from loopy.isl_helpers import make_slab
        shape_domain = shape_domain.intersect(
                make_slab(shape_domain.get_space(),
                    (dim_type.in_, idim), 0, axis_len))

    if access_range.is_subset(shape_domain):
        return

    raise LoopyError("'%s' in instruction '%s' "
            "accesses out-of-bounds array element"
            % (expr, self.insn_id))
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    """Assign hardware axis indices to the hardware-parallel inames.

    One iname is handled per call: it is replaced by the matching
    group/local hardware axis index (offset by the iname's static lower
    bound), and the function recurses for the remaining inames once per
    slab of the iname's slab decomposition. When no inames are left,
    ``next_func(codegen_state)`` is returned.

    :arg hw_inames_left: inames still to be processed. If *None*, the
        inames used by the instructions of the block at *schedule_index*
        that carry a ``HardwareConcurrentTag``.
    :raises RuntimeError: for an unexpected tag type, or when a slab
        decomposition is requested for an iname sharing its tag.
    """
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
            LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    block_insn_ids = get_insn_ids_for_block_at(
            kernel.schedule, schedule_index)

    if hw_inames_left is None:
        used_inames = set()
        for insn_id in block_insn_ids:
            used_inames |= kernel.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in used_inames:
            if kernel.iname_tags_of_type(candidate, HardwareConcurrentTag):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            block_insn_ids)

    # Copy before popping so the caller's list stays intact.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def _shares_tag(other_iname):
        return (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname) if _tag))

    other_inames_with_same_tag = [
        other_iname for other_iname in kernel.all_inames()
        if _shares_tag(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)
    has_several_slabs = len(slabs) > 1

    if other_inames_with_same_tag and has_several_slabs:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if has_several_slabs:
            comment = codegen_state.ast_builder.emit_comment(
                    "%s slab for '%s'" % (slab_name, iname))
            result.append(comment)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
        new_codegen_state = assigned_state.copy(kernel=slabbed_kernel)

        result.append(
                set_up_hw_parallel_loops(
                    new_codegen_state, schedule_index, next_func,
                    hw_inames_left))

    return merge_codegen_results(codegen_state, result)
def __setitem__(self, index, value):
    """
    Registers an assignment that stores ``value`` into the entries of this
    array selected by ``index``.

    If this symbol is not yet backed by an array (its name is not a key of
    ``self.stack.substs_to_arrays``), a new array is first filled from the
    current substitution, and after the assignment a fresh substitution
    rule reading from that array is registered and becomes ``self.name``.

    :arg index: A number, a :class:`slice`, a
        :class:`numloopy.ArraySymbol`, or a tuple of these. Each slice or
        array-symbol entry gets its own iname ranging over the full length
        of the corresponding axis; the bounds of a slice are not
        consulted.

    :arg value: An instance of :class:`numloopy.ArraySymbol` or a number.

    :raises TypeError: if ``value`` is neither an array symbol nor a
        number. This is checked before anything is registered on the
        stack.
    """
    if isinstance(index, (Number, slice, ArraySymbol)):
        index = (index, )
    assert isinstance(index, tuple)

    # Reject unsupported values up front so that a failed assignment does
    # not leave stray instructions or domains on the stack.
    if not isinstance(value, (ArraySymbol, Number)):
        raise TypeError("arrays can be only assigned with number or other "
                "arrays")

    stack = self.stack

    def box_domain(names, lengths):
        # Set over *names* with 'names[k]' bounded by a slab from 0 to
        # 'lengths[k]'.
        space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, names)
        domain = isl.BasicSet.universe(space)
        for name, length in zip(names, lengths):
            domain &= make_slab(space, name, 0, length)
        return domain

    # current heuristic: assumes that the dereferenced guys are
    # always arguments and not temporary variables, maybe we need to fix
    # this later?
    try:
        arg_name = stack.substs_to_arrays[self.name]
    except KeyError:
        # Not backed by an array yet: materialize the substitution into a
        # new array before overwriting parts of it.
        inames = tuple(
                stack.name_generator(based_on="i") for _ in self.shape)
        arg_name = stack.name_generator(based_on="arr")
        insn = lp.Assignment(
                assignee=parse('{}[{}]'.format(arg_name,
                    ', '.join(inames))),
                expression=parse('{}({})'.format(self.name,
                    ', '.join(inames))))
        stack.register_implicit_assignment(insn)
        stack.domains.append(box_domain(inames, self.shape))

    # now handling the second assignment

    # One fresh iname per slice / array-symbol index, paired with the
    # length of the axis it runs over.
    ranged_axes = [
            (stack.name_generator(based_on="i"), axis_len)
            for idx, axis_len in zip(index, self.shape)
            if isinstance(idx, (slice, ArraySymbol))]

    if ranged_axes:
        inames = tuple(name for name, _ in ranged_axes)
        iname_lens = tuple(axis_len for _, axis_len in ranged_axes)
        stack.domains.append(box_domain(inames, iname_lens))
    else:
        # Purely scalar indexing: no loop domain is needed.
        inames = ()

    indices = []
    _k = 0
    for idx in index:
        if isinstance(idx, slice):
            indices.append(Variable(inames[_k]))
            _k += 1
        elif isinstance(idx, ArraySymbol):
            # Indirect access: index with the array symbol's value at the
            # current iname.
            indices.append(Variable(idx.name)(Variable(inames[_k])))
            _k += 1
        else:
            indices.append(idx)
    assert _k == len(inames)

    assignee = Subscript(Variable(arg_name), tuple(indices))
    if isinstance(value, ArraySymbol):
        expression = '{}({})'.format(
                value.name,
                ', '.join(str(iname) for iname in inames))
    else:
        expression = value

    stack.register_implicit_assignment(
            lp.Assignment(assignee=assignee, expression=expression))

    if self.name not in stack.substs_to_arrays:
        # Re-point this symbol at a substitution that reads the new array.
        subst_name = stack.name_generator(based_on="subst")
        inames = tuple(
                stack.name_generator(based_on='i') for _ in self.shape)
        rule = lp.SubstitutionRule(
                subst_name,
                inames,
                expression=Subscript(Variable(arg_name),
                    tuple(Variable(iname) for iname in inames)))
        stack.register_substitution(rule)
        stack.data.append(self.copy(name=arg_name))
        stack.substs_to_arrays[subst_name] = arg_name
        self.name = subst_name
def _make_slab(self, space, iname, start, stop):
    """Forward the arguments unchanged to ``loopy.isl_helpers.make_slab``.

    The loopy helper module is imported when the method is called, and
    whatever ``make_slab`` returns is handed back to the caller.
    """
    import loopy.isl_helpers as isl_helpers

    return isl_helpers.make_slab(space, iname, start, stop)