def add_dependency(kernel, insn_match, dependency):
    """Add the instruction dependency *dependency* to the instructions matched
    by *insn_match*.

    *insn_match* may be any instruction id match understood by
    :func:`loopy.match.parse_match`.

    :arg dependency: the ID of an instruction present in
        ``kernel.id_to_insn``; a :class:`LoopyError` is raised otherwise.
    :returns: whatever :func:`map_instructions` returns for the rewritten
        instructions.
    """
    if dependency not in kernel.id_to_insn:
        raise LoopyError(
                "cannot add dependency on non-existent instruction ID '%s'"
                % dependency)

    extra_deps = frozenset([dependency])

    def with_extra_dep(insn):
        # depends_on may be None; then the new dependency is the only one.
        existing = insn.depends_on
        merged = extra_deps if existing is None else existing | extra_deps
        return insn.copy(depends_on=merged)

    return map_instructions(kernel, insn_match, with_extra_dep)
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop entered at *sched_index* by unrolling it.

    The loop body (the schedule starting at ``sched_index+1``) is built once
    per iteration with the iname fixed to ``lower_bound + i`` and the
    per-iteration results are merged.

    :arg codegen_state: the current code generation state; its ``kernel``
        supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the item whose
        ``iname`` is to be unrolled.
    :returns: the result of :func:`merge_codegen_results` over all iterations.
    :raises LoopyError: if the static maximum of the loop length is not a
        constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname was previously never interpolated into this message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type (callers may catch it), but
        # keep the original message and chain the cause instead of
        # discarding both.
        raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e)) from e

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(new_codegen_state, sched_index+1))

    return merge_codegen_results(codegen_state, result)
def _is_racing_iname_tag(tv, tag):
    """Return whether iname tag *tag* counts as racing for temporary *tv*.

    The answer depends on ``tv.scope``:

    * ``PRIVATE``: *tag* is a :class:`ConcurrentTag` that is neither a
      :class:`LocalIndexTagBase` nor a :class:`GroupIndexTag`.
    * ``LOCAL``: *tag* is a :class:`ConcurrentTag` that is not a
      :class:`GroupIndexTag`.
    * ``GLOBAL``: *tag* is any :class:`ConcurrentTag`.

    :raises LoopyError: if ``tv.scope`` is still ``auto``.
    :raises ValueError: if ``tv.scope`` has any other value.
    """
    from loopy.kernel.data import (temp_var_scope,
            LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)

    if tv.scope == temp_var_scope.PRIVATE:
        return (
                isinstance(tag, ConcurrentTag)
                and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))

    elif tv.scope == temp_var_scope.LOCAL:
        return (
                isinstance(tag, ConcurrentTag)
                and not isinstance(tag, GroupIndexTag))

    elif tv.scope == temp_var_scope.GLOBAL:
        return isinstance(tag, ConcurrentTag)

    elif tv.scope == auto:
        # Trailing space added: the two literals are concatenated and
        # previously produced "has not yet beendetermined".
        raise LoopyError("scope of temp var '%s' has not yet been "
                "determined" % tv.name)

    else:
        raise ValueError("unexpected value of temp_var.scope for "
                "temporary variable '%s'" % tv.name)
def map_call(self, expr, return_tuple=False):
    """Infer the result type(s) of the function call *expr*.

    :arg return_tuple: if true, the single list entry returned is the whole
        ``result_dtypes`` sequence of the mangled function; otherwise it is
        its only element.
    :returns: a list with one entry, or an empty list if the recursive call
        for any parameter returned an empty result.
    :raises LoopyError: if *return_tuple* is false and the function does not
        have exactly one result dtype.
    :raises RuntimeError: if ``self.kernel.mangle_function`` returns *None*.
    """
    from pymbolic.primitives import Variable

    identifier = expr.function
    if isinstance(identifier, Variable):
        identifier = identifier.name

    if identifier in ["indexof", "indexof_vec"]:
        return [self.kernel.index_dtype]

    def sole_dtype_or_none(candidates):
        # An empty result maps to None; otherwise exactly one entry is
        # expected (the unpacking fails for any other length).
        if not candidates:
            return None
        (only,) = candidates
        return only

    arg_dtypes = tuple(
            sole_dtype_or_none(self.rec(par)) for par in expr.parameters)
    if None in arg_dtypes:
        return []

    mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
    if mangle_result is None:
        raise RuntimeError("unable to resolve "
                "function '%s' with %d given arguments"
                % (identifier, len(arg_dtypes)))

    if return_tuple:
        return [mangle_result.result_dtypes]

    if len(mangle_result.result_dtypes) != 1:
        raise LoopyError("functions with more or fewer than one "
                "return value may only be used in direct assignments")

    return [mangle_result.result_dtypes[0]]
def map_reduction(self, expr, return_tuple=False):
    """Infer the result type(s) of the reduction *expr*.

    The reduction operation's ``result_dtypes`` is asked for its result
    types, given the single type obtained for ``expr.expr`` (or *None* if
    the recursive call returned an empty result).

    :arg return_tuple: if true, the single list entry returned is the whole
        result-dtype sequence; otherwise it is its only element.
    :returns: a list with one entry, or an empty list if ``result_dtypes``
        returned *None*.
    :raises LoopyError: if *return_tuple* is false and the reduction does
        not have exactly one result dtype.
    """
    arg_candidates = self.rec(expr.expr)

    if arg_candidates:
        (arg_dtype,) = arg_candidates
    else:
        arg_dtype = None

    result_dtypes = expr.operation.result_dtypes(
            self.kernel, arg_dtype, expr.inames)

    if result_dtypes is None:
        return []

    if return_tuple:
        return [result_dtypes]

    if len(result_dtypes) != 1:
        raise LoopyError(
                "reductions with more or fewer than one "
                "return value may only be used in direct assignments")

    return [result_dtypes[0]]
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Build the arguments for the reference kernel run.

    :arg impl_arg_info: iterable of implemented-argument descriptors; each
        is dispatched on its ``arg_class``.
    :arg queue: passed to the :mod:`pyopencl.array` allocation calls.
    :arg parameters: mapping from value-argument name to value; also used
        to evaluate symbolic shapes and strides.
    :returns: a tuple ``(ref_args, ref_arg_data)``. *ref_args* maps argument
        names to values/arrays/images. *ref_arg_data* holds *None* for each
        value argument and a :class:`TestArgInfo` for each array-like
        argument. Offset value arguments and temporaries contribute no
        entry to either.
    :raises LoopyError: for arrays with unknown shape, non-image arrays with
        unknown dtype, written images, or an unrecognized ``arg_class``.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            # value arguments that carry an array offset are skipped entirely
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                # no .dtype attribute (e.g. a plain Python number)
                argv_dtype = None

            # convert to the argument's declared numpy scalar type if needed
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
                or arg.arg_class is ConstantArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError(
                        "array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            # arrays the kernel writes to are the ones flagged for checking
            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                # flat storage length in elements: one more than the sum over
                # axes of stride*(length-1), where a zero stride is counted
                # as (length-1)
                alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                # strides are in elements; numpy_strides are in bytes
                itemsize = dtype.itemsize
                numpy_strides = [itemsize * s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            fill_rand(storage_array)

            # a copy of the filled storage is recorded as the "pre-run" state
            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,

                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,

                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type %s not understood" % type(arg))

    return ref_args, ref_arg_data
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Build the arguments for the test kernel run from the reference data.

    Array arguments are initialized from the reference run's pre-run
    storage, re-laid-out to the test kernel's shape and strides.

    :arg impl_arg_info: iterable of implemented-argument descriptors,
        iterated in lockstep with *ref_arg_data*.
    :arg queue: passed to :func:`pyopencl.array.to_device`; its ``context``
        is used for image creation.
    :arg ref_arg_data: per-argument records; for array arguments the
        ``test_*`` attributes are set on them as a side effect.
    :arg parameters: mapping from value-argument name to value; also used
        to evaluate symbolic shapes and strides.
    :returns: a dict mapping argument names to values/arrays/images.
    :raises NotImplementedError: for images written by the kernel.
    :raises LoopyError: for an unrecognized ``arg_class``.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                # no .dtype attribute (e.g. a plain Python number)
                argv_dtype = None

            # convert to the argument's declared numpy scalar type if needed
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is ArrayArg or\
                arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            # strides are in elements; numpy_strides are in bytes
            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            # flat storage length in elements: one more than the sum over
            # axes of stride*(length-1), where a zero stride is counted as
            # (length-1)
            alloc_size = sum(astrd * (alen - 1) if astrd != 0 else alen - 1
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            # NOTE: a device-to-host copy of arg_desc.ref_storage_array used
            # to be made here; its result was never read, so it was removed.
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Check that every non-boostable instruction in a schedule chunk uses
    exactly the available group and local hardware axes.

    :arg sched_index: if *None*, the whole schedule is walked and the set of
        available axes is empty. Otherwise it must be the index of a
        ``CallKernel`` item; the block it opens is walked and the available
        axes are taken from the grid sizes of the instructions in that
        block.
    :returns: the schedule index just past the chunk that was checked.
    :raises LoopyError: if an instruction's used axes differ from the
        available ones, or an auto local tag is encountered.
    :raises TypeError: on an unrecognized schedule item.
    """
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = {ax for ax, _ in enumerate(group_size)}
        local_axes = {ax for ax, _ in enumerate(local_size)}

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        # stop before the closing ReturnFromKernel
        stop = past_end_i - 1
    else:
        group_axes = set()
        local_axes = set()

        pos = 0
        stop = past_end_i = len(kernel.schedule)

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
                        GroupIndexTag)

    while pos < stop:
        sched_item = kernel.schedule[pos]

        if isinstance(sched_item, CallKernel):
            # the recursive call returns the index past the nested block
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if not isinstance(sched_item, RunInstruction):
            if isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
                pos += 1
                continue

            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

        insn = kernel.id_to_insn[sched_item.insn_id]
        pos += 1

        if insn.boostable:
            continue

        group_axes_used = set()
        local_axes_used = set()

        for iname in kernel.insn_inames(insn):
            # all three lookups are performed before any is inspected
            ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
            gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
            altags = kernel.iname_tags_of_type(
                    iname, AutoLocalIndexTagBase, max_num=1)

            if ltags:
                (ltag,) = ltags
                local_axes_used.add(ltag.axis)
            elif gtags:
                (gtag,) = gtags
                group_axes_used.add(gtag.axis)
            elif altags:
                raise LoopyError("auto local tag encountered")

        if group_axes != group_axes_used:
            raise LoopyError("instruction '%s' does not use all group hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(str(ax) for ax in group_axes),
                        ",".join(str(ax) for ax in group_axes_used)))

        if local_axes != local_axes_used:
            raise LoopyError("instruction '%s' does not use all local hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(str(ax) for ax in local_axes),
                        ",".join(str(ax) for ax in local_axes_used)))

    return past_end_i
def check_loop_priority_inames_known(kernel):
    """Check that every iname named in ``kernel.loop_priority`` exists.

    :raises LoopyError: on the first iname in any priority entry that is not
        in ``kernel.all_inames()``.
    """
    # Looked up once rather than once per iname in every priority entry.
    known_inames = kernel.all_inames()

    for prio in kernel.loop_priority:
        for iname in prio:
            if iname not in known_inames:
                raise LoopyError("unknown iname '%s' in loop priorities" % iname)
def __init__(self,
        assignees, expression,
        id=None,
        depends_on=None,
        depends_on_is_final=None,
        groups=None,
        conflicts_with_groups=None,
        no_sync_with=None,
        within_inames_is_final=None,
        within_inames=None,
        boostable=None, boostable_into=None, tags=None,
        temp_var_types=None,
        priority=0, predicates=frozenset(),
        insn_deps=None, insn_deps_is_final=None,
        forced_iname_deps=None,
        forced_iname_deps_is_final=None):
    """
    :arg assignees: a tuple of lvalues (variables, subscripts or linear
        subscripts), or a string that parses to such a tuple.
    :arg expression: a function call or reduction, a string that parses to
        one, or *None*.
    :arg temp_var_types: if *None*, defaults to a tuple of *None* with one
        entry per assignee.

    All remaining arguments are forwarded unchanged to the base class
    constructor.

    :raises LoopyError: if *expression* is not a call/reduction (or *None*),
        if *assignees* is not a tuple, or if an assignee is not a valid
        lvalue.
    """

    super(CallInstruction, self).__init__(
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            groups=groups,
            conflicts_with_groups=conflicts_with_groups,
            no_sync_with=no_sync_with,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            boostable=boostable,
            boostable_into=boostable_into,
            priority=priority,
            predicates=predicates,
            tags=tags,
            insn_deps=insn_deps,
            insn_deps_is_final=insn_deps_is_final,
            forced_iname_deps=forced_iname_deps,
            forced_iname_deps_is_final=forced_iname_deps_is_final)

    from pymbolic.primitives import Call
    from loopy.symbolic import Reduction, parse

    # Strings must be parsed *before* the type check. Previously the check
    # came first and rejected every string, which left the string-parsing
    # branch unreachable.
    if isinstance(expression, str):
        expression = parse(expression)

    if not isinstance(expression, (Call, Reduction)) and expression is not None:
        raise LoopyError("'expression' argument to CallInstruction "
                "must be a function call")

    if isinstance(assignees, str):
        assignees = parse(assignees)
    if not isinstance(assignees, tuple):
        raise LoopyError("'assignees' argument to CallInstruction "
                "must be a tuple or a string parseable to a tuple"
                "--got '%s'" % type(assignees).__name__)

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript
    for assignee in assignees:
        if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
            raise LoopyError("invalid lvalue '%s'" % assignee)

    self.assignees = assignees
    self.expression = expression

    if temp_var_types is None:
        self.temp_var_types = (None,) * len(self.assignees)
    else:
        self.temp_var_types = temp_var_types
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    """Move factors common to all increments of *var_name* from the
    instructions writing it to the instructions reading it.

    For every assignment to *var_name*, the right-hand side is viewed as a
    sum of products. Factors occurring in every term (other than the term
    equal to the left-hand side itself) are collected per index key, removed
    from the writing instructions, and multiplied onto each occurrence of
    *var_name* in assignments that do not write it.

    :arg var_name: name of a temporary variable or argument of *kernel*.
    :arg vary_by_axes: axes of *var_name* (indices, axis names, or a
        comma-separated string of names) whose subscripts form the index key
        by which common factors are tracked separately.
    :returns: a copy of *kernel* with rewritten instructions.
    :raises NameError: if *var_name* is neither a temporary nor an argument.
    :raises LoopyError: on unknown axis names, out-of-bounds axis indices,
        non-assignment writers of *var_name*, a right-hand side depending on
        *var_name* outside of the plain increment term, or if no common
        factors are found.
    """
    assert isinstance(kernel, LoopKernel)

    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = {
                    name: idx
                    for idx, name in enumerate(var_descr.dim_names)}
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        # Valid axis indices are 0 .. num_user_axes()-1, hence '>='. The
        # previous '>' let an index equal to num_user_axes() through.
        if (
                vary_by_axes
                and
                (min(vary_by_axes) < 0
                    or
                    max(vary_by_axes) >= var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero,
            flattened_sum, flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
            UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        # Returns (position in common_factors, unification record) for the
        # first stored key that unifies with index_key, else (None, None).
        for i, (key, _val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                    lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        if isinstance(access_expr, Variable):
            return ()

        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return var_name in insn.assignee_var_names()

    def iterate_as(cls, expr):
        # Yield the children of expr if it is a cls node, else expr itself.
        if isinstance(expr, cls):
            yield from expr.children
        else:
            yield expr

    def product_parts_of(term, insn):
        # Return the set of factors of term, rejecting factors that depend
        # on var_name.
        for part in iterate_as(Product, term):
            if var_name in get_dependencies(part):
                raise LoopyError("unexpected dependency on '%s' "
                        "in RHS of instruction '%s'"
                        % (var_name, insn.id))

        return set(iterate_as(Product, term))

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-single-assignment"
                    % var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                # the "var += ..." self-reference is not a factor source
                if term == lhs:
                    continue

                product_parts = product_parts_of(term, insn)

                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                    make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                product_parts = product_parts_of(term, insn)

                my_common_factors = {
                        cf for cf in my_common_factors
                        if unif_subst_map(cf) in product_parts}

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    # drop index keys for which no common factor survived
    common_factors = [
            (ik, cf) for ik, cf in common_factors
            if cf]

    if not common_factors:
        raise LoopyError("no common factors found")

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(lhs)

        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        mapped_my_common_factors = {
                unif_subst_map(cf)
                for cf in my_common_factors}

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            new_sum_terms.append(
                    flattened_product([
                        part
                        for part in iterate_as(Product, term)
                        if part not in mapped_my_common_factors
                        ]))

        new_insns.append(
                insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # No factors were collected (and hence none removed) for this
            # entry, so the access stays as it is. Previously this case
            # failed with an AttributeError on 'unif_result.lmap'.
            return expr

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        return flattened_product(
                [unif_subst_map(cf) for cf in my_common_factors]
                + [expr])

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
def gen_decls(name_suffix,
        shape, strides,
        unvec_shape, unvec_strides,
        stride_arg_axes,
        dtype, user_index):
    """
    Recursively walk the user axes of the array, one per call, and yield
    :class:`ImplementedDataInfo` records once all axes have been consumed:
    first the array itself, then (if the offset is automatic) its offset
    argument, then any stride arguments.

    :arg unvec_shape: shape tuple that accounts for
        :class:`loopy.kernel.array.VectorArrayDimTag`
        in a scalar manner
    :arg unvec_strides: strides tuple that accounts for
        :class:`loopy.kernel.array.VectorArrayDimTag`
        in a scalar manner
    :arg stride_arg_axes: a tuple *(user_axis, impl_axis, unvec_impl_axis)*
    :arg user_index: A tuple representing a (user-facing)
        multi-dimensional subscript. This is filled in with
        concrete integers when known (such as for separate-array
        dim tags), and with *None* where the index won't be
        known until run time.
    """

    if dtype is None:
        dtype = self.dtype

    # the axis handled by this call is the next one not yet in user_index
    user_axis = len(user_index)

    num_user_axes = self.num_user_axes(require_answer=False)

    if num_user_axes is None or user_axis >= num_user_axes:
        # {{{ recursion base case

        full_name = self.name + name_suffix

        stride_args = []
        strides = list(strides)
        unvec_strides = list(unvec_strides)

        # generate stride arguments, yielded later to keep array first
        for s_user_axis, s_impl_axis, s_unvec_impl_axis in stride_arg_axes:
            stride_name = full_name + "_stride%d" % s_user_axis

            from pymbolic import var
            stride_var = var(stride_name)
            strides[s_impl_axis] = stride_var
            unvec_strides[s_unvec_impl_axis] = stride_var

            stride_args.append(
                    ImplementedDataInfo(
                        target=target,
                        name=stride_name,
                        dtype=index_dtype,
                        arg_class=ValueArg,
                        stride_for_name_and_axis=(
                            full_name, s_impl_axis),
                        is_written=False))

        yield ImplementedDataInfo(
                    target=target,
                    name=full_name,
                    base_name=self.name,

                    arg_class=type(self),
                    dtype=dtype,
                    shape=shape,
                    strides=tuple(strides),
                    unvec_shape=unvec_shape,
                    unvec_strides=tuple(unvec_strides),
                    allows_offset=bool(self.offset),

                    is_written=is_written)

        import loopy as lp

        if self.offset is lp.auto:
            offset_name = full_name + "_offset"
            yield ImplementedDataInfo(
                        target=target,
                        name=offset_name,
                        dtype=index_dtype,
                        arg_class=ValueArg,
                        offset_for_name=full_name,
                        is_written=False)

        yield from stride_args

        # }}}

        return

    dim_tag = self.dim_tags[user_axis]

    if isinstance(dim_tag, FixedStrideArrayDimTag):
        new_shape_axis = (
                None if array_shape is None else array_shape[user_axis])

        import loopy as lp
        if dim_tag.stride is lp.auto:
            new_stride_arg_axes = stride_arg_axes \
                    + ((user_axis, len(strides), len(unvec_strides)),)

            # repaired above when final array name is known
            # (and stride argument is created)
            new_stride_axis = None
        else:
            new_stride_arg_axes = stride_arg_axes
            new_stride_axis = dim_tag.stride

        yield from gen_decls(name_suffix,
                shape + (new_shape_axis,), strides + (new_stride_axis,),
                unvec_shape + (new_shape_axis,),
                unvec_strides + (new_stride_axis,),
                new_stride_arg_axes,
                dtype, user_index + (None,))
        return

    if not isinstance(dim_tag, (SeparateArrayArrayDimTag, VectorArrayDimTag)):
        raise LoopyError(
                "unsupported array dim implementation tag '%s' "
                "in array '%s'" % (dim_tag, self.name))

    # both remaining tag kinds need a concrete integer axis length
    axis_len = array_shape[user_axis]
    if not is_integer(axis_len):
        raise LoopyError("shape of '%s' has non-constant "
                "integer axis %d (0-based)" % (
                    self.name, user_axis))

    if isinstance(dim_tag, SeparateArrayArrayDimTag):
        # one set of declarations per entry along this axis
        for i in range(axis_len):
            yield from gen_decls(name_suffix + "_s%d" % i,
                    shape, strides, unvec_shape, unvec_strides,
                    stride_arg_axes, dtype,
                    user_index + (i,))
    else:
        yield from gen_decls(
                name_suffix,
                shape, strides,
                unvec_shape + (axis_len,),
                # vectors always have stride 1
                unvec_strides + (1,),
                stride_arg_axes,
                target.vector_dtype(dtype, axis_len),
                user_index + (None,))
def __init__(self, name, dtype=None, shape=(), address_space=None,
        dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
        base_indices=None, storage_shape=None,
        base_storage=None, initializer=None, read_only=False,
        _base_storage_access_may_be_aliasing=False, **kwargs):
    """
    :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
    :arg shape: :class:`loopy.auto` or a shape tuple
    :arg base_indices: :class:`loopy.auto` or a tuple of base indices.
        If *None* and *shape* is known, defaults to all zeros; if *shape*
        is :class:`loopy.auto`, it is left as *None*.
    :arg address_space: defaults to :class:`loopy.auto` if not given. The
        deprecated keyword ``scope`` is accepted as an alias, but not
        together with *address_space*.
    :arg initializer: *None* or a :class:`numpy.ndarray`. If given, *offset*
        must be 0, *read_only* must be true, *base_storage* must be *None*,
        and *dtype*/*shape* are taken from it when they are automatic.
    :arg order: defaults to ``"C"``.

    :raises ValueError: if both ``scope`` and *address_space* are passed.
    :raises LoopyError: on any inconsistent combination of the arguments
        described above.
    """

    scope = kwargs.pop("scope", None)

    if scope is not None:
        warn("Passing 'scope' is deprecated. Use 'address_space' instead.",
                DeprecationWarning, stacklevel=2)

        if address_space is not None:
            raise ValueError("only one of 'scope' and 'address_space' "
                    "may be specified")
        else:
            address_space = scope

    del scope

    # An unreachable "address_space must not be None" check used to follow
    # this defaulting and was removed.
    if address_space is None:
        address_space = auto

    if initializer is None:
        pass
    elif isinstance(initializer, np.ndarray):
        if offset != 0:
            raise LoopyError(
                    "temporary variable '%s': "
                    "offset must be 0 if initializer specified"
                    % name)

        from loopy.types import NumpyType, to_loopy_type
        if dtype is auto or dtype is None:
            dtype = NumpyType(initializer.dtype)
        elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype):
            raise LoopyError(
                    "temporary variable '%s': "
                    "dtype of initializer does not match "
                    "dtype of array."
                    % name)

        if shape is auto:
            shape = initializer.shape
    else:
        raise LoopyError(
                "temporary variable '%s': "
                "initializer must be None or a numpy array"
                % name)

    if order is None:
        order = "C"

    # With shape still 'auto' there is no length to take; len(auto) would
    # raise a TypeError, so base_indices is left unset in that case.
    if base_indices is None and shape is not auto:
        base_indices = (0,) * len(shape)

    if not read_only and initializer is not None:
        raise LoopyError(
                "temporary variable '%s': "
                "read-write variables with initializer "
                "are not currently supported "
                "(did you mean to set read_only=True?)"
                % name)

    if base_storage is not None and initializer is not None:
        raise LoopyError(
                "temporary variable '%s': "
                "base_storage and initializer are "
                "mutually exclusive"
                % name)

    if base_storage is None and _base_storage_access_may_be_aliasing:
        raise LoopyError(
                "temporary variable '%s': "
                "_base_storage_access_may_be_aliasing option, but no "
                "base_storage given!"
                % name)

    ArrayBase.__init__(self, name=intern(name),
            dtype=dtype, shape=shape, strides=strides,
            dim_tags=dim_tags, offset=offset, dim_names=dim_names,
            order=order,
            base_indices=base_indices,
            address_space=address_space,
            storage_shape=storage_shape,
            base_storage=base_storage,
            initializer=initializer,
            read_only=read_only,
            _base_storage_access_may_be_aliasing=(
                _base_storage_access_may_be_aliasing),
            **kwargs)
def map_local_hw_index(self, expr, enclosing_prec, type_context):
    """Reject local hardware index expressions.

    :raises LoopyError: always.
    """
    # This is the *local* index handler; the message previously named
    # "group hw axes".
    raise LoopyError("plain C does not have local hw axes")
def map_call(self, expr, enclosing_prec, type_context):
    """Generate the code string for the function call *expr*.

    ``indexof`` and ``indexof_vec`` are implemented directly in terms of
    the access information of their (subscript) argument. Every other
    function is resolved through ``self.kernel.mangle_function`` and
    recorded in ``self.codegen_state.seen_functions``.

    :returns: for ``indexof``/``indexof_vec``, an index expression;
        otherwise a string of the form ``target_name(arg, ...)``.
    :raises LoopyError: on misuse of ``indexof``/``indexof_vec``, or if the
        function does not have exactly one return value.
    :raises RuntimeError: if the function cannot be resolved.
    """
    from pymbolic.primitives import Variable, Subscript
    from pymbolic.mapper.stringifier import PREC_NONE

    identifier = expr.function

    # Not every function identifier has a ``name`` (non-Variable identifiers
    # are handled further down), so do not assume the attribute exists.
    fn_name = getattr(identifier, "name", None)

    # {{{ implement indexof, indexof_vec

    if fn_name in ["indexof", "indexof_vec"]:
        if len(expr.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % fn_name)
        arg, = expr.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError(
                    "argument to %s must be a subscript" % fn_name)

        ary = self.find_array(arg)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % fn_name)

        if fn_name == "indexof":
            return access_info.subscripts[0]
        elif fn_name == "indexof_vec":
            from loopy.kernel.array import VectorArrayDimTag
            ivec = None
            # the last vector-tagged axis wins
            for iaxis, dim_tag in enumerate(ary.dim_tags):
                if isinstance(dim_tag, VectorArrayDimTag):
                    ivec = iaxis

            if ivec is None:
                return access_info.subscripts[0]
            else:
                return (
                    access_info.subscripts[0]*ary.shape[ivec]
                    + access_info.vector_index)

        else:
            raise RuntimeError("should not get here")

    # }}}

    if isinstance(identifier, Variable):
        identifier = identifier.name

    par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

    mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes,
            ast_builder=self.codegen_state.ast_builder)

    if mangle_result is None:
        raise RuntimeError("function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % identifier)

    if len(mangle_result.result_dtypes) != 1:
        raise LoopyError("functions with more or fewer than one return value "
                "may not be used in an expression")

    if mangle_result.arg_dtypes is not None:
        # the mangler declared argument types: generate each parameter in
        # the context of, and passing along, its declared type
        str_parameters = [
                self.rec(par, PREC_NONE,
                    dtype_to_type_context(self.kernel.target, tgt_dtype),
                    tgt_dtype)
                for par, par_dtype, tgt_dtype in zip(
                    expr.parameters, par_dtypes, mangle_result.arg_dtypes)]

    else:
        # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
        # propagate the type context here. But for many others, it does
        # not. Using the inferred type as a stopgap for now.
        str_parameters = [
                self.rec(par, PREC_NONE,
                    type_context=dtype_to_type_context(
                        self.kernel.target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes)]

        from warnings import warn
        warn("Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes"
                % identifier, LoopyWarning)

    from loopy.codegen import SeenFunction
    self.codegen_state.seen_functions.add(
            SeenFunction(identifier,
                mangle_result.target_name,
                mangle_result.arg_dtypes or par_dtypes))

    return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
def precompute(kernel, subst_use, sweep_inames=[], within=None,
        storage_axes=None, temporary_name=None, precompute_inames=None,
        precompute_outer_inames=None,
        storage_axis_to_tag={},

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,

        dtype=None,
        fetch_bounding_box=False,
        temporary_address_space=None,
        compute_insn_id=None,
        **kwargs):
    """Precompute the expression described in the substitution rule determined
    by *subst_use* and store it in a temporary array. A precomputation needs
    two things to operate, a list of *sweep_inames* (order irrelevant) and an
    ordered list of *storage_axes* (whose order will describe the axis
    ordering of the temporary array).

    :arg subst_use: Describes what to prefetch.

        The following objects may be given for *subst_use*:

        * The name of the substitution rule.

        * The tagged name ("name$tag") of the substitution rule.

        * A list of invocations of the substitution rule.
          This list of invocations, when swept across *sweep_inames*, then
          serves to define the footprint of the precomputation.

          Invocations may be tagged ("name$tag") to filter out a subset of
          the usage sites of the substitution rule. (Namely those usage
          sites that use the same tagged name.)

          Invocations may be given as a string or as a
          :class:`pymbolic.primitives.Expression` object.

          If only one invocation is to be given, then the only entry of the
          list may be given directly.

    If the list of invocations generating the footprint is not given,
    all (tag-matching, if desired) usage sites of the substitution rule
    are used to determine the footprint.

    The following cases can arise for each sweep axis:

    * The axis is an iname that occurs within arguments specified at
      usage sites of the substitution rule. This case is assumed covered
      by the storage axes provided for the argument.

    * The axis is an iname that occurs within the *value* of the rule, but
      not within its arguments. A new, dedicated storage axis is allocated
      for such an axis.

    :arg sweep_inames: A :class:`list` of inames to be swept.
        May also equivalently be a comma-separated string.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg storage_axes: A :class:`list` of inames and/or rule argument
        names/indices to be used as storage axes.
        May also equivalently be a comma-separated string.
    :arg temporary_name:
        The temporary variable name to use for storing the precomputed data.
        If it does not exist, it will be created. If it does exist, its
        properties (such as size, type) are checked (and updated, if
        possible) to match its use.
    :arg precompute_inames:
        A tuple of inames to be used to carry out the precomputation.
        If the specified inames do not already exist, they will be
        created. If they do already exist, their loop domain is verified
        against the one required for this precomputation. This tuple may
        be shorter than the (provided or automatically found) *storage_axes*
        tuple, in which case names will be automatically created.
        May also equivalently be a comma-separated string.
    :arg precompute_outer_inames: A :class:`frozenset` of inames within which
        the compute instruction is nested. If *None*, make an educated guess.
        May also be specified as a comma-separated string.
    :arg storage_axis_to_tag: A mapping that is consulted (by storage axis,
        or by precompute iname where one was given) for the tag of each
        newly created iname; *default_tag* is used for axes not found in it.
    :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
        inames created to perform the precomputation. The current default
        will make them local axes and automatically split them to fit the
        work group size, but this default will disappear in favor of simply
        leaving them untagged in 2019. For 2018, a warning will be issued if
        no *default_tag* is specified.
    :arg dtype: If not *None*, converted with :class:`numpy.dtype` and used
        as the data type of the temporary. If *None* and the temporary
        already exists with a known type, that type is kept.
    :arg fetch_bounding_box: Passed on as *boxify_sweep* when the loop domain
        is augmented with the sweep.
    :arg temporary_address_space: The address space of the temporary. If it
        is ``AddressSpace.GLOBAL``, a global barrier instruction depending on
        the compute instruction is added as well. The deprecated keyword
        argument *temporary_scope* is accepted as a synonym.
    :arg compute_insn_id: The ID of the instruction generated to perform the
        precomputation.

    If `storage_axes` is not specified, it defaults to the arrangement
    `<direct sweep axes><arguments>` with the direct sweep axes being the
    slower-varying indices.

    Trivial storage axes (i.e. axes of length 1 with respect to the sweep)
    are eliminated.

    :returns: the transformed kernel.
    """

    # {{{ unify temporary_address_space / temporary_scope

    temporary_scope = kwargs.pop("temporary_scope", None)

    from loopy.kernel.data import AddressSpace
    if temporary_scope is not None:
        from warnings import warn
        warn("temporary_scope is deprecated. Use temporary_address_space instead",
                DeprecationWarning, stacklevel=2)

        if temporary_address_space is not None:
            raise LoopyError(
                    "may not specify both temporary_address_space and "
                    "temporary_scope")

        temporary_address_space = temporary_scope

    del temporary_scope

    # }}}

    if kwargs:
        raise TypeError("unrecognized keyword arguments: %s"
                % ", ".join(kwargs.keys()))

    # {{{ check, standardize arguments

    if isinstance(sweep_inames, str):
        sweep_inames = [iname.strip() for iname in sweep_inames.split(",")]

    for iname in sweep_inames:
        if iname not in kernel.all_inames():
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    # Work on a copy: the (shared, mutable) default must never be modified.
    sweep_inames = list(sweep_inames)
    sweep_inames_set = frozenset(sweep_inames)

    if isinstance(storage_axes, str):
        storage_axes = [ax.strip() for ax in storage_axes.split(",")]

    if isinstance(precompute_inames, str):
        precompute_inames = [
                iname.strip() for iname in precompute_inames.split(",")]

    if isinstance(precompute_outer_inames, str):
        precompute_outer_inames = frozenset(
                iname.strip() for iname in precompute_outer_inames.split(","))

    if isinstance(subst_use, str):
        subst_use = [subst_use]

    footprint_generators = None

    subst_name = None
    subst_tag = None

    from pymbolic.primitives import Variable, Call
    from loopy.symbolic import parse, TaggedVariable

    # Every entry of subst_use has to refer to the same (name, tag) pair.
    # Entries that are calls additionally serve as footprint generators.
    for use in subst_use:
        if isinstance(use, str):
            use = parse(use)

        if isinstance(use, Call):
            if footprint_generators is None:
                footprint_generators = []

            footprint_generators.append(use)
            subst_name_as_expr = use.function
        else:
            subst_name_as_expr = use

        if isinstance(subst_name_as_expr, TaggedVariable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = subst_name_as_expr.tag
        elif isinstance(subst_name_as_expr, Variable):
            new_subst_name = subst_name_as_expr.name
            new_subst_tag = None
        else:
            raise ValueError("unexpected type of subst_name")

        if (subst_name, subst_tag) == (None, None):
            subst_name, subst_tag = new_subst_name, new_subst_tag
        else:
            if (subst_name, subst_tag) != (new_subst_name, new_subst_tag):
                raise ValueError("not all uses in subst_use agree "
                        "on rule name and tag")

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    try:
        subst = kernel.substitutions[subst_name]
    except KeyError:
        raise LoopyError("substitution rule '%s' not found"
                % subst_name)

    c_subst_name = subst_name.replace(".", "_")

    # {{{ handle default_tag

    from loopy.transform.data import _not_provided \
            as transform_data_not_provided

    if default_tag is _not_provided or default_tag is transform_data_not_provided:
        # no need to warn for scalar precomputes
        if sweep_inames:
            from warnings import warn
            warn(
                    "Not specifying default_tag is deprecated, and default_tag "
                    "will become mandatory in 2019.x. "
                    "Pass 'default_tag=\"l.auto\" to match the current default, "
                    "or Pass 'default_tag=None to leave the loops untagged, which "
                    "is the recommended behavior.",
                    DeprecationWarning, stacklevel=(
                        # In this case, we came here through add_prefetch.
                        # Increase the stacklevel.
                        3 if default_tag is transform_data_not_provided
                        else 2))

        default_tag = "l.auto"

    from loopy.kernel.data import parse_tag
    default_tag = parse_tag(default_tag)

    # }}}

    # }}}

    # {{{ process invocations in footprint generators, start access_descriptors

    if footprint_generators:
        from pymbolic.primitives import Variable, Call

        access_descriptors = []

        for fpg in footprint_generators:
            if isinstance(fpg, Variable):
                args = ()
            elif isinstance(fpg, Call):
                args = fpg.parameters
            else:
                raise ValueError("footprint generator must "
                        "be substitution rule invocation")

            access_descriptors.append(
                    RuleAccessDescriptor(
                        identifier=access_descriptor_id(args, None),
                        args=args))

    # }}}

    # {{{ gather up invocations in kernel code, finish access_descriptors

    if not footprint_generators:
        rule_mapping_context = SubstitutionRuleMappingContext(
                kernel.substitutions, kernel.get_var_name_generator())
        invg = RuleInvocationGatherer(
                rule_mapping_context, kernel, subst_name, subst_tag, within)
        del rule_mapping_context

        import loopy as lp
        for insn in kernel.instructions:
            if isinstance(insn, lp.MultiAssignmentBase):
                for assignee in insn.assignees:
                    invg(assignee, kernel, insn)
                invg(insn.expression, kernel, insn)

        access_descriptors = invg.access_descriptors
        if not access_descriptors:
            raise RuntimeError("no invocations of '%s' found" % subst_name)

    # }}}

    # {{{ find inames used in arguments

    expanding_usage_arg_deps = set()

    for accdesc in access_descriptors:
        for arg in accdesc.args:
            expanding_usage_arg_deps.update(
                    get_dependencies(arg) & kernel.all_inames())

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    # {{{ use given / find new storage_axes

    # extra axes made necessary because they don't occur in the arguments
    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)

    from loopy.symbolic import SubstitutionRuleExpander
    submap = SubstitutionRuleExpander(kernel.substitutions)

    value_inames = (
            get_dependencies(submap(subst.expression))
            - frozenset(subst.arguments)
            ) & kernel.all_inames()
    if value_inames - expanding_usage_arg_deps < extra_storage_axes:
        raise RuntimeError("unreferenced sweep inames specified: "
                + ", ".join(
                    extra_storage_axes
                    - value_inames
                    - expanding_usage_arg_deps))

    new_iname_to_tag = {}

    if storage_axes is None:
        storage_axes = []

        # Add sweep_inames (in given--rather than arbitrary--order) to
        # storage_axes *if* they are part of extra_storage_axes.
        for iname in sweep_inames:
            if iname in extra_storage_axes:
                extra_storage_axes.remove(iname)
                storage_axes.append(iname)

        if extra_storage_axes:
            if (precompute_inames is not None
                    and len(storage_axes) < len(precompute_inames)):
                raise LoopyError(
                        "must specify a sufficient number of "
                        "storage_axes to uniquely determine the meaning "
                        "of the given precompute_inames. (%d storage_axes "
                        "needed)" % len(precompute_inames))

            storage_axes.extend(sorted(extra_storage_axes))

        storage_axes.extend(range(len(subst.arguments)))

    del extra_storage_axes

    prior_storage_axis_name_dict = {}

    storage_axis_names = []
    storage_axis_sources = []  # number for arg#, or iname

    # {{{ check for pre-existing precompute_inames

    if precompute_inames is not None:
        preexisting_precompute_inames = (
                set(precompute_inames) & kernel.all_inames())
    else:
        preexisting_precompute_inames = set()

    # }}}

    for i, saxis in enumerate(storage_axes):
        tag_lookup_saxis = saxis

        # Arguments may be given by name; normalize them to their index.
        if saxis in subst.arguments:
            saxis = subst.arguments.index(saxis)

        storage_axis_sources.append(saxis)

        if isinstance(saxis, int):
            # argument index
            name = old_name = subst.arguments[saxis]
        else:
            old_name = saxis
            name = "%s_%s" % (c_subst_name, old_name)

        if (precompute_inames is not None
                and i < len(precompute_inames)
                and precompute_inames[i]):
            # A user-provided name takes precedence over the generated one.
            name = precompute_inames[i]
            tag_lookup_saxis = name
            if (name not in preexisting_precompute_inames
                    and var_name_gen.is_name_conflicting(name)):
                raise RuntimeError("new storage axis name '%s' "
                        "conflicts with existing name" % name)
        else:
            name = var_name_gen(name)

        storage_axis_names.append(name)
        if name not in preexisting_precompute_inames:
            new_iname_to_tag[name] = storage_axis_to_tag.get(
                    tag_lookup_saxis, default_tag)

        prior_storage_axis_name_dict[name] = old_name

    del storage_axis_to_tag
    del storage_axes
    del precompute_inames

    # }}}

    # {{{ fill out access_descriptors[...].storage_axis_exprs

    access_descriptors = [
            accdesc.copy(
                storage_axis_exprs=storage_axis_exprs(
                    storage_axis_sources, accdesc.args))
            for accdesc in access_descriptors]

    # }}}

    expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps)
    assert expanding_inames <= kernel.all_inames()

    if storage_axis_names:
        # {{{ find domain to be changed

        change_inames = expanding_inames | preexisting_precompute_inames

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, change_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will
            # end up being scheduled *within* loops over these parents.

            for iname in sweep_inames_set:
                if (kernel.get_home_domain_index(iname)
                        != domch.leaf_domain_index):
                    raise RuntimeError(
                            "sweep iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
                access_descriptors, len(storage_axis_names))

        non1_storage_axis_names = []
        for i, saxis in enumerate(storage_axis_names):
            if abm.non1_storage_axis_flags[i]:
                non1_storage_axis_names.append(saxis)
            else:
                # This axis was eliminated, so no iname is created for it.
                del new_iname_to_tag[saxis]

                if saxis in preexisting_precompute_inames:
                    raise LoopyError(
                            "precompute axis %d (1-based) was "
                            "eliminated as "
                            "having length 1 but also mapped to existing "
                            "iname '%s'" % (i + 1, saxis))

        mod_domain = domch.domain

        # {{{ modify the domain, taking into account preexisting inames

        # inames may already exist in mod_domain, add them primed to start
        primed_non1_saxis_names = [
                iname + "'" for iname in non1_storage_axis_names]

        mod_domain = abm.augment_domain_with_sweep(
                domch.domain, primed_non1_saxis_names,
                boxify_sweep=fetch_bounding_box)

        check_domain = mod_domain

        for i, saxis in enumerate(non1_storage_axis_names):
            var_dict = mod_domain.get_var_dict(isl.dim_type.set)

            if saxis in preexisting_precompute_inames:
                # add equality constraint between existing and new variable

                dt, dim_idx = var_dict[saxis]
                saxis_aff = isl.Aff.var_on_domain(mod_domain.space, dt, dim_idx)

                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                new_var_aff = isl.Aff.var_on_domain(
                        mod_domain.space, dt, dim_idx)

                mod_domain = mod_domain.add_constraint(
                        isl.Constraint.equality_from_aff(
                            new_var_aff - saxis_aff))

                # project out the new one
                mod_domain = mod_domain.project_out(dt, dim_idx, 1)

            else:
                # remove the prime from the new variable
                dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)

        def add_assumptions(d):
            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
            assumptions, domain = isl.align_two(assumption_non_param, d)
            return assumptions & domain

        # {{{ check that we got the desired domain

        check_domain = add_assumptions(
                check_domain.project_out_except(
                    primed_non1_saxis_names, [isl.dim_type.set]))

        mod_check_domain = add_assumptions(mod_domain)

        # re-add the prime from the new variable
        var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)

        for saxis in non1_storage_axis_names:
            dt, dim_idx = var_dict[saxis]
            mod_check_domain = mod_check_domain.set_dim_name(
                    dt, dim_idx, saxis + "'")

        mod_check_domain = mod_check_domain.project_out_except(
                primed_non1_saxis_names, [isl.dim_type.set])

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError("domain of preexisting inames does not match "
                    "domain needed for precompute")

        # }}}

        # {{{ check that we didn't shrink the original domain

        # project out the new names from the modified domain
        orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
        mod_check_domain = add_assumptions(
                mod_domain.project_out_except(
                    orig_domain_inames, [isl.dim_type.set]))

        check_domain = add_assumptions(domch.domain)

        mod_check_domain, check_domain = isl.align_two(
                mod_check_domain, check_domain)

        # The modified domain can't get bigger by adding constraints
        assert mod_check_domain <= check_domain

        if not check_domain <= mod_check_domain:
            print(check_domain)
            print(mod_check_domain)
            raise LoopyError(
                    "original domain got shrunk by applying the precompute")

        # }}}

        # }}}

        new_kernel_domains = domch.get_domains_with(mod_domain)

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        non1_storage_axis_names = []
        abm = NoOpArrayToBufferMap()

    kernel = kernel.copy(domains=new_kernel_domains)

    # {{{ set up compute insn

    if temporary_name is None:
        temporary_name = var_name_gen(based_on=c_subst_name)

    assignee = var(temporary_name)

    if non1_storage_axis_names:
        assignee = assignee[
                tuple(var(iname) for iname in non1_storage_axis_names)]

    # {{{ process substitutions on compute instruction

    storage_axis_subst_dict = {}

    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
        if arg_name in non1_storage_axis_names:
            arg = var(arg_name)
        else:
            # eliminated (length-1) axis: only the base index remains
            arg = 0

        storage_axis_subst_dict[
                prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg + bi

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())

    from loopy.match import parse_stack_match
    expr_subst_map = RuleAwareSubstitutionMapper(
            rule_mapping_context,
            make_subst_func(storage_axis_subst_dict),
            within=parse_stack_match(None))

    compute_expression = expr_subst_map(subst.expression, kernel, None)

    # }}}

    from loopy.kernel.data import Assignment
    if compute_insn_id is None:
        compute_insn_id = kernel.make_unique_instruction_id(
                based_on=c_subst_name)

    compute_insn = Assignment(
            id=compute_insn_id,
            assignee=assignee,
            expression=compute_expression,
            # within_inames determined below
            )
    compute_dep_id = compute_insn_id
    added_compute_insns = [compute_insn]

    if temporary_address_space == AddressSpace.GLOBAL:
        # Users of the temporary are made to depend on a global barrier
        # (which in turn depends on the compute instruction).
        barrier_insn_id = kernel.make_unique_instruction_id(
                based_on=c_subst_name + "_barrier")
        from loopy.kernel.instruction import BarrierInstruction
        barrier_insn = BarrierInstruction(
                id=barrier_insn_id,
                depends_on=frozenset([compute_insn_id]),
                synchronization_kind="global",
                mem_kind="global")
        compute_dep_id = barrier_insn_id

        added_compute_insns.append(barrier_insn)

    # }}}

    # {{{ substitute rule into expressions in kernel (if within footprint)

    from loopy.symbolic import SubstitutionRuleExpander
    expander = SubstitutionRuleExpander(kernel.substitutions)

    invr = RuleInvocationReplacer(rule_mapping_context,
            subst_name, subst_tag, within,
            access_descriptors, abm,
            storage_axis_names, storage_axis_sources,
            non1_storage_axis_names,
            temporary_name, compute_insn_id, compute_dep_id,
            compute_read_variables=get_dependencies(
                expander(compute_expression)))

    kernel = invr.map_kernel(kernel)
    kernel = kernel.copy(
            instructions=added_compute_insns + kernel.instructions)
    kernel = rule_mapping_context.finish_kernel(kernel)

    # }}}

    # {{{ add dependencies to compute insn

    kernel = kernel.copy(
            instructions=[
                insn.copy(depends_on=frozenset(invr.compute_insn_depends_on))
                if insn.id == compute_insn_id
                else insn
                for insn in kernel.instructions])

    # }}}

    # {{{ propagate storage iname subst to dependencies of compute instructions

    from loopy.kernel.tools import find_recursive_dependencies
    compute_deps = find_recursive_dependencies(
            kernel, frozenset([compute_insn_id]))

    # FIXME: Need to verify that there are no outside dependencies
    # on compute_deps

    prior_storage_axis_names = frozenset(storage_axis_subst_dict)

    new_insns = []
    for insn in kernel.instructions:
        if (insn.id in compute_deps
                and insn.within_inames & prior_storage_axis_names):
            # The lambda is called right away, so it sees the current insn.
            insn = (insn
                    .with_transformed_expressions(
                        lambda expr: expr_subst_map(expr, kernel, insn))
                    .copy(within_inames=frozenset(
                        storage_axis_subst_dict.get(iname, var(iname)).name
                        for iname in insn.within_inames)))

            new_insns.append(insn)
        else:
            new_insns.append(insn)

    kernel = kernel.copy(instructions=new_insns)

    # }}}

    # {{{ determine inames for compute insn

    if precompute_outer_inames is None:
        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
        precompute_outer_inames = (
                frozenset(non1_storage_axis_names)
                | frozenset(
                    (expanding_usage_arg_deps | value_inames)
                    - sweep_inames_set)
                | guess_iname_deps_based_on_var_use(kernel, compute_insn))
    else:
        if not isinstance(precompute_outer_inames, frozenset):
            raise TypeError("precompute_outer_inames must be a frozenset")

        precompute_outer_inames = precompute_outer_inames \
                | frozenset(non1_storage_axis_names)

    kernel = kernel.copy(
            instructions=[
                insn.copy(within_inames=precompute_outer_inames)
                if insn.id == compute_insn_id
                else insn
                for insn in kernel.instructions])

    # }}}

    # {{{ set up temp variable

    import loopy as lp
    if dtype is not None:
        dtype = np.dtype(dtype)

    if temporary_address_space is None:
        temporary_address_space = lp.auto

    new_temp_shape = tuple(abm.non1_storage_shape)

    new_temporary_variables = kernel.temporary_variables.copy()
    if temporary_name not in new_temporary_variables:
        temp_var = lp.TemporaryVariable(
                name=temporary_name,
                dtype=dtype,
                base_indices=(0,) * len(new_temp_shape),
                shape=tuple(abm.non1_storage_shape),
                address_space=temporary_address_space,
                dim_names=tuple(non1_storage_axis_names))

    else:
        temp_var = new_temporary_variables[temporary_name]

        # {{{ check and adapt existing temporary

        # At this point *dtype* is either None or a numpy dtype, and a
        # temporary created above with the default arguments carries
        # dtype=None. Hence None has to count as "not specified", just like
        # lp.auto; otherwise reusing a temporary without passing *dtype*
        # would be reported as a type mismatch.
        existing_dtype_unspecified = (
                temp_var.dtype is None or temp_var.dtype is lp.auto)
        new_dtype_unspecified = dtype is None or dtype is lp.auto

        if existing_dtype_unspecified:
            pass
        elif new_dtype_unspecified:
            dtype = temp_var.dtype
        elif temp_var.dtype != dtype:
            raise LoopyError("Existing and new dtype of temporary '%s' "
                    "do not match (existing: %s, new: %s)"
                    % (temporary_name, temp_var.dtype, dtype))

        temp_var = temp_var.copy(dtype=dtype)

        if len(temp_var.shape) != len(new_temp_shape):
            raise LoopyError(
                    "Existing and new temporary '%s' do not "
                    "have matching number of dimensions ('%d' vs. '%d') "
                    % (temporary_name,
                        len(temp_var.shape), len(new_temp_shape)))

        expected_base_indices = (0,) * len(new_temp_shape)
        if temp_var.base_indices != expected_base_indices:
            raise LoopyError(
                    "Existing and new temporary '%s' do not "
                    "have matching base indices (existing: %s, new: %s)"
                    % (temporary_name,
                        temp_var.base_indices, expected_base_indices))

        # Grow the existing temporary if this use needs more room.
        new_temp_shape = tuple(
                max(i, ex_i)
                for i, ex_i in zip(new_temp_shape, temp_var.shape))

        temp_var = temp_var.copy(shape=new_temp_shape)

        if temporary_address_space == temp_var.address_space:
            pass
        elif temporary_address_space is lp.auto:
            temporary_address_space = temp_var.address_space
        elif temp_var.address_space is lp.auto:
            pass
        else:
            raise LoopyError("Existing and new temporary '%s' do not "
                    "have matching scopes (existing: %s, new: %s)"
                    % (temporary_name,
                        AddressSpace.stringify(temp_var.address_space),
                        AddressSpace.stringify(temporary_address_space)))

        temp_var = temp_var.copy(address_space=temporary_address_space)

        # }}}

    new_temporary_variables[temporary_name] = temp_var

    kernel = kernel.copy(
            temporary_variables=new_temporary_variables)

    # }}}

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type

    if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag):
        from loopy.kernel.tools import assign_automatic_axes
        kernel = assign_automatic_axes(kernel)

    return kernel
def emit_multiple_assignment(self, codegen_state, insn):
    """Generate code for *insn*, an assignment of a function call that may
    have several (or no) left-hand sides.

    The first result is assigned to the first assignee. Every further
    assignee is handed to the function as an additional trailing argument,
    wrapped in a call to the pseudo-function ``&``.

    :arg codegen_state: supplies the kernel, the expression-to-code mapper
        and the set of seen functions (which is updated).
    :arg insn: the instruction; its ``expression`` must be a call.
    :returns: a ``cgen`` ``ExpressionStatement`` if the function has no
        results, otherwise a ``cgen`` ``Assign``.
    :raises RuntimeError: if no function mangler knows the function.
    :raises LoopyError: if the type of one of the additional left-hand
        sides does not match the corresponding result type.
    """
    ecm = codegen_state.expression_to_code_mapper

    from pymbolic import var
    from pymbolic.primitives import Variable
    from pymbolic.mapper.stringifier import PREC_NONE

    func_id = insn.expression.function
    parameters = insn.expression.parameters

    if isinstance(func_id, Variable):
        func_id = func_id.name

    assignee_var_descriptors = [
            codegen_state.kernel.get_var_descriptor(a)
            for a in insn.assignee_var_names()]

    par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

    mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
    if mangle_result is None:
        raise RuntimeError(
                "function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % func_id)

    assert mangle_result.arg_dtypes is not None

    from loopy.expression import dtype_to_type_context
    c_parameters = [
            ecm(par, PREC_NONE,
                dtype_to_type_context(self.target, tgt_dtype),
                tgt_dtype).expr
            for par, tgt_dtype in zip(parameters, mangle_result.arg_dtypes)]

    from loopy.codegen import SeenFunction
    codegen_state.seen_functions.add(
            SeenFunction(func_id,
                mangle_result.target_name,
                mangle_result.arg_dtypes))

    # The loop skips the first assignee, so the first one seen here is the
    # second left-hand side. Count from 2 so that the error message reports
    # the true 1-based position.
    for lhs_number, (a, tgt_dtype) in enumerate(
            zip(insn.assignees[1:], mangle_result.result_dtypes[1:]),
            start=2):
        if tgt_dtype != ecm.infer_type(a):
            raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                    "side of instruction '%s'" % (lhs_number, insn.id))
        c_parameters.append(
                # TODO Yuck: The "where-at function": &(...)
                var("&")(
                    ecm(a, PREC_NONE,
                        dtype_to_type_context(self.target, tgt_dtype),
                        tgt_dtype).expr))

    result = var(mangle_result.target_name)(*c_parameters)

    # In case of no assignees, we are done
    if len(mangle_result.result_dtypes) == 0:
        from cgen import ExpressionStatement
        return ExpressionStatement(
                CExpression(self.get_c_expression_to_code_mapper(), result))

    result = ecm.wrap_in_typecast(
            mangle_result.result_dtypes[0],
            assignee_var_descriptors[0].dtype,
            result)

    lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

    from cgen import Assign
    return Assign(
            lhs_code,
            CExpression(self.get_c_expression_to_code_mapper(), result))
def get_access_info(target, ary, index, eval_expr, vectorization_info):
    """Turn a subscript into *ary* into an :class:`AccessInfo`, i.e. the
    name of the array actually accessed, one subscript per target axis and,
    possibly, a vector index.

    :arg target: passed to ``ary.vector_size`` to find the vector width
    :arg ary: an object of type :class:`ArrayBase`
    :arg index: a tuple of indices representing a subscript into ary
        (a single index that is not a tuple is accepted as well)
    :arg eval_expr: a callable used to evaluate those indices that have to
        be integer constants
    :arg vectorization_info: an instance of
        :class:`loopy.codegen.VectorizationInfo`, or *None*.
    """

    import loopy as lp
    from pymbolic import var
    from pymbolic.primitives import Variable
    from pymbolic.mapper.evaluator import UnknownVariableError

    def require_integer_constant(axis, index_expr):
        # Evaluate *index_expr* and insist on an integer result.
        try:
            value = eval_expr(index_expr)
        except UnknownVariableError as e:
            raise LoopyError(
                    "When trying to index the array '%s' along axis "
                    "%d (tagged '%s'), the index was not a compile-time "
                    "constant (but it has to be in order for code to be "
                    "generated). You likely want to unroll the iname(s) '%s'."
                    % (ary.name, axis, ary.dim_tags[axis], str(e)))

        if is_integer(value):
            return value

        raise LoopyError("subscript '%s[%s]' has non-constant "
                "index for separate-array axis %d (0-based)" % (
                    ary.name, index, axis))

    def with_offset(subscript):
        # Uses whatever array_name is bound to at the time of the call.
        offset = ary.offset
        if not offset:
            return subscript

        if offset is lp.auto:
            return var(array_name + "_offset") + subscript

        if isinstance(offset, str):
            return var(offset) + subscript

        # assume it's an expression
        return offset + subscript

    if not isinstance(index, tuple):
        index = (index,)

    array_name = ary.name
    dim_tags = ary.dim_tags

    if dim_tags is None:
        if len(index) != 1:
            raise LoopyError(
                    "Array '%s' has no known axis implementation "
                    "tags and therefore only supports one-dimensional "
                    "indexing. (Did you mean 'shape=loopy.auto' instead of "
                    "'shape=None'?)"
                    % ary.name)

        return AccessInfo(
                array_name=array_name,
                subscripts=(with_offset(index[0]),),
                vector_index=None)

    if len(dim_tags) != len(index):
        raise LoopyError("subscript to '%s[%s]' has the wrong "
                "number of indices (got: %d, expected: %d)" % (
                    ary.name, index, len(index), len(dim_tags)))

    num_target_axes = ary.num_target_axes()
    vector_size = ary.vector_size(target)

    subscripts = [0] * num_target_axes
    vector_index = None

    # {{{ process separate-array dim tags first, to find array name

    for axis, (idx, dim_tag) in enumerate(zip(index, dim_tags)):
        if isinstance(dim_tag, SeparateArrayArrayDimTag):
            array_name += "_s%d" % require_integer_constant(axis, idx)

    # }}}

    # {{{ process remaining dim tags

    for axis, (idx, dim_tag) in enumerate(zip(index, dim_tags)):
        if isinstance(dim_tag, FixedStrideArrayDimTag):
            stride = dim_tag.stride

            if is_integer(stride):
                if stride % vector_size != 0:
                    raise LoopyError(
                            "array '%s' has axis %d stride of "
                            "%d, which is not divisible by the size of the "
                            "vector (%d)"
                            % (ary.name, axis, stride, vector_size))

            elif stride is lp.auto:
                stride = var(array_name + "_stride%d" % axis)

            subscripts[dim_tag.target_axis] += (stride // vector_size) * idx

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            # already accounted for in array_name
            pass

        elif isinstance(dim_tag, VectorArrayDimTag):
            indexed_by_vec_iname = (
                    vectorization_info is not None
                    and isinstance(idx, Variable)
                    and idx.name == vectorization_info.iname)

            # If the axis is indexed by the vectorization iname, we do
            # absolutely nothing, which will result in the vector being
            # returned.
            if not indexed_by_vec_iname:
                idx = require_integer_constant(axis, idx)

                assert vector_index is None
                vector_index = idx

        else:
            raise LoopyError("unsupported array dim implementation tag '%s' "
                    "in array '%s'" % (dim_tag, ary.name))

    # }}}

    if ary.offset:
        if num_target_axes > 1:
            raise NotImplementedError("offsets for multiple image axes")

        subscripts[0] = with_offset(subscripts[0])

    return AccessInfo(
            array_name=array_name,
            vector_index=vector_index,
            subscripts=subscripts)
def privatize_temporaries_with_inames(
        kernel, privatizing_inames, only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: an iterable of iname names, or a
        comma-separated string of them.
    :arg only_var_names: *None* (consider all temporaries), a container of
        temporary variable names, or a comma-separated string of them.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if the writers of a temporary disagree on the inames
        to be added, or if an instruction touches a privatized temporary
        without being nested inside the corresponding iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
                s.strip()
                for s in privatizing_inames.split(","))
    else:
        # Also accept lists, tuples, ...: the set intersection with
        # 'within_inames' below requires a set type.
        privatizing_inames = frozenset(privatizing_inames)

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
                s.strip()
                for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            # Privatizing inames that the write already depends on do not
            # need a new axis.
            referenced_priv_axis_inames = (
                    priv_axis_inames
                    & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError(
                            "instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) '%s', "
                            "but previous instructions required inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(new_priv_axis_inames),
                                ", ".join(
                                    var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=False))
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        # The new axes are appended after the existing ones.
        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
                shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
            var_name: tuple(var(iname) for iname in inames)
            for var_name, inames in var_to_new_priv_axis_iname.items()}

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter per instruction, so that seen_priv_axis_inames
        # only reflects this instruction.
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                    "Kernel '%s': Instruction '%s': touched variable that "
                    "(for privatization, e.g. as performed for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). To remedy this, first "
                    "promote the instruction into the iname."
                    % (kernel.name, insn.id,
                        ", ".join(
                            eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
            temporary_variables=new_temp_vars,
            instructions=new_insns)
def _parse_array_dim_tag(tag, default_target_axis, nesting_levels):
    """Parse a single array dimension tag.

    :arg tag: an :class:`ArrayDimImplementationTag` instance (returned
        unchanged) or a string. Strings may end in ``?`` (optional tag),
        be ``sep`` or ``vec``, start with ``stride:``, or name ``c``/``f``
        order, possibly with the nesting-level, target-axis and padding
        decorations matched by the module-level regular expressions.
    :arg default_target_axis: target axis used when the tag names none.
    :arg nesting_levels: mapping from target axis to the list of layout
        nesting levels already in use; consulted to place ``c``/``f`` tags.
    :returns: a tuple ``(has_explicit_nesting_level, is_optional, dim_tag)``.
    :raises TypeError: if *tag* is neither a string nor a tag object.
    :raises LoopyError: if the tag string is invalid.
    """
    if isinstance(tag, ArrayDimImplementationTag):
        return False, False, tag

    if not isinstance(tag, str):
        raise TypeError("arg dimension implementation tag must be "
                "string or tag object")

    tag = tag.strip()
    is_optional = False
    if tag.endswith("?"):
        tag = tag[:-1]
        is_optional = True

    # Kept for error messages; 'tag' is progressively consumed below.
    orig_tag = tag

    if tag == "sep":
        return False, is_optional, SeparateArrayArrayDimTag()
    elif tag == "vec":
        return False, is_optional, VectorArrayDimTag()

    nesting_level_match = NESTING_LEVEL_RE.match(tag)

    if nesting_level_match is not None:
        nesting_level = int(nesting_level_match.group(1))
        tag = nesting_level_match.group(2)
        if tag is None:
            tag = ""
    else:
        nesting_level = None

    has_explicit_nesting_level = nesting_level is not None

    target_axis_match = TARGET_AXIS_RE.search(tag)

    if target_axis_match is not None:
        target_axis = int(target_axis_match.group(1))
        tag = tag[:target_axis_match.start()]
    else:
        target_axis = default_target_axis

    ta_nesting_levels = nesting_levels.get(target_axis, [])

    if tag.startswith("stride:"):
        fixed_stride_descr = tag[7:]
        if fixed_stride_descr.strip() == "auto":
            import loopy as lp
            return (
                    has_explicit_nesting_level, is_optional,
                    FixedStrideArrayDimTag(
                        lp.auto, target_axis,
                        layout_nesting_level=nesting_level))
        else:
            from loopy.symbolic import parse
            return (
                    has_explicit_nesting_level, is_optional,
                    FixedStrideArrayDimTag(
                        parse(fixed_stride_descr), target_axis,
                        layout_nesting_level=nesting_level))

    else:
        padded_stride_match = PADDED_STRIDE_TAG_RE.match(tag)
        if padded_stride_match is not None:
            tag = padded_stride_match.group(1)

            from loopy.symbolic import parse
            pad_to = parse(padded_stride_match.group(2))
        else:
            pad_to = None

        if tag in ["c", "C"]:
            if nesting_level is not None:
                raise LoopyError(
                        "may not specify 'C' array order with explicit "
                        "layout nesting level")

            # Place this axis below every nesting level used so far.
            if ta_nesting_levels:
                nesting_level = min(ta_nesting_levels) - 1
            else:
                nesting_level = 0

        elif tag in ["f", "F"]:
            if nesting_level is not None:
                raise LoopyError(
                        "may not specify 'F' array order with explicit "
                        "layout nesting level")

            # Place this axis above every nesting level used so far.
            if ta_nesting_levels:
                nesting_level = max(ta_nesting_levels) + 1
            else:
                nesting_level = 0

        elif tag == "":
            # A bare tag is only meaningful with an explicit nesting level.
            if nesting_level is None:
                raise LoopyError("invalid dim tag: '%s'" % orig_tag)

        else:
            raise LoopyError("invalid dim tag: '%s'" % orig_tag)

        return (
                has_explicit_nesting_level, is_optional,
                ComputedStrideArrayDimTag(
                    nesting_level, pad_to=pad_to, target_axis=target_axis))
def __init__(self, id, depends_on, depends_on_is_final,
        groups, conflicts_with_groups,
        no_sync_with,
        within_inames_is_final, within_inames,
        priority,
        boostable, boostable_into, predicates, tags,
        insn_deps=None, insn_deps_is_final=None,
        forced_iname_deps=None, forced_iname_deps_is_final=None):
    """Normalize the arguments and store them on the record.

    *insn_deps*/*insn_deps_is_final* and
    *forced_iname_deps*/*forced_iname_deps_is_final* are deprecated
    spellings of *depends_on*/*depends_on_is_final* and
    *within_inames*/*within_inames_is_final*. String predicates are parsed
    into expressions; a leading ``!`` (deprecated) is turned into a
    logical negation. *None* values for the set-valued arguments are
    replaced by empty frozensets, and *depends_on* given as a
    comma-separated string is split.

    :raises LoopyError: if both an argument and its deprecated spelling
        are given, or if *depends_on_is_final* is true while *depends_on*
        is not a frozenset.
    """

    # {{{ backwards compatibility goop

    if depends_on is not None and insn_deps is not None:
        raise LoopyError("may not specify both insn_deps and depends_on")
    elif insn_deps is not None:
        warn("insn_deps is deprecated, use depends_on",
                DeprecationWarning, stacklevel=2)

        depends_on = insn_deps
        depends_on_is_final = insn_deps_is_final

    if forced_iname_deps is not None and within_inames is not None:
        raise LoopyError("may not specify both forced_iname_deps "
                "and within_inames")
    elif forced_iname_deps is not None:
        warn("forced_iname_deps is deprecated, use within_inames",
                DeprecationWarning, stacklevel=2)

        within_inames = forced_iname_deps
        within_inames_is_final = forced_iname_deps_is_final

    if predicates is None:
        predicates = frozenset()

    new_predicates = set()
    for pred in predicates:
        if isinstance(pred, str):
            from pymbolic.primitives import LogicalNot
            from loopy.symbolic import parse
            if pred.startswith("!"):
                # Same category/stacklevel as the other deprecation
                # warnings above, so warning filters treat it alike.
                warn("predicates starting with '!' are deprecated. "
                        "Simply use 'not' instead",
                        DeprecationWarning, stacklevel=2)
                pred = LogicalNot(parse(pred[1:]))
            else:
                pred = parse(pred)

        new_predicates.add(pred)

    predicates = frozenset(new_predicates)
    del new_predicates

    # }}}

    if depends_on is None:
        depends_on = frozenset()

    if groups is None:
        groups = frozenset()

    if conflicts_with_groups is None:
        conflicts_with_groups = frozenset()

    if no_sync_with is None:
        no_sync_with = frozenset()

    if within_inames is None:
        within_inames = frozenset()

    if within_inames_is_final is None:
        within_inames_is_final = False

    if isinstance(depends_on, str):
        depends_on = frozenset(
                s.strip() for s in depends_on.split(",") if s.strip())

    if depends_on_is_final is None:
        depends_on_is_final = False

    if depends_on_is_final and not isinstance(depends_on, frozenset):
        raise LoopyError("Setting depends_on_is_final to True requires "
                "actually specifying depends_on")

    if tags is None:
        tags = frozenset()

    if priority is None:
        priority = 0

    if not isinstance(tags, frozenset):
        # was previously allowed to be tuple
        tags = frozenset(tags)

    # Periodically reenable these and run the tests to ensure all
    # performance-relevant identifiers are interned.
    #
    # from loopy.tools import is_interned
    # assert is_interned(id)
    # assert all(is_interned(dep) for dep in depends_on)
    # assert all(is_interned(grp) for grp in groups)
    # assert all(is_interned(grp) for grp in conflicts_with_groups)
    # assert all(is_interned(iname) for iname in within_inames)
    # assert all(is_interned(pred) for pred in predicates)

    assert isinstance(within_inames, frozenset)
    assert isinstance(depends_on, frozenset) or depends_on is None
    assert isinstance(groups, frozenset)
    assert isinstance(conflicts_with_groups, frozenset)

    ImmutableRecord.__init__(self,
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            no_sync_with=no_sync_with,
            groups=groups, conflicts_with_groups=conflicts_with_groups,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            priority=priority,
            boostable=boostable,
            boostable_into=boostable_into,
            predicates=predicates,
            tags=tags)
def parse_array_dim_tags(dim_tags, n_axes=None, use_increasing_target_axes=False,
        dim_names=None):
    """Parse a collection of array dimension tags into tag objects.

    :arg dim_tags: a comma-separated string, a sequence of tags, or a
        dictionary mapping entries of *dim_names* to tags.
    :arg n_axes: number of array axes if known; optional tags are dropped
        when keeping them would exceed this count.
    :arg use_increasing_target_axes: if true, the default target axis
        grows by one for every retained tag.
    :arg dim_names: axis names, required when *dim_tags* is a dictionary.
    :returns: a list of parsed tags whose layout nesting levels are
        shifted so that the lowest level per target axis is zero.
    :raises LoopyError: on unknown or missing axis names, C/F tags mixed
        with explicit nesting levels, or non-unique/non-contiguous
        nesting levels.
    """
    if isinstance(dim_tags, str):
        dim_tags = dim_tags.split(",")

    if isinstance(dim_tags, dict):
        if dim_names is None:
            raise LoopyError("dim_tags may only be given as a dictionary if "
                    "dim_names is available")

        assert n_axes == len(dim_names)

        # Re-order the dictionary entries into axis order.
        tags_by_axis = [None] * n_axes
        for axis_name, axis_tag in dim_tags.items():
            try:
                axis_idx = dim_names.index(axis_name)
            except ValueError:
                raise LoopyError("'%s' does not name an array axis" % axis_name)

            tags_by_axis[axis_idx] = axis_tag

        for axis_idx, axis_tag in enumerate(tags_by_axis):
            if axis_tag is None:
                raise LoopyError(
                        "array axis tag for axis %d (1-based) was not "
                        "set by passed dictionary" % (axis_idx + 1))

        dim_tags = tags_by_axis

    default_target_axis = 0
    result = []

    # a mapping from target axes to used nesting levels
    nesting_levels = {}

    # target axis -> whether its tags carry explicit nesting levels
    explicit_by_target_axis = {}

    n_tags = len(dim_tags)
    for iaxis, dim_tag in enumerate(dim_tags):
        has_explicit_nesting_level, is_optional, parsed_dim_tag = (
                _parse_array_dim_tag(
                    dim_tag, default_target_axis, nesting_levels))

        # Drop an optional tag if keeping it (plus everything that is
        # still to come) would give more tags than axes.
        if is_optional and n_axes is not None:
            if len(result) + (n_tags - iaxis) > n_axes:
                continue

        if isinstance(parsed_dim_tag, _StrideArrayDimTagBase):
            target_axis = parsed_dim_tag.target_axis

            # {{{ check for C/F mixed with explicit layout nesting level specs

            if target_axis not in explicit_by_target_axis:
                explicit_by_target_axis[target_axis] = has_explicit_nesting_level
            elif (has_explicit_nesting_level
                    != explicit_by_target_axis[target_axis]):
                raise LoopyError(
                        "may not mix C/F dim_tag specifications with "
                        "explicit specification of layout nesting levels")

            # }}}

            lnl = parsed_dim_tag.layout_nesting_level
            if lnl is not None:
                if lnl in nesting_levels.get(target_axis, []):
                    raise LoopyError("layout nesting level %d is not unique"
                            " in target axis %d" % (lnl, target_axis))

                nesting_levels.setdefault(target_axis, []).append(lnl)

        result.append(parsed_dim_tag)

        if use_increasing_target_axes:
            default_target_axis += 1

    # {{{ check contiguity of nesting levels

    for target_axis, ta_nesting_levels in nesting_levels.items():
        lowest = min(ta_nesting_levels)
        expected = list(range(lowest, lowest + len(ta_nesting_levels)))

        if sorted(ta_nesting_levels) != expected:
            raise LoopyError(
                    "layout nesting levels '%s' "
                    "for target axis %d not contiguous"
                    % (",".join(str(nl) for nl in ta_nesting_levels),
                        target_axis))

        # Shift the levels of this target axis so they start at zero.
        for idx, parsed in enumerate(result):
            if not isinstance(parsed, _StrideArrayDimTagBase):
                continue
            if parsed.target_axis != target_axis:
                continue
            if parsed.layout_nesting_level is None:
                continue

            result[idx] = parsed.copy(
                    layout_nesting_level=parsed.layout_nesting_level - lowest)

    # }}}

    return result
def map_subscript(self, expr):
    """Check that the subscript *expr* stays within the bounds of the
    array it indexes.

    Arrays without a known shape are skipped, as are accesses whose
    subscript or shape depend on variables not present in ``self.domain``
    and subscripts whose access range cannot be determined.

    :raises LoopyError: if the number of indices differs from the number
        of shape axes, or if the access range could not be shown to lie
        within the shape.
    """
    WalkMapper.map_subscript(self, expr)

    from pymbolic.primitives import Variable
    assert isinstance(expr.aggregate, Variable)

    array_name = expr.aggregate.name

    # Look the array up among arguments first, then among temporaries.
    if array_name in self.kernel.arg_dict:
        shape = self.kernel.arg_dict[array_name].shape
    elif array_name in self.kernel.temporary_variables:
        shape = self.kernel.temporary_variables[array_name].shape
    else:
        shape = None

    if shape is None:
        return

    indices = expr.index
    if not isinstance(indices, tuple):
        indices = (indices,)

    from loopy.symbolic import (get_dependencies, get_access_range,
            UnableToDetermineAccessRange)
    from loopy.isl_helpers import make_slab

    known_vars = set(self.domain.get_var_dict())

    shape_deps = set()
    for axis_len in shape:
        if axis_len is None:
            continue
        shape_deps.update(get_dependencies(axis_len))

    # Nothing can be checked if something refers to a variable that is
    # not part of the domain.
    if not get_dependencies(indices) <= known_vars:
        return
    if not shape_deps <= known_vars:
        return

    if len(indices) != len(shape):
        raise LoopyError("subscript to '%s' in '%s' has the wrong "
                "number of indices (got: %d, expected: %d)" % (
                    array_name, expr, len(indices), len(shape)))

    try:
        access_range = get_access_range(
                self.domain, indices, self.kernel.assumptions)
    except UnableToDetermineAccessRange:
        # Likely: index was non-affine, nothing we can do.
        return

    # Build the set of in-bounds indices by intersecting one slab
    # [0, axis_len) per axis with known length.
    shape_domain = isl.BasicSet.universe(access_range.get_space())
    for idim, axis_len in enumerate(shape):
        if axis_len is None:
            continue

        slab = make_slab(
                shape_domain.get_space(), (dim_type.in_, idim),
                0, axis_len)
        shape_domain = shape_domain.intersect(slab)

    if access_range.is_subset(shape_domain):
        return

    raise LoopyError("'%s' in instruction '%s' "
            "accesses out-of-bounds array element (could not"
            " establish '%s' is a subset of '%s')."
            % (expr, self.insn_id, access_range, shape_domain))
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
        shape, dim_tags):
    """Replace computed-stride dim tags by fixed-stride ones.

    :arg name: array name, used in error messages.
    :arg num_user_axes: number of user-facing axes (not used here).
    :arg num_target_axes: number of implementation-facing axes to process.
    :arg shape: the array shape, *None* or :class:`loopy.auto`.
    :arg dim_tags: list of parsed dim tags.
    :returns: a new list of dim tags, or *None* if normalization is not
        possible because the shape is unknown.
    :raises LoopyError: for more than one vector axis, an invalid tag, or
        a computed stride nested outside an 'auto' stride axis.
    :raises TypeError: if the vector axis length is not an integer.
    """

    # Just to clarify:
    #
    # - user axes are user-facing--what the user actually uses for indexing.
    #
    # - target axes are implementation facing. Normal in-memory arrays have one.
    #   3D images have three.

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                        "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                    .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError(
                        "shape along vector axis %d of array '%s' "
                        "must be an integer, not an expression ('%s')"
                        % (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(
                target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        # Walk the axes from innermost to outermost nesting level,
        # accumulating the stride as we go.
        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError(
                            "unable to determine fixed stride "
                            "for axis %d because it is nested outside of "
                            "an 'auto' stride axis" % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(
                        stride_so_far,
                        target_axis=dim_tag.target_axis,
                        layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                if dim_tag.pad_to is not None and stride_so_far is not None:
                    # Round the stride up to the next multiple of pad_to.
                    from pytools import div_ceil
                    stride_so_far = (
                            div_ceil(stride_so_far, dim_tag.pad_to)
                            * dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                # An 'auto' stride makes every stride nested outside of
                # it unknown.
                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
def check_implemented_domains(kernel, implemented_domains, code=None):
    """Check that each instruction's implemented domain equals the domain
    desired for it by *kernel*.

    :arg implemented_domains: a mapping from instruction id to a non-empty
        sequence of domains; their union is taken as the implemented
        domain of the instruction.
    :arg code: if not *None*, it is printed (highlighted) before a
        mismatch is reported.
    :returns: *True* if no mismatch was found.
    :raises LoopyError: if the implemented and desired domains of an
        instruction differ. The message includes a sample point and a
        gist of the constraints for each direction of the difference.
    """
    from islpy import dim_type

    from islpy import align_two

    last_idomains = None
    last_insn_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]

        assert idomains

        insn_inames = kernel.insn_inames(insn)

        # {{{ if we've checked the same thing before, no need to check it again

        if last_idomains is not None and last_insn_inames is not None:
            if idomains == last_idomains and insn_inames == last_insn_inames:
                continue

        last_idomains = idomains
        last_insn_inames = insn_inames

        # }}}

        # Union of all domains recorded for this instruction.
        insn_impl_domain = idomains[0]
        for idomain in idomains[1:]:
            insn_impl_domain = insn_impl_domain | idomain
        assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
        assumptions, insn_impl_domain = align_two(
                assumption_non_param, insn_impl_domain)
        # Restrict to the assumptions and keep only the instruction's inames.
        insn_impl_domain = (
                (insn_impl_domain & assumptions)
                .project_out_except(insn_inames, [dim_type.set]))

        from loopy.kernel.instruction import BarrierInstruction
        from loopy.kernel.data import LocalIndexTag
        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(iname for iname in insn_inames
                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
            insn_impl_domain = insn_impl_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
        # Same restriction for the desired domain, additionally dropping
        # parameters that the instruction's domain does not mention.
        desired_domain = ((insn_domain & assumptions)
            .project_out_except(insn_inames, [dim_type.set])
            .project_out_except(insn_parameters, [dim_type.param]))

        if isinstance(insn, BarrierInstruction):
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                non_lid_inames, [dim_type.set])

        insn_impl_domain = (insn_impl_domain
                .project_out_except(insn_parameters, [dim_type.param]))
        insn_impl_domain, desired_domain = align_two(
                insn_impl_domain, desired_domain)

        if insn_impl_domain != desired_domain:
            i_minus_d = insn_impl_domain - desired_domain
            d_minus_i = desired_domain - insn_impl_domain

            parameter_inames = set(
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(insn_impl_domain.dim(dim_type.param)))

            # Human-readable diagnostics, one group per non-empty
            # direction of the set difference.
            lines = []
            for bigger, smaller, diff_set, gist_domain in [
                    ("implemented", "desired", i_minus_d,
                        desired_domain.gist(insn_impl_domain)),
                    ("desired", "implemented", d_minus_i,
                        insn_impl_domain.gist(desired_domain))]:

                if diff_set.is_empty():
                    continue

                diff_set = diff_set.coalesce()
                pt = diff_set.sample_point()
                assert not pt.is_void()

                #pt_set = isl.Set.from_point(pt)
                #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
                #lines.append("point desired: %s" % (pt_set <= desired_domain))

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    point_axes.append("%s=%d" % (
                        iname, pt.get_coordinate_val(tp, dim).to_python()))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                print(79*"-")
                print("CODE:")
                print(79*"-")
                from loopy.target.execution import get_highlighted_code
                print(get_highlighted_code(code))
                print(79*"-")

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, insn_impl_domain, desired_domain,
                        "\n".join(lines)))

    # placate the assert at the call site
    return True
def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
        dim_names=None, strides=None, order=None, for_atomic=False,
        target=None, alignment=None,
        **kwargs):
    """
    All of the following (except *name*) are optional.
    Specify either strides or shape.

    :arg name: When passed to :class:`loopy.make_kernel`, this may contain
        multiple names separated by commas, in which case multiple arguments,
        each with identical properties, are created for each name.

    :arg shape: May be any of the things specified under :attr:`shape`,
        or a string which can be parsed into the previous form.

    :arg dim_tags: A comma-separated list of tags as understood by
        :func:`loopy.kernel.array.parse_array_dim_tags`.

    :arg strides: May be one of the following:

        * None

        * :class:`loopy.auto`. The strides will be determined by *order*
          and the access footprint.

        * a tuple like :attr:`numpy.ndarray.shape`.

          Each entry of the tuple is also allowed to be a :mod:`pymbolic`
          expression involving kernel parameters, or a string that can be
          parsed to such an expression.

        * A (potentially comma-separated) string which can be parsed into
          the previous form.

    :arg order: "C" or "F" for C (row major) or Fortran (column major)
        order, respectively. Defaults to the *default_order* argument
        passed to :func:`loopy.make_kernel`.
    :arg for_atomic:
        Whether the array is declared for atomic access, and, if necessary,
        using atomic-capable data types.
    :arg offset: (See :attr:`offset`)
    :arg alignment: memory alignment in bytes

    :raises TypeError: for keyword arguments not listed in
        ``allowed_extra_kwargs``, or if both *strides* and *dim_tags*
        are given.
    :raises LoopyError: for invalid *dim_names*, contradictory numbers of
        dimensions, non-contiguous target axes, or an unsupported number
        of target axes.
    """

    for kwarg_name in kwargs:
        if kwarg_name not in self.allowed_extra_kwargs:
            raise TypeError("invalid kwarg: %s" % kwarg_name)

    import loopy as lp

    from loopy.types import to_loopy_type
    dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
            for_atomic=for_atomic, target=target)

    if dtype is lp.auto:
        from warnings import warn
        warn(
                "Argument/temporary data type for '%s' should be None if "
                "unspecified, not auto. This usage will be disallowed in 2018."
                % name, DeprecationWarning, stacklevel=2)

        dtype = None

    strides_known = strides is not None and strides is not lp.auto
    shape_known = shape is not None and shape is not lp.auto

    if strides_known:
        strides = _parse_shape_or_strides(strides)

    if shape_known:
        shape = _parse_shape_or_strides(shape)

    # {{{ check dim_names

    if dim_names is not None:
        if len(dim_names) != len(set(dim_names)):
            raise LoopyError("dim_names are not unique")

        for n in dim_names:
            if not isinstance(n, str):
                raise LoopyError("found non-string '%s' in dim_names"
                        % type(n).__name__)

    # }}}

    # {{{ convert strides to dim_tags (Note: strides override order)

    if dim_tags is not None and strides_known:
        raise TypeError("may not specify both strides and dim_tags")

    if dim_tags is None and strides_known:
        dim_tags = [FixedStrideArrayDimTag(s) for s in strides]
        strides = None

    # }}}

    if dim_tags is not None:
        dim_tags = parse_array_dim_tags(
                dim_tags,
                n_axes=(len(shape) if shape_known else None),
                use_increasing_target_axes=self.max_target_axes > 1,
                dim_names=dim_names)

    # {{{ determine number of user axes

    num_user_axes = None
    if shape_known:
        num_user_axes = len(shape)
    for dim_iterable in [dim_tags, dim_names]:
        if dim_iterable is not None:
            new_num_user_axes = len(dim_iterable)

            if num_user_axes is None:
                num_user_axes = new_num_user_axes
            else:
                if new_num_user_axes != num_user_axes:
                    raise LoopyError(
                            "contradictory values for number of "
                            "dimensions of array '%s' from shape, strides, "
                            "dim_tags, or dim_names" % name)

            del new_num_user_axes

    # }}}

    # {{{ convert order to dim_tags

    if order is None and self.max_target_axes > 1:
        # FIXME: Hackety hack. ImageArgs need to generate dim_tags even
        # if no order is specified. Plus they don't care that much.
        order = "C"

    if dim_tags is None and num_user_axes is not None and order is not None:
        dim_tags = parse_array_dim_tags(
                num_user_axes * [order],
                n_axes=num_user_axes,
                use_increasing_target_axes=self.max_target_axes > 1,
                dim_names=dim_names)
        order = None

    # }}}

    if dim_tags is not None:
        # {{{ find number of target axes

        target_axes = set()
        for dim_tag in dim_tags:
            if isinstance(dim_tag, _StrideArrayDimTagBase):
                target_axes.add(dim_tag.target_axis)

        # NOTE: the record fields (including self.name) are only set by
        # ImmutableRecord.__init__ at the end of this method, so error
        # messages must use the local 'name'.
        if target_axes != set(range(len(target_axes))):
            raise LoopyError("target axes for variable '%s' are non-"
                    "contiguous" % name)

        num_target_axes = len(target_axes)
        del target_axes

        # }}}

        if not (self.min_target_axes <= num_target_axes
                <= self.max_target_axes):
            raise LoopyError(
                    "%s only supports between %d and %d target axes "
                    "('%s' has %d)" % (
                        type(self).__name__, self.min_target_axes,
                        self.max_target_axes, name, num_target_axes))

        new_dim_tags = convert_computed_to_fixed_dim_tags(
                name, num_user_axes, num_target_axes,
                shape, dim_tags)

        if new_dim_tags is not None:
            # successfully normalized
            dim_tags = new_dim_tags
            del new_dim_tags

    if dim_tags is not None:
        # for hashability
        dim_tags = tuple(dim_tags)
        order = None

    if strides is not None:
        # Preserve strides if we weren't able to process them yet.
        # That only happens if they're set to loopy.auto (and 'guessed'
        # in loopy.kernel.creation).

        kwargs["strides"] = strides

    if dim_names is not None and not isinstance(dim_names, tuple):
        from warnings import warn
        warn("dim_names is not a tuple when calling ArrayBase constructor",
                DeprecationWarning, stacklevel=2)

    ImmutableRecord.__init__(self,
            name=name,
            dtype=dtype,
            shape=shape,
            dim_tags=dim_tags,
            offset=offset,
            dim_names=dim_names,
            order=order,
            alignment=alignment,
            for_atomic=for_atomic,
            **kwargs)
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg test_knl: the kernel to test. If *None*, *ref_knl* is used and
        result checking is disabled.
    :arg op_count: a list of operation counts used to report rates.
    :arg op_label: a list of labels matching *op_count*.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a a tuple (class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :returns: a dictionary with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall``, ``timing_rounds`` and,
        if checking was enabled, ``ref_elapsed_event`` and
        ``ref_elapsed_wall``.
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device is found for the reference computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
            isinstance(arg, ImageArg) for arg in ref_knl.args)

    # Try candidate devices in turn until one can run the reference kernel.
    for dev in _enumerate_cl_devices_for_ref_test(
            blacklist_ref_vendors, need_ref_image_support):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
                ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used as the reference.
        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # Remember the failure and try the next device.
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev,
                    75 * "-",
                    traceback.format_exc(),
                    75 * "-"
                    ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END - ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n" + "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
            ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.LINEARIZED
            ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        # Log the name of the kernel under test, not the loop variable
        # left over from the reference section.
        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    # One successful check is enough; skip it in later
                    # warmup rounds.
                    need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start each timing pass with a fresh event list; otherwise
            # events[0] would belong to an earlier pass and the
            # event-based time below would span all passes.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START
                            - 1e-9 * evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            # Repeat with more rounds until a pass takes at least 0.3 s
            # of wall time.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def alias_temporaries(knl, names, base_name_prefix=None,
        synchronize_for_exclusive_use=True):
    """Sets all temporaries given by *names* to be backed by a single piece of
    storage.

    :arg names: an iterable of names of temporary variables. It is
        materialized once up front, so one-shot iterables (e.g. generators)
        are accepted as well as lists and tuples. Entries that do not name
        an entry of ``knl.temporary_variables`` do not get any storage
        assigned.

    :arg synchronize_for_exclusive_use: A :class:`bool`. If ``True``, this also
        introduces ordering structures ("groups") to ensure that the live
        ranges (i.e. the regions of code where each of the temporaries is
        used) do not overlap. This will allow two (or more) temporaries to
        share the same storage space as long as their live ranges do not need
        to be concurrent.

    :arg base_name_prefix: an identifier to be used for the common storage
        area. Defaults to ``"temp_storage"``.

    :returns: a copy of *knl* with updated instructions and temporary
        variables.

    :raises LoopyError: if *synchronize_for_exclusive_use* is set and a
        single instruction refers to more than one of the temporaries in
        *names*, or if one of the temporaries already has a ``base_storage``.

    .. versionchanged:: 2016.3

        Added *synchronize_for_exclusive_use* flag.
        ``synchronize_for_exclusive_use=True`` was the previous default
        behavior.
    """
    # *names* is walked several times below; materializing it first keeps a
    # generator from being exhausted after the first pass.
    names = tuple(names)

    gng = knl.get_group_name_generator()
    group_names = [gng("tmpgrp_"+name) for name in names]

    if base_name_prefix is None:
        base_name_prefix = "temp_storage"

    vng = knl.get_var_name_generator()
    base_name = vng(base_name_prefix)

    names_set = set(names)

    if synchronize_for_exclusive_use:
        # Position of the first occurrence of each name (same answer as
        # names.index(), without a linear scan per instruction).
        name_to_index = {}
        for idx, name in enumerate(names):
            name_to_index.setdefault(name, idx)

        new_insns = []
        for insn in knl.instructions:
            temp_deps = insn.dependency_names() & names_set

            if not temp_deps:
                # Instruction does not touch any of the aliased temporaries.
                new_insns.append(insn)
                continue

            if len(temp_deps) > 1:
                raise LoopyError("Instruction {insn} refers to multiple of the "
                        "temporaries being aliased, namely '{temps}'. Cannot alias."
                        .format(
                            insn=insn.id,
                            # sorted: set iteration order is not stable
                            temps=", ".join(sorted(temp_deps))))

            temp_name, = temp_deps
            temp_idx = name_to_index[temp_name]
            group_name = group_names[temp_idx]
            other_group_names = frozenset(
                    group_names[:temp_idx] + group_names[temp_idx+1:])

            # The instruction joins the group of "its" temporary and
            # conflicts with the groups of all the other temporaries.
            new_insns.append(
                    insn.copy(
                        groups=insn.groups | frozenset([group_name]),
                        conflicts_with_groups=(
                            insn.conflicts_with_groups | other_group_names)))
    else:
        new_insns = knl.instructions

    new_temporary_variables = {}
    for tv in knl.temporary_variables.values():
        if tv.name in names_set:
            if tv.base_storage is not None:
                raise LoopyError("temporary variable '{tv}' already has "
                        "a defined storage array -- cannot alias"
                        .format(tv=tv.name))

            new_temporary_variables[tv.name] = \
                    tv.copy(base_storage=base_name)
        else:
            new_temporary_variables[tv.name] = tv

    return knl.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables)
def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary):
    """
    Return the sizes and tags of the hardware-parallel axes of the save
    slot for *temporary*.

    This is used for determining the amount of global storage needed for saving
    and restoring the temporary across kernel calls, due to hardware
    parallel inames (the inferred axes get prefixed to the number of
    dimensions in the temporary).

    In the case of local temporaries, inames that are tagged
    hw-local do not contribute to the global storage shape.

    :arg temporary: the temporary variable; its ``name`` and ``scope``
        attributes are read.

    :returns: a tuple ``(sizes, tags)``. *sizes* is the group sizes followed
        by the local sizes as obtained from
        ``kernel.get_grid_sizes_for_insn_ids_as_exprs``, and *tags* is a
        tuple of the group tags followed by the local tags, each sorted by
        axis. ``((), ())`` is returned if no instruction accesses the
        temporary.

    :raises LoopyError: if an accessing instruction is nested inside an iname
        carrying a parallel tag that is neither a group nor a local tag, or
        if the accessing instructions do not all carry the same group and
        local tags.
    """
    # Imported once per call rather than once per tagged iname.
    from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, ParallelTag)

    kernel = self.kernel

    # NOTE(review): plain indexing presumably raises KeyError if the
    # temporary is never read or never written -- confirm against
    # reader_map()/writer_map().
    accessor_insn_ids = frozenset(
        kernel.reader_map()[temporary.name]
        | kernel.writer_map()[temporary.name])

    def _sortedtags(tags):
        return sorted(tags, key=lambda tag: tag.axis)

    # Tags of the first instruction examined; every other instruction has
    # to match them.
    group_tags = None
    local_tags = None
    group_tags_originating_insn_id = None

    for insn_id in accessor_insn_ids:
        insn = kernel.id_to_insn[insn_id]

        my_group_tags = []
        my_local_tags = []

        for iname in insn.within_inames:
            tag = kernel.iname_to_tag.get(iname)

            if tag is None:
                continue

            if isinstance(tag, GroupIndexTag):
                my_group_tags.append(tag)
            elif isinstance(tag, LocalIndexTag):
                my_local_tags.append(tag)
            elif isinstance(tag, ParallelTag):
                raise LoopyError("iname '%s' is tagged with '%s' - only "
                        "group and local tags are supported for "
                        "auto save/reload of temporaries" %
                        (iname, tag))

        # Sort once; used both for the comparison and for error reporting.
        my_group_tags = _sortedtags(my_group_tags)
        my_local_tags = _sortedtags(my_local_tags)

        if group_tags is None:
            group_tags = my_group_tags
            local_tags = my_local_tags
            group_tags_originating_insn_id = insn_id

        elif group_tags != my_group_tags or local_tags != my_local_tags:
            raise LoopyError(
                "inconsistent parallel tags across instructions that access "
                "'%s' (specifically, instruction '%s' has tags '%s' but "
                "instruction '%s' has tags '%s')"
                % (temporary.name,
                   group_tags_originating_insn_id, group_tags + local_tags,
                   insn_id, my_group_tags + my_local_tags))

    if group_tags is None:
        # No accessing instructions at all.
        assert local_tags is None
        return (), ()

    group_sizes, local_sizes = (
        kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)
    )

    if temporary.scope == lp.temp_var_scope.LOCAL:
        # Elide local axes in the save slot for local temporaries.
        del local_tags[:]
        local_sizes = ()

    # We set hw_dims to be arranged according to the order:
    #    g.0 < g.1 < ... < l.0 < l.1 < ...
    return (group_sizes + local_sizes), tuple(group_tags + local_tags)
def generate_code_v2(kernel):
    """
    Generate code for *kernel*.

    If the kernel is still in its initial state, it is preprocessed first;
    if it has no schedule, one is obtained via
    :func:`loopy.schedule.get_one_scheduled_kernel`. When
    ``loopy.CACHING_ENABLED`` is set, the result is looked up in (and stored
    to) ``code_gen_cache``, keyed by the kernel as it is after these steps
    and before type inference.

    :arg kernel: the kernel to generate code for.

    :returns: a :class:`CodeGenerationResult`

    :raises LoopyError: if the kernel is not in the scheduled state after
        preprocessing/scheduling.
    :raises ValueError: if a kernel argument is neither an array argument
        nor a :class:`loopy.kernel.data.ValueArg`.
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    # Cache key: the kernel before type inference rebinds 'kernel' below.
    input_kernel = kernel

    if CACHING_ENABLED:
        try:
            result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    # Same for every argument, so query it once.
    written_variables = kernel.get_written_variables()

    for arg in kernel.args:
        is_written = arg.name in written_variables
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    # Complex support is needed as soon as any argument or temporary has a
    # complex dtype.
    allow_complex = any(
            var.dtype.involves_complex()
            for var in kernel.args
            + list(six.itervalues(kernel.temporary_variables)))

    # }}}

    # These sets are handed to the code generation state and read back
    # below when the preambles are assembled.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    # NOTE: because this call sits inside an 'assert', it is skipped
    # entirely when Python runs with -O.
    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    seen_dtypes.update(arg.dtype for arg in kernel.args)
    seen_dtypes.update(
            tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result