def rename_argument(kernel, old_name, new_name, existing_ok=False):
    """Return a copy of *kernel* in which argument *old_name* is named
    *new_name*.

    The new name is substituted into the kernel's expressions (through a
    :class:`loopy.symbolic.RuleAwareSubstitutionMapper`), into the argument
    list, and into the dimension names of the domains and the assumptions.

    :arg existing_ok: if false, a *new_name* that the kernel's variable name
        generator reports as conflicting is rejected.
    :raises LoopyError: if *old_name* is not in ``kernel.arg_dict``, or if
        *new_name* conflicts with an existing identifier while *existing_ok*
        is false.

    .. versionadded:: 2016.2
    """

    name_generator = kernel.get_var_name_generator()

    if old_name not in kernel.arg_dict:
        raise LoopyError("old arg name '%s' does not exist" % old_name)

    name_is_taken = name_generator.is_name_conflicting(new_name)
    if not existing_ok and name_is_taken:
        raise LoopyError(
                "argument name '%s' conflicts with an existing identifier"
                "--cannot rename" % new_name)

    # {{{ instructions

    from pymbolic import var
    from loopy.symbolic import (
            RuleAwareSubstitutionMapper,
            SubstitutionRuleMappingContext)
    from pymbolic.mapper.substitutor import make_subst_func

    mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, name_generator)
    renamer = RuleAwareSubstitutionMapper(
            mapping_context,
            make_subst_func({old_name: var(new_name)}),
            # apply the substitution everywhere
            within=lambda kernel, insn, stack: True)

    kernel = mapping_context.finish_kernel(renamer.map_kernel(kernel))

    # }}}

    # {{{ args

    renamed_args = [
            arg.copy(name=new_name) if arg.name == old_name else arg
            for arg in kernel.args]

    # }}}

    # {{{ domain/assumptions

    def with_renamed_dim(isl_set):
        # Sets that do not mention old_name are passed through untouched.
        name_to_dim = isl_set.get_var_dict()
        if old_name not in name_to_dim:
            return isl_set

        dim_tp, dim_pos = name_to_dim[old_name]
        return isl_set.set_dim_name(dim_tp, dim_pos, new_name)

    renamed_domains = [with_renamed_dim(dom) for dom in kernel.domains]
    renamed_assumptions = with_renamed_dim(kernel.assumptions)

    # }}}

    return kernel.copy(
            domains=renamed_domains,
            args=renamed_args,
            assumptions=renamed_assumptions)
def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
    """Check that each non-boostable instruction in a chunk of the schedule
    uses exactly the available group and local hardware axes.

    :arg sched_index: if *None*, the whole schedule is walked with empty sets
        of available axes. Otherwise it must be the index of a
        ``CallKernel`` item; the block it opens is walked, with the available
        axes taken from the grid sizes of the instructions in that block.
    :returns: the schedule index just past the examined chunk.
    :raises LoopyError: if an instruction's used axes differ from the
        available ones, or if an automatic local tag is encountered.
    :raises TypeError: for a schedule item of an unrecognized type.
    """
    from loopy.schedule import (CallKernel, RunInstruction,
            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
            get_insn_ids_for_block_at, gather_schedule_block)

    if sched_index is not None:
        assert isinstance(kernel.schedule[sched_index], CallKernel)
        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                get_insn_ids_for_block_at(kernel.schedule, sched_index))

        group_axes = set(ax for ax, _ in enumerate(group_size))
        local_axes = set(ax for ax, _ in enumerate(local_size))

        pos = sched_index + 1
        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
        stop = past_end_i - 1
    else:
        group_axes = set()
        local_axes = set()

        pos = 0
        stop = past_end_i = len(kernel.schedule)

    # alternative: just disregard length-1 dimensions?

    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag

    while pos < stop:
        sched_item = kernel.schedule[pos]

        if isinstance(sched_item, CallKernel):
            # The recursive call returns the index past the nested block.
            pos = _check_for_unused_hw_axes_in_kernel_chunk(kernel, pos)
            continue

        if not isinstance(sched_item, RunInstruction):
            if isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
                pos += 1
                continue

            raise TypeError(
                    "schedule item not understood: %s" % type(sched_item).__name__)

        insn = kernel.id_to_insn[sched_item.insn_id]
        pos += 1

        if insn.boostable:
            continue

        used_group_axes = set()
        used_local_axes = set()

        for iname in kernel.insn_inames(insn):
            tag = kernel.iname_to_tag.get(iname)

            if isinstance(tag, LocalIndexTag):
                used_local_axes.add(tag.axis)
            elif isinstance(tag, GroupIndexTag):
                used_group_axes.add(tag.axis)
            elif isinstance(tag, AutoLocalIndexTagBase):
                raise LoopyError("auto local tag encountered")

        if group_axes != used_group_axes:
            raise LoopyError("instruction '%s' does not use all group hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, group_axes)),
                        ",".join(map(str, used_group_axes))))

        if local_axes != used_local_axes:
            raise LoopyError("instruction '%s' does not use all local hw axes "
                    "(available: %s used:%s)"
                    % (insn.id,
                        ",".join(map(str, local_axes)),
                        ",".join(map(str, used_local_axes))))

    return past_end_i
def opencl_function_mangler(kernel, name, arg_dtypes):
    """Map a function call onto its OpenCL target function, if known here.

    :arg kernel: the kernel; only ``kernel.target.vector_dtype`` is used
        (for vector literal functions).
    :arg name: the function identifier. Anything that is not a :class:`str`
        is not handled.
    :arg arg_dtypes: sequence of argument types, each exposing a
        ``numpy_dtype`` attribute.
    :returns: a ``CallMangleInfo`` describing the target function name and
        its result/argument types, or *None* if the function is not handled
        by this mangler.
    :raises LoopyTypeError: if ``pow`` is called with a common type other
        than ``float32``/``float64``.
    :raises LoopyError: if a function from ``_CL_SIMPLE_MULTI_ARG_FUNCTIONS``
        gets the wrong number of arguments or complex arguments.
    """
    if not isinstance(name, str):
        return None

    def common_numpy_dtype():
        # np.find_common_type was deprecated in NumPy 1.25 and removed in
        # NumPy 2.0; np.result_type performs the dtype promotion instead.
        return np.result_type(*[tp.numpy_dtype for tp in arg_dtypes])

    # OpenCL has min(), max() for integer types
    if name in ["max", "min"] and len(arg_dtypes) == 2:
        dtype = common_numpy_dtype()

        if dtype.kind == "i":
            result_dtype = NumpyType(dtype)
            return CallMangleInfo(
                    target_name=name,
                    result_dtypes=(result_dtype,),
                    arg_dtypes=2*(result_dtype,))

    if name == "pow" and len(arg_dtypes) == 2:
        dtype = common_numpy_dtype()

        if dtype == np.float64:
            name = "powf64"
        elif dtype == np.float32:
            name = "powf32"
        else:
            raise LoopyTypeError(f"'pow' does not support type {dtype}.")

        result_dtype = NumpyType(dtype)
        return CallMangleInfo(
                target_name=name,
                result_dtypes=(result_dtype,),
                arg_dtypes=2*(result_dtype,))

    if name == "dot":
        # The scalar type is read off the "s0" field of the first argument's
        # (structured) dtype; offset and field name are not needed.
        scalar_dtype, _offset, _field_name = \
                arg_dtypes[0].numpy_dtype.fields["s0"]
        return CallMangleInfo(
                target_name=name,
                result_dtypes=(NumpyType(scalar_dtype),),
                arg_dtypes=(arg_dtypes[0],)*2)

    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
        if len(arg_dtypes) != num_args:
            raise LoopyError("%s takes %d arguments (%d received)"
                    % (name, num_args, len(arg_dtypes)))

        dtype = common_numpy_dtype()

        if dtype.kind == "c":
            raise LoopyError("%s does not support complex numbers"
                    % name)

        result_dtype = NumpyType(dtype)
        return CallMangleInfo(
                target_name=name,
                result_dtypes=(result_dtype,),
                arg_dtypes=(result_dtype,)*num_args)

    if name in VECTOR_LITERAL_FUNCS:
        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]

        if count != len(arg_dtypes):
            return None

        # The target "function" is a C cast to the vector type.
        return CallMangleInfo(
                target_name="(%s%d) " % (base_tp_name, count),
                result_dtypes=(kernel.target.vector_dtype(
                    NumpyType(dtype), count),),
                arg_dtypes=(NumpyType(dtype),)*count)

    return None
def generate_arg_setup(gen, kernel, implemented_data_info, options):
    """Emit, through *gen*, the Python source that prepares array arguments
    before a kernel invocation, and return the argument expressions.

    For every entry of *implemented_data_info* whose ``arg_class`` is an
    :class:`ArrayBase` subclass, code is generated that (depending on
    *options*) transfers numpy arrays to the device, checks that required
    arguments were supplied, allocates written arrays that were passed as
    *None*, and checks dtype, shape, strides and offset.

    :arg gen: callable taking one line of source code; used together with
        ``Indentation(gen)`` to emit nested blocks.
    :arg options: object providing the ``no_numpy`` and ``skip_arg_checks``
        flags.
    :returns: a list of strings, one per kernel argument: the plain argument
        name, or ``"<name>.base_data"`` for ``GlobalArg``/``ConstantArg``.
    :raises LoopyError: if a kernel argument follows an entry that is not a
        :class:`KernelArgument`, or if an array that would have to be
        allocated does not have a :class:`NumpyType` dtype.
    """
    # NOTE(review): the nesting of this block was reconstructed from a
    # whitespace-collapsed source; in particular, whether the blank-line
    # gen("") calls sit inside or after the Indentation blocks should be
    # confirmed against the upstream file.
    import loopy as lp

    from loopy.kernel.data import KernelArgument
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import StringifyMapper
    from pymbolic import var

    gen("# {{{ set up array arguments")
    gen("")

    if not options.no_numpy:
        gen("_lpy_encountered_numpy = False")
        gen("_lpy_encountered_dev = False")
        gen("")

    args = []

    strify = StringifyMapper()

    # Set once an entry that is not a KernelArgument has been seen; any
    # KernelArgument after that point is an error.
    expect_no_more_arguments = False

    for arg_idx, arg in enumerate(implemented_data_info):
        is_written = arg.base_name in kernel.get_written_variables()
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if not issubclass(arg.arg_class, KernelArgument):
            expect_no_more_arguments = True
            continue

        if expect_no_more_arguments:
            raise LoopyError("Further arguments encountered after arg info "
                    "describing a global temporary variable")

        # Non-array arguments are passed through by name, with no checks.
        if not issubclass(arg.arg_class, ArrayBase):
            args.append(arg.name)
            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.no_numpy:
            gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
            with Indentation(gen):
                gen("# synchronous, nothing to worry about")
                gen("%s = _lpy_cl_array.to_device("
                        "queue, %s, allocator=allocator)"
                        % (arg.name, arg.name))
                gen("_lpy_encountered_numpy = True")
            gen("elif %s is not None:" % arg.name)
            with Indentation(gen):
                gen("_lpy_encountered_dev = True")

            gen("")

        # Arguments that are only read must always be supplied.
        if not options.skip_arg_checks and not is_written:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"input argument '%s' must "
                        "be supplied\")" % arg.name)
                gen("")

        if (is_written
                and arg.arg_class is lp.ImageArg
                and not options.skip_arg_checks):
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written image '%s' must "
                        "be supplied\")" % arg.name)
                gen("")

        if is_written and arg.shape is None and not options.skip_arg_checks:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written argument '%s' has "
                        "unknown shape and must be supplied\")" % arg.name)
                gen("")

        possibly_made_by_loopy = False

        # {{{ allocate written arrays, if needed

        if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
                and arg.shape is not None \
                and all(si is not None for si in arg.shape):

            if not isinstance(arg.dtype, NumpyType):
                raise LoopyError("do not know how to pass arg of type '%s'"
                        % arg.dtype)

            possibly_made_by_loopy = True
            gen("_lpy_made_by_loopy = False")
            gen("")

            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                num_axes = len(arg.strides)
                for i in range(num_axes):
                    gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))

                # Generated strides are in bytes: element strides times
                # the item size.
                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                for i in range(num_axes):
                    gen("_lpy_strides_%d = %s" % (i, strify(
                        itemsize * arg.unvec_strides[i])))

                if not options.skip_arg_checks:
                    for i in range(num_axes):
                        gen("assert _lpy_strides_%d > 0, "
                                "\"'%s' has negative stride in axis %d\""
                                % (i, arg.name, i))

                sym_strides = tuple(
                        var("_lpy_strides_%d" % i)
                        for i in range(num_axes))
                sym_shape = tuple(
                        var("_lpy_shape_%d" % i)
                        for i in range(num_axes))

                # Allocation size: sum over axes of stride*(length-1), plus
                # one item.
                alloc_size_expr = (
                        sum(astrd * (alen - 1)
                            for alen, astrd in zip(sym_shape, sym_strides))
                        + itemsize)

                gen("_lpy_alloc_size = %s" % strify(alloc_size_expr))
                gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, "
                        "%(dtype)s, strides=%(strides)s, "
                        "data=allocator(_lpy_alloc_size), allocator=allocator)"
                        % dict(name=arg.name, shape=strify(sym_shape),
                            strides=strify(sym_strides),
                            dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))

                if not options.skip_arg_checks:
                    for i in range(num_axes):
                        gen("del _lpy_shape_%d" % i)
                        gen("del _lpy_strides_%d" % i)
                    gen("del _lpy_alloc_size")
                    gen("")

                gen("_lpy_made_by_loopy = True")
                gen("")

        # }}}

        # {{{ argument checking

        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
                and not options.skip_arg_checks:
            # Arrays allocated by the generated code itself are not
            # re-checked.
            if possibly_made_by_loopy:
                gen("if not _lpy_made_by_loopy:")
            else:
                gen("if True:")

            with Indentation(gen):
                gen("if %s.dtype != %s:"
                        % (arg.name, python_dtype_str(
                            kernel_arg.dtype.numpy_dtype)))
                with Indentation(gen):
                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
                            "(got: %%s, expected: %s)\" %% %s.dtype)"
                            % (arg.name, arg.dtype, arg.name))

                # {{{ generate shape checking code

                def strify_allowing_none(shape_axis):
                    if shape_axis is None:
                        return "None"
                    else:
                        return strify(shape_axis)

                def strify_tuple(t):
                    if len(t) == 0:
                        return "()"
                    else:
                        return "(%s,)" % ", ".join(
                                strify_allowing_none(sa)
                                for sa in t)

                shape_mismatch_msg = (
                        "raise TypeError(\"shape mismatch on argument '%s' "
                        "(got: %%s, expected: %%s)\" "
                        "%% (%s.shape, %s))"
                        % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))

                if kernel_arg.shape is None:
                    pass

                elif any(shape_axis is None for shape_axis in kernel_arg.shape):
                    # Partially known shape: check the number of axes, then
                    # each axis whose length is known.
                    gen("if len(%s.shape) != %s:"
                            % (arg.name, len(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                    for i, shape_axis in enumerate(arg.unvec_shape):
                        if shape_axis is None:
                            continue

                        gen("if %s.shape[%d] != %s:"
                                % (arg.name, i, strify(shape_axis)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                else:  # not None, no Nones in tuple
                    gen("if %s.shape != %s:"
                            % (arg.name, strify(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                # }}}

                if arg.unvec_strides and kernel_arg.dim_tags:
                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                    sym_strides = tuple(
                            itemsize * s_i for s_i in arg.unvec_strides)
                    gen("if %s.strides != %s:"
                            % (arg.name, strify(sym_strides)))
                    with Indentation(gen):
                        gen("raise TypeError(\"strides mismatch on "
                                "argument '%s' (got: %%s, expected: %%s)\" "
                                "%% (%s.strides, %s))"
                                % (arg.name, arg.name, strify(sym_strides)))

                if not arg.allows_offset:
                    gen("if %s.offset:" % arg.name)
                    with Indentation(gen):
                        gen("raise ValueError(\"Argument '%s' does not "
                                "allow arrays with offsets. Try passing "
                                "default_offset=loopy.auto to make_kernel()."
                                "\")" % arg.name)
                        gen("")

        # }}}

        if possibly_made_by_loopy and not options.skip_arg_checks:
            gen("del _lpy_made_by_loopy")
            gen("")

        if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
            args.append("%s.base_data" % arg.name)
        else:
            args.append("%s" % arg.name)

        gen("")

        gen("# }}}")
        gen("")

    gen("# }}}")
    gen("")

    return args
def check_loop_priority_inames_known(kernel):
    """Verify that every iname listed in ``kernel.loop_priority`` is one of
    ``kernel.all_inames()``.

    :raises LoopyError: for the first iname in a loop priority entry that is
        not a known iname of *kernel*.
    """
    # The set of inames does not change during the check, so look it up once
    # instead of once per (priority, iname) pair.
    known_inames = kernel.all_inames()

    for prio in kernel.loop_priority:
        for iname in prio:
            if iname not in known_inames:
                raise LoopyError("unknown iname '%s' in loop priorities" % iname)
def __init__(self,
        assignees, expression,
        id=None,
        depends_on=None,
        depends_on_is_final=None,
        groups=None,
        conflicts_with_groups=None,
        no_sync_with=None,
        within_inames_is_final=None,
        within_inames=None,
        boostable=None, boostable_into=None, tags=None,
        temp_var_types=None,
        priority=0, predicates=frozenset(),
        insn_deps=None, insn_deps_is_final=None,
        forced_iname_deps=None,
        forced_iname_deps_is_final=None):
    """Create a call instruction.

    :arg assignees: a tuple of lvalues (``Variable``, ``Subscript`` or
        ``LinearSubscript``), or a string that parses to such a tuple.
    :arg expression: a ``Call`` or ``Reduction`` expression, a string that
        parses to one, or *None*.
    :arg temp_var_types: if *None*, one *None* entry per assignee is stored.

    All remaining arguments are forwarded unchanged to the base class
    constructor.

    :raises LoopyError: if *expression* is not a function call/reduction,
        if *assignees* is not a tuple, or if an assignee is not a valid
        lvalue.
    """

    super(CallInstruction, self).__init__(
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            groups=groups,
            conflicts_with_groups=conflicts_with_groups,
            no_sync_with=no_sync_with,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            boostable=boostable,
            boostable_into=boostable_into,
            priority=priority,
            predicates=predicates,
            tags=tags,
            insn_deps=insn_deps,
            insn_deps_is_final=insn_deps_is_final,
            forced_iname_deps=forced_iname_deps,
            forced_iname_deps_is_final=forced_iname_deps_is_final)

    from loopy.symbolic import parse

    # Parse a string expression *before* type-checking it. Previously the
    # type check ran first, so a string was always rejected and the parsing
    # branch could never be reached.
    if isinstance(expression, str):
        expression = parse(expression)

    from pymbolic.primitives import Call
    from loopy.symbolic import Reduction
    if not isinstance(expression, (Call, Reduction)) and expression is not None:
        raise LoopyError("'expression' argument to CallInstruction "
                "must be a function call")

    if isinstance(assignees, str):
        assignees = parse(assignees)
    if not isinstance(assignees, tuple):
        raise LoopyError("'assignees' argument to CallInstruction "
                "must be a tuple or a string parseable to a tuple"
                "--got '%s'" % type(assignees).__name__)

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript
    for assignee in assignees:
        if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
            raise LoopyError("invalid lvalue '%s'" % assignee)

    self.assignees = assignees
    self.expression = expression

    if temp_var_types is None:
        self.temp_var_types = (None,) * len(self.assignees)
    else:
        self.temp_var_types = temp_var_types
def _merge_values(item_name, val_a, val_b): if val_a != val_b: raise LoopyError("inconsistent %ss in merge: %s and %s" % (item_name, val_a, val_b)) return val_a
def map_subscript(self, expr, type_context):
    """Translate a subscript expression into its target-code form.

    Subscripts whose aggregate is not a plain ``Variable`` are mapped
    recursively. Image arguments become ``read_imagef`` calls; array
    arguments and temporaries become an access to the flattened subscript
    obtained from ``get_access_info``, followed by a vector access if the
    access info carries a vector index.

    :raises LoopyError: for images with an unsupported number of target axes.
    :raises NotImplementedError: for images of an unsupported dtype.
    """
    from pymbolic.primitives import Variable

    if not isinstance(expr.aggregate, Variable):
        # Generic fallback: recurse into aggregate and index separately.
        return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]

    def make_var(name):
        # Carry the tag of a tagged aggregate over to the new variable.
        from loopy import TaggedVariable
        if isinstance(expr.aggregate, TaggedVariable):
            return TaggedVariable(name, expr.aggregate.tag)
        return var(name)

    ary = self.find_array(expr)

    from loopy.kernel.array import get_access_info
    from pymbolic import evaluate
    from loopy.symbolic import simplify_using_aff

    simplified_indices = tuple(
            simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)

    access_info = get_access_info(
            self.kernel.target, ary, simplified_indices,
            lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
            self.codegen_state.vectorization_info)

    from loopy.kernel.data import (
            ImageArg, ArrayArg, TemporaryVariable, ConstantArg)

    if isinstance(ary, ImageArg):
        n_target_axes = ary.num_target_axes()
        if n_target_axes in [1, 2]:
            coord_type = "float2"
            n_padding = 2-n_target_axes
        elif n_target_axes == 3:
            coord_type = "float4"
            n_padding = 4-n_target_axes
        else:
            raise LoopyError("unsupported number (%d) of target axes in image"
                    % n_target_axes)

        # Indices are passed in reverse order, zero-padded up to the width
        # of the coordinate vector type.
        coords = expr.index_tuple[::-1] + (0,) * n_padding

        image_read = var("read_imagef")(
                var(ary.name),
                var("loopy_sampler"),
                var("(%s)" % coord_type)(*self.rec(coords, 'i')))

        if ary.dtype.numpy_dtype == np.float32:
            return image_read.attr("x")

        if self.kernel.target.is_vector_dtype(ary.dtype):
            return image_read

        if ary.dtype.numpy_dtype == np.float64:
            return var("as_double")(image_read.attr("xy"))

        raise NotImplementedError(
                "non-floating-point images not supported for now")

    elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)):
        if len(access_info.subscripts) != 0:
            flat_subscript, = access_info.subscripts
            result = make_var(access_info.array_name)[
                    simplify_using_aff(
                        self.kernel, self.rec(flat_subscript, 'i'))]
        else:
            is_pointer = (
                    isinstance(ary, (ConstantArg, ArrayArg))
                    or (isinstance(ary, TemporaryVariable)
                        and ary.base_storage))
            if is_pointer:
                # unsubscripted global args are pointers
                result = make_var(access_info.array_name)[0]
            else:
                # unsubscripted temp vars are scalars
                # (unless they use base_storage)
                result = make_var(access_info.array_name)

        if access_info.vector_index is None:
            return result

        return self.codegen_state.ast_builder.add_vector_access(
                result, access_info.vector_index)

    else:
        assert False
def map_call(self, expr, type_context):
    """Translate a function call expression into its target-code form.

    ``indexof``/``indexof_vec`` are implemented here directly: they return
    the flattened index of their (subscript) argument. Every other function
    is resolved through ``self.kernel.mangle_function``; its parameters are
    processed recursively, the function is recorded in
    ``codegen_state.seen_functions``, and a call to the mangled target name
    is returned.

    :raises LoopyError: for misuse of ``indexof``/``indexof_vec`` or for
        functions not returning exactly one value.
    :raises RuntimeError: if no mangler knows the function.
    """
    from pymbolic.primitives import Variable, Subscript

    identifier = expr.function

    # {{{ implement indexof, indexof_vec

    if identifier.name in ["indexof", "indexof_vec"]:
        func_name = identifier.name

        if len(expr.parameters) != 1:
            raise LoopyError("%s takes exactly one argument" % func_name)

        arg, = expr.parameters
        if not isinstance(arg, Subscript):
            raise LoopyError(
                    "argument to %s must be a subscript" % func_name)

        ary = self.find_array(arg)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate
        access_info = get_access_info(
                self.kernel.target, ary, arg.index,
                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                self.codegen_state.vectorization_info)

        from loopy.kernel.data import ImageArg
        if isinstance(ary, ImageArg):
            raise LoopyError("%s does not support images" % func_name)

        if func_name == "indexof":
            return access_info.subscripts[0]

        if func_name != "indexof_vec":
            raise RuntimeError("should not get here")

        from loopy.kernel.array import VectorArrayDimTag

        # If several axes carry a vector tag, the last one is used.
        vec_axis = None
        for axis_nr, dim_tag in enumerate(ary.dim_tags):
            if isinstance(dim_tag, VectorArrayDimTag):
                vec_axis = axis_nr

        if vec_axis is None:
            return access_info.subscripts[0]

        return (
                access_info.subscripts[0]*ary.shape[vec_axis]
                + access_info.vector_index)

    # }}}

    if isinstance(identifier, Variable):
        identifier = identifier.name

    par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)

    mangle_result = self.kernel.mangle_function(
            identifier, par_dtypes,
            ast_builder=self.codegen_state.ast_builder)

    if mangle_result is None:
        raise RuntimeError("function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % identifier)

    if len(mangle_result.result_dtypes) != 1:
        raise LoopyError("functions with more or fewer than one return value "
                "may not be used in an expression")

    target = self.kernel.target

    if mangle_result.arg_dtypes is None:
        # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
        # propagate the type context here. But for many others, it does
        # not. Using the inferred type as a stopgap for now.
        processed_parameters = tuple(
                self.rec(par,
                    type_context=dtype_to_type_context(target, par_dtype))
                for par, par_dtype in zip(expr.parameters, par_dtypes))

        from warnings import warn
        warn("Calling function '%s' with unknown C signature--"
                "return CallMangleInfo.arg_dtypes"
                % identifier, LoopyWarning)
    else:
        # The mangler specified the argument types: process each parameter
        # towards its target type.
        processed_parameters = tuple(
                self.rec(par,
                    dtype_to_type_context(target, tgt_dtype),
                    tgt_dtype)
                for par, tgt_dtype in zip(
                    expr.parameters, mangle_result.arg_dtypes))

    from loopy.codegen import SeenFunction
    self.codegen_state.seen_functions.add(
            SeenFunction(identifier,
                mangle_result.target_name,
                mangle_result.arg_dtypes or par_dtypes))

    return var(mangle_result.target_name)(*processed_parameters)
def assign_axis(recursion_axis, iname, axis=None):
    """Assign *iname* to local axis *axis* and restart the assignment by
    calling the surrounding function ``assign_automatic_axes``.

    If *axis* is None, a suitable axis is searched for automatically. If the
    iname is longer than the chosen axis, it is split and the inner part
    receives the tag. If the length of the iname cannot be determined (or no
    axis is found), the iname's tag is set to *None*.

    :raises LoopyError: if *iname* does not currently carry an automatic
        local tag.
    """
    try:
        with isl.SuppressedWarnings(kernel.isl_context):
            desired_length = kernel.get_constant_iname_length(iname)
    except isl.Error:
        # Likely unbounded, automatic assignment is not
        # going to happen for this iname.
        untagged = kernel.iname_to_tag.copy()
        untagged[iname] = None
        return assign_automatic_axes(
                kernel.copy(iname_to_tag=untagged),
                axis=recursion_axis)

    if axis is None:
        # {{{ find a suitable axis

        # Unassigned axes that are too short for the iname.
        too_short_axes = []

        for candidate in range(len(local_size)):
            if candidate in assigned_local_axes:
                continue

            if local_size[candidate] < desired_length:
                too_short_axes.append(candidate)
                continue

            # First unassigned axis with enough 'room' for the iname.
            axis = candidate
            break

        if axis is None and too_short_axes:
            # NOTE(review): the original comment here read "sort as longest
            # first", but the code selects the axis with the *smallest*
            # local size (first one on ties) -- confirm which is intended.
            axis = min(too_short_axes, key=lambda ax: local_size[ax])

        # }}}

    if axis is None:
        new_tag = None
    else:
        new_tag = LocalIndexTag(axis)

        if desired_length > local_size[axis]:
            from loopy import split_iname

            # Don't be tempted to switch the outer tag to unroll--this may
            # generate tons of code on some examples.

            split_kernel = split_iname(
                    kernel, iname, inner_length=local_size[axis],
                    outer_tag=None, inner_tag=new_tag,
                    do_tagged_check=False)
            return assign_automatic_axes(
                    split_kernel,
                    axis=recursion_axis, local_size=local_size)

    current_tag = kernel.iname_to_tag.get(iname)
    if not isinstance(current_tag, AutoLocalIndexTagBase):
        raise LoopyError("trying to reassign '%s'" % iname)

    retagged = kernel.iname_to_tag.copy()
    retagged[iname] = new_tag
    return assign_automatic_axes(
            kernel.copy(iname_to_tag=retagged),
            axis=recursion_axis, local_size=local_size)
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    The kernel is type-inferred, preprocessed and scheduled; the schedule is
    then walked, and each barrier or kernel launch is counted once per point
    of the domain of the loops enclosing it.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
        counted.

    :return: A dictionary mapping each type of synchronization event to a
        :class:`islpy.PwQPolynomial` holding the number of such events per
        thread. Keys are ``barrier_<kind>`` (built from the ``kind`` of each
        barrier in the schedule, e.g. ``barrier_local``) and
        ``kernel_launch``.

    :raises LoopyError: if the schedule contains an unexpected item type.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_polys = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_polys['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # Inames of the loops enclosing the current schedule position.
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        # Outside of any loop, an event happens exactly once.
        if not iname_list:
            return one

        # Otherwise: once per point of the domain spanned by the enclosing
        # loops.
        domain = knl.get_inames_domain(iname_list).project_out_except(
                iname_list, [dim_type.set])
        return count(knl, domain)

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)

        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind:
                        get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    """Rank the inames of *insn* by the strides with which they access
    global arrays.

    For each automatically-tagged iname, the smallest stride it induces in
    each global (non-image) array access is determined, evaluated using the
    ``approximately`` values of the kernel's value arguments, and summed over
    all accesses.

    :returns: the inames of *insn* sorted by ``(aggregate stride, name)``,
        where inames without a stride sort as if their stride were the
        largest 32-bit integer; or *None* if no strides could be found.
    :raises LoopyError: if a value argument has no approximate value.
    """
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if not isinstance(arg, ValueArg):
            continue

        if arg.approximately is None:
            raise LoopyError(
                    "No approximate arg value specified for '%s'"
                    % arg.name)

        approximate_arg_values[arg.name] = arg.approximately

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    accesses = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript
    accesses.extend(
            assignee for assignee in insn.assignees
            if isinstance(assignee, Subscript))

    # }}}

    # {{{ filter array accesses to only the global ones

    def is_global_array_access(access):
        accessed_arg = kernel.arg_dict.get(access.aggregate.name)
        return (accessed_arg is not None
                and not isinstance(accessed_arg, ImageArg))

    global_accesses = [
            access for access in accesses if is_global_array_access(access)]

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
            iname
            for iname in kernel.insn_inames(insn)
            if isinstance(kernel.iname_to_tag.get(iname),
                AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for access in global_accesses:
        indices = access.index
        if not isinstance(indices, tuple):
            indices = (indices,)

        arg = kernel.arg_dict.get(access.aggregate.name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                    "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(indices)
        else:
            from loopy.kernel.array import FixedStrideArrayDimTag
            # Only fixed-stride axes contribute a stride.
            ary_strides = [
                    dim_tag.stride for dim_tag in arg.dim_tags
                    if isinstance(dim_tag, FixedStrideArrayDimTag)]

        # {{{ construct iname_to_stride_expr

        # Smallest stride seen for each automatic-axis iname in this access.
        iname_to_stride_expr = {}
        for index_i, axis_stride in zip(indices, ary_strides):
            if axis_stride is None:
                continue

            coeffs = CoefficientCollector()(index_i)
            for term, coeff in six.iteritems(coeffs):
                # excludes '1', i.e. the constant
                if not (isinstance(term, Variable)
                        and term.name in auto_axis_inames):
                    continue

                candidate = coeff*axis_stride
                current = iname_to_stride_expr.get(term.name, None)
                if current is None or candidate < current:
                    iname_to_stride_expr[term.name] = candidate

        # }}}

        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride_value = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = (
                    aggregate_strides.get(iname, 0) + stride_value)

    if not aggregate_strides:
        return None

    very_large_stride = int(np.iinfo(np.int32).max)

    def ranking_key(iname):
        return (aggregate_strides.get(iname, very_large_stride), iname)

    return sorted(kernel.insn_inames(insn), key=ranking_key)

    # }}}
def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
    """Return a string in the `dot <http://graphviz.org/>`_ language depicting
    dependencies among kernel instructions.

    :arg iname_cluster: if true, instructions are grouped into one subgraph
        cluster per loop of the schedule. A kernel without a schedule is
        scheduled first; if that fails with a :class:`RuntimeError`, a
        warning is issued and clustering is skipped.
    :arg use_insn_id: if true, nodes are labeled with the instruction id and
        the operation is the tooltip; otherwise the roles are swapped.
    """

    # make sure all automatically added stuff shows up
    from loopy.kernel.creation import apply_single_writer_depencency_heuristic
    kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False)

    if iname_cluster and not kernel.schedule:
        try:
            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)
        except RuntimeError as e:
            iname_cluster = False
            from warnings import warn
            warn("error encountered during scheduling for dep graph -- "
                    "cannot perform iname clustering: %s(%s)"
                    % (type(e).__name__, e))

    # Maps an instruction id to the set of ids it depends on.
    dep_graph = {}
    lines = []

    from loopy.kernel.data import MultiAssignmentBase, CInstruction

    for insn in kernel.instructions:
        if isinstance(insn, MultiAssignmentBase):
            op = "%s <- %s" % (insn.assignees, insn.expression)
            if len(op) > 200:
                op = op[:200] + "..."
        elif isinstance(insn, CInstruction):
            op = "<C instruction %s>" % insn.id
        else:
            op = "<instruction %s>" % insn.id

        insn_label, tooltip = (insn.id, op) if use_insn_id else (op, insn.id)

        # repr()[1:-1] yields the escaped text without its quotes.
        lines.append(
                "\"%s\" [label=\"%s\",shape=\"box\",tooltip=\"%s\"];"
                % (insn.id, repr(insn_label)[1:-1], repr(tooltip)[1:-1]))

        if insn.depends_on:
            dep_graph.setdefault(insn.id, set()).update(insn.depends_on)

    # {{{ O(n^3) transitive reduction

    # first, compute transitive closure by fixed point iteration
    closure_grew = True
    while closure_grew:
        closure_grew = False

        for dependent in dep_graph:
            own_deps = dep_graph[dependent]
            for direct_dep in own_deps.copy():
                for indirect_dep in dep_graph.get(direct_dep, set()).copy():
                    if indirect_dep not in own_deps:
                        closure_grew = True
                        own_deps.add(indirect_dep)

    # then, drop every dependency that is also reachable through another one
    for dependent in dep_graph:
        own_deps = dep_graph[dependent]
        for direct_dep in own_deps.copy():
            for indirect_dep in dep_graph.get(direct_dep, set()).copy():
                if indirect_dep in own_deps:
                    own_deps.remove(indirect_dep)

    # }}}

    # Edges point from the dependency to the dependent instruction.
    for dependent, own_deps in dep_graph.items():
        for dependency in own_deps:
            lines.append("%s -> %s" % (dependency, dependent))

    if iname_cluster:
        from loopy.schedule import (
                EnterLoop, LeaveLoop, RunInstruction, Barrier,
                CallKernel, ReturnFromKernel)

        for sched_item in kernel.schedule:
            if isinstance(sched_item, EnterLoop):
                lines.append("subgraph cluster_%s { label=\"%s\""
                        % (sched_item.iname, sched_item.iname))
            elif isinstance(sched_item, LeaveLoop):
                lines.append("}")
            elif isinstance(sched_item, RunInstruction):
                lines.append(sched_item.insn_id)
            elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)):
                pass
            else:
                raise LoopyError("schedule item not unterstood: %r" % sched_item)

    graph_body = "\n".join(lines)
    return "digraph %s {\n%s\n}" % (kernel.name, graph_body)
def guess_var_shape(kernel, var_name):
    """Determine the shape of variable *var_name* from the way it is accessed
    in the instructions of *kernel*.

    :returns: a tuple of shape expressions (one plus the static maximum of
        each axis of the access range); ``()`` if the variable is never
        subscripted; or ``(None,)`` if only one-dimensional subscripts with
        undeterminable range were found.
    :raises LoopyError: if gathering the access range fails with a
        :class:`TypeError`, or if the range cannot be determined because of
        linear or multi-dimensional undetermined subscripts.
    :raises RuntimeError: if subscripts with differing numbers of axes occur.
    """
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    armap = AccessRangeMapper(kernel, var_name)
    submap = SubstitutionRuleExpander(kernel.substitutions)

    def run_through_armap(expr):
        # 'insn' is the loop variable of the loop below. The mapper is only
        # called for its side effect of recording accesses; the expression
        # itself is returned unchanged.
        armap(submap(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(run_through_armap)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
                "Failed to (automatically, as requested) find "
                "shape/strides for variable '%s'. "
                "Specifying the shape manually should get rid of this. "
                "The following error occurred: %s"
                % (var_name, str(e)))

    if armap.access_range is not None:
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        axis_lengths = []
        for axis_nr in range(armap.access_range.dim(dim_type.set)):
            try:
                largest_index = kernel.cache_manager.dim_max(
                        armap.access_range, axis_nr)
                axis_lengths.append(
                        pw_aff_to_expr(
                            static_max_of_pw_aff(
                                largest_index + 1, constants_only=False)))
            except:
                # Add advice for the user, then let the exception propagate.
                print("While trying to find shape axis %d of "
                        "variable '%s', the following "
                        "exception occurred:" % (axis_nr, var_name),
                        file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                        "shape of argument '%s'." % (var_name),
                        file=sys.stderr)
                raise

        return tuple(axis_lengths)

    if not armap.bad_subscripts:
        # no subscripts found, let's call it a scalar
        return ()

    from loopy.symbolic import LinearSubscript
    if any(isinstance(sub, LinearSubscript)
            for sub in armap.bad_subscripts):
        raise LoopyError(
                "cannot determine access range for '%s': "
                "linear subscript(s) in '%s'"
                % (var_name, ", ".join(
                    str(sub) for sub in armap.bad_subscripts)))

    axis_counts = set(
            len(sub.index_tuple) for sub in armap.bad_subscripts)

    if len(axis_counts) != 1:
        raise RuntimeError("subscripts of '%s' with differing "
                "numbers of axes were found" % var_name)

    n_axes, = axis_counts

    if n_axes != 1:
        raise LoopyError(
                "cannot determine access range for '%s': "
                "undetermined index in subscript(s) '%s'"
                % (var_name, ", ".join(
                    str(sub) for sub in armap.bad_subscripts)))

    # Leave shape undetermined--we can live with that for 1D.
    return (None,)
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments.

    :arg kernel: the kernel whose temporaries (with ``dtype`` of
        ``loopy.auto``) and arguments (with ``dtype`` of *None*) should
        have their types inferred.
    :arg expect_completion: if *True*, raise :class:`LoopyError` when a
        type cannot be determined instead of silently leaving it unknown.
    :returns: a copy of *kernel* (with substitution rules left unexpanded)
        carrying the updated temporaries and arguments.
    """

    logger.debug("%s: infer types", kernel.name)

    from functools import partial
    debug = partial(_debug, kernel)

    import time
    start_time = time.time()

    # Inference runs on a kernel with substitution rules expanded, but the
    # result is attached to the kernel as it was passed in.
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ find names_with_unknown_types

    # contains both arguments and temporaries
    names_for_type_inference = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        if tv.dtype is lp.auto:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

    # }}}

    logger.debug("finding types for %d names", len(names_for_type_inference))

    writer_map = kernel.writer_map()

    # Maps each name of unknown type to the names of unknown type read by
    # the instructions writing it.
    dep_graph = dict(
            (written_var, set(
                read_var
                for insn_id in writer_map.get(written_var, [])
                for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
                if read_var in names_for_type_inference))
            for written_var in names_for_type_inference)

    from loopy.tools import compute_sccs

    # To speed up processing, we sort the variables by computing the SCCs of the
    # type dependency graph. Each SCC represents a set of variables whose types
    # mutually depend on themselves. The SCCs are returned and processed in
    # topological order.
    sccs = compute_sccs(dep_graph)

    item_lookup = _DictUnionView([new_temp_vars, new_arg_dict])
    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    for var_chain in sccs:
        changed_during_last_queue_run = False
        queue = var_chain[:]
        failed_names = set()

        while queue or changed_during_last_queue_run:
            if not queue and changed_during_last_queue_run:
                changed_during_last_queue_run = False
                # Optimization: If there's a single variable in the SCC without
                # a self-referential dependency, then the type is known after a
                # single iteration (we don't need to look at the expressions
                # again).
                if len(var_chain) == 1:
                    single_var, = var_chain
                    if single_var not in dep_graph[single_var]:
                        break
                queue = var_chain[:]

            name = queue.pop(0)
            item = item_lookup[name]

            debug("inferring type for %s %s", type(item).__name__, item.name)

            result, symbols_with_unavailable_types = (_infer_var_type(
                    kernel, item.name, type_inf_mapper, subst_expander))

            failed = not result
            if not failed:
                new_dtype, = result
                debug(" success: %s", new_dtype)
                if new_dtype != item.dtype:
                    debug(" changed from: %s", item.dtype)
                    changed_during_last_queue_run = True

                    if isinstance(item, TemporaryVariable):
                        new_temp_vars[name] = item.copy(dtype=new_dtype)
                    elif isinstance(item, KernelArgument):
                        new_arg_dict[name] = item.copy(dtype=new_dtype)
                    else:
                        raise LoopyError(
                                "unexpected item type in type inference")
            else:
                debug(" failure")

            if failed:
                if item.name in failed_names:
                    # this item has failed before, give up.
                    advice = ""
                    if symbols_with_unavailable_types:
                        advice += (
                                " (need type of '%s'--check for missing arguments)"
                                % ", ".join(symbols_with_unavailable_types))

                    if expect_completion:
                        raise LoopyError("could not determine type of '%s'%s"
                                % (item.name, advice))
                    else:
                        # We're done here.
                        break

                # remember that this item failed
                failed_names.add(item.name)

                # NOTE(review): 'name' was just popped from 'queue', so this
                # comparison looks like it cannot hold unless a name occurs
                # more than once in an SCC (assuming item.name == name)--
                # confirm whether this exit is reachable.
                if set(queue) == failed_names:
                    # We did what we could...
                    # (Reported via the debug channel rather than printed to
                    # stdout.)
                    debug(" giving up: queue=%s failed=%s current=%s",
                            queue, failed_names, item.name)
                    assert not expect_completion
                    break

                # can't infer type yet, put back into queue
                queue.append(name)
            else:
                # we've made progress, reset failure markers
                failed_names = set()

    # }}}

    end_time = time.time()
    logger.debug("type inference took %.2f seconds", end_time - start_time)

    return unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )
def map_group_hw_index(self, expr, type_context):
    """Reject group hardware axis indices, which plain C cannot express.

    :raises LoopyError: always.
    """
    message = "plain C does not have group hw axes"
    raise LoopyError(message)
def __init__(self, id, depends_on, depends_on_is_final,
        groups, conflicts_with_groups,
        no_sync_with,
        within_inames_is_final, within_inames,
        priority,
        boostable, boostable_into, predicates, tags,
        insn_deps=None, insn_deps_is_final=None,
        forced_iname_deps=None, forced_iname_deps_is_final=None):
    """Normalize the arguments and store them on the record.

    *None* is accepted for most set-valued arguments and is replaced by an
    empty :class:`frozenset` (or ``False``/``0`` for flags and *priority*).
    *depends_on* may also be a comma-separated string of instruction IDs,
    and string-valued *predicates* are parsed into expressions.

    *insn_deps*/*insn_deps_is_final* and
    *forced_iname_deps*/*forced_iname_deps_is_final* are deprecated
    spellings of *depends_on*/*depends_on_is_final* and
    *within_inames*/*within_inames_is_final*.

    :raises LoopyError: if both the deprecated and the current spelling of
        an argument are given, or if *depends_on_is_final* is set without
        *depends_on* being specified.
    """

    # {{{ backwards compatibility goop

    if depends_on is not None and insn_deps is not None:
        raise LoopyError("may not specify both insn_deps and depends_on")
    elif insn_deps is not None:
        warn("insn_deps is deprecated, use depends_on",
                DeprecationWarning, stacklevel=2)

        depends_on = insn_deps
        depends_on_is_final = insn_deps_is_final

    if forced_iname_deps is not None and within_inames is not None:
        raise LoopyError("may not specify both forced_iname_deps "
                "and within_inames")
    elif forced_iname_deps is not None:
        warn("forced_iname_deps is deprecated, use within_inames",
                DeprecationWarning, stacklevel=2)

        within_inames = forced_iname_deps
        within_inames_is_final = forced_iname_deps_is_final

    if predicates is None:
        predicates = frozenset()

    new_predicates = set()
    for pred in predicates:
        if isinstance(pred, str):
            from pymbolic.primitives import LogicalNot
            from loopy.symbolic import parse
            if pred.startswith("!"):
                # Same category and stack level as the other deprecation
                # warnings above, so it points at the caller.
                warn("predicates starting with '!' are deprecated. "
                        "Simply use 'not' instead",
                        DeprecationWarning, stacklevel=2)
                pred = LogicalNot(parse(pred[1:]))
            else:
                pred = parse(pred)

        new_predicates.add(pred)

    predicates = frozenset(new_predicates)
    del new_predicates

    # }}}

    if depends_on is None:
        depends_on = frozenset()

    if groups is None:
        groups = frozenset()

    if conflicts_with_groups is None:
        conflicts_with_groups = frozenset()

    if no_sync_with is None:
        no_sync_with = frozenset()

    if within_inames is None:
        within_inames = frozenset()

    if within_inames_is_final is None:
        within_inames_is_final = False

    if isinstance(depends_on, str):
        depends_on = frozenset(
                s.strip() for s in depends_on.split(",") if s.strip())

    if depends_on_is_final is None:
        depends_on_is_final = False

    if depends_on_is_final and not isinstance(depends_on, frozenset):
        raise LoopyError("Setting depends_on_is_final to True requires "
                "actually specifying depends_on")

    if tags is None:
        tags = frozenset()

    if priority is None:
        priority = 0

    if not isinstance(tags, frozenset):
        # was previously allowed to be tuple
        tags = frozenset(tags)

    # Periodically reenable these and run the tests to ensure all
    # performance-relevant identifiers are interned.
    #
    # from loopy.tools import is_interned
    # assert is_interned(id)
    # assert all(is_interned(dep) for dep in depends_on)
    # assert all(is_interned(grp) for grp in groups)
    # assert all(is_interned(grp) for grp in conflicts_with_groups)
    # assert all(is_interned(iname) for iname in within_inames)
    # assert all(is_interned(pred) for pred in predicates)

    assert isinstance(within_inames, frozenset)
    assert isinstance(depends_on, frozenset) or depends_on is None
    assert isinstance(groups, frozenset)
    assert isinstance(conflicts_with_groups, frozenset)

    ImmutableRecord.__init__(self,
            id=id,
            depends_on=depends_on,
            depends_on_is_final=depends_on_is_final,
            no_sync_with=no_sync_with,
            groups=groups, conflicts_with_groups=conflicts_with_groups,
            within_inames_is_final=within_inames_is_final,
            within_inames=within_inames,
            priority=priority,
            boostable=boostable,
            boostable_into=boostable_into,
            predicates=predicates,
            tags=tags)
def map_local_hw_index(self, expr, type_context):
    """Reject local hardware axis indices, which plain C cannot express.

    :raises LoopyError: always.
    """
    message = "plain C does not have local hw axes"
    raise LoopyError(message)
def generate_arg_setup(
        self, gen, kernel, implemented_data_info, options):
    """Emit, via *gen*, the Python source that checks (and, for written
    arrays of known shape, allocates) the array arguments of *kernel*.

    :arg gen: a callable taking one line of generated source; also used
        with :class:`Indentation` to indent nested lines.
    :arg implemented_data_info: iterable of argument descriptions; entries
        whose ``arg_class`` is not a :class:`KernelArgument` must come last.
    :arg options: supplies ``no_numpy`` and ``skip_arg_checks``.
    :returns: a list of source strings, one per kernel argument, to be used
        when passing the arguments on.
    :raises LoopyError: if a kernel argument follows a non-argument entry,
        or if an array to be allocated does not have a numpy-based dtype.
    """
    import loopy as lp

    from loopy.kernel.data import KernelArgument
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import StringifyMapper
    from loopy.types import NumpyType

    gen("# {{{ set up array arguments")
    gen("")

    if not options.no_numpy:
        gen("_lpy_encountered_numpy = False")
        gen("_lpy_encountered_dev = False")
        gen("")

    args = []

    strify = StringifyMapper()

    expect_no_more_arguments = False

    for arg in implemented_data_info:
        is_written = arg.base_name in kernel.get_written_variables()
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if not issubclass(arg.arg_class, KernelArgument):
            # From here on, only non-argument entries are acceptable.
            expect_no_more_arguments = True
            continue

        if expect_no_more_arguments:
            raise LoopyError("Further arguments encountered after arg info "
                    "describing a global temporary variable")

        if not issubclass(arg.arg_class, ArrayBase):
            # Non-array arguments are passed through by name, unchecked here.
            args.append(arg.name)
            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.no_numpy:
            self.handle_non_numpy_arg(gen, arg)

        # {{{ generated checks that a required argument was supplied

        if not options.skip_arg_checks and not is_written:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"input argument '%s' must "
                        'be supplied")' % arg.name)
                gen("")

        if (is_written
                and arg.arg_class is lp.ImageArg
                and not options.skip_arg_checks):
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written image '%s' must "
                        'be supplied")' % arg.name)
                gen("")

        if is_written and arg.shape is None and not options.skip_arg_checks:
            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                gen("raise RuntimeError(\"written argument '%s' has "
                        'unknown shape and must be supplied")' % arg.name)
                gen("")

        # }}}

        possibly_made_by_loopy = False

        # {{{ allocate written arrays, if needed

        if is_written and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
                and arg.shape is not None \
                and all(si is not None for si in arg.shape):

            if not isinstance(arg.dtype, NumpyType):
                raise LoopyError("do not know how to pass arg of type '%s'"
                        % arg.dtype)

            possibly_made_by_loopy = True
            gen("_lpy_made_by_loopy = False")
            gen("")

            gen("if %s is None:" % arg.name)
            with Indentation(gen):
                self.handle_alloc(
                    gen, arg, kernel_arg, strify, options.skip_arg_checks)
                gen("_lpy_made_by_loopy = True")
                gen("")

        # }}}

        # {{{ argument checking

        if arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \
                and not options.skip_arg_checks:
            # Arrays allocated by the generated code itself are not checked.
            if possibly_made_by_loopy:
                gen("if not _lpy_made_by_loopy:")
            else:
                gen("if True:")

            with Indentation(gen):
                gen("if %s.dtype != %s:"
                        % (arg.name, self.python_dtype_str(
                            gen, kernel_arg.dtype.numpy_dtype)))
                with Indentation(gen):
                    gen("raise TypeError(\"dtype mismatch on argument '%s' "
                            '(got: %%s, expected: %s)" %% %s.dtype)'
                            % (arg.name, arg.dtype, arg.name))

                # {{{ generate shape checking code

                def strify_allowing_none(shape_axis):
                    if shape_axis is None:
                        return "None"
                    else:
                        return strify(shape_axis)

                def strify_tuple(t):
                    if len(t) == 0:
                        return "()"
                    else:
                        return "(%s,)" % ", ".join(
                                strify_allowing_none(sa)
                                for sa in t)

                shape_mismatch_msg = (
                        "raise TypeError(\"shape mismatch on argument '%s' "
                        '(got: %%s, expected: %%s)" '
                        "%% (%s.shape, %s))"
                        % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))

                if kernel_arg.shape is None:
                    pass

                elif any(shape_axis is None for shape_axis in kernel_arg.shape):
                    # Partially known shape: check the number of axes, then
                    # each axis whose length is known.
                    gen("if len(%s.shape) != %s:"
                            % (arg.name, len(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                    for i, shape_axis in enumerate(arg.unvec_shape):
                        if shape_axis is None:
                            continue

                        gen("if %s.shape[%d] != %s:"
                                % (arg.name, i, strify(shape_axis)))
                        with Indentation(gen):
                            gen(shape_mismatch_msg)

                else:  # not None, no Nones in tuple
                    gen("if %s.shape != %s:"
                            % (arg.name, strify(arg.unvec_shape)))
                    with Indentation(gen):
                        gen(shape_mismatch_msg)

                # }}}

                if arg.unvec_strides and kernel_arg.dim_tags:
                    # Expected strides are scaled by the item size of the
                    # dtype before being compared.
                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                    sym_strides = tuple(
                            itemsize*s_i for s_i in arg.unvec_strides)

                    ndim = len(arg.unvec_shape)
                    shape = ["_lpy_shape_%d" % i for i in range(ndim)]
                    strides = ["_lpy_stride_%d" % i for i in range(ndim)]

                    gen("({},) = {}.shape".format(", ".join(shape), arg.name))
                    gen("({},) = {}.strides".format(
                        ", ".join(strides), arg.name))

                    gen("if not (%s):"
                            % self.get_strides_check_expr(
                                shape, strides,
                                (strify(s) for s in sym_strides)))
                    with Indentation(gen):
                        gen("_lpy_got = tuple(stride "
                                "for (dim, stride) in zip(%s.shape, %s.strides) "
                                "if dim > 1)"
                                % (arg.name, arg.name))
                        gen("_lpy_expected = tuple(stride "
                                "for (dim, stride) in zip(%s.shape, %s) "
                                "if dim > 1)"
                                % (arg.name, strify_tuple(sym_strides)))

                        gen('raise TypeError("strides mismatch on '
                                "argument '%s' "
                                "(after removing unit length dims, "
                                'got: %%s, expected: %%s)" '
                                "%% (_lpy_got, _lpy_expected))"
                                % arg.name)

                if not arg.allows_offset:
                    gen("if hasattr({}, 'offset') and {}.offset:".format(
                            arg.name, arg.name))
                    with Indentation(gen):
                        gen("raise ValueError(\"Argument '%s' does not "
                                "allow arrays with offsets. Try passing "
                                "default_offset=loopy.auto to make_kernel()."
                                '")' % arg.name)
                        gen("")

        # }}}

        if possibly_made_by_loopy and not options.skip_arg_checks:
            gen("del _lpy_made_by_loopy")
            gen("")

        if arg.arg_class in [lp.ArrayArg, lp.ConstantArg]:
            args.append(self.get_arg_pass(arg))
        else:
            args.append("%s" % arg.name)

        gen("")

        gen("# }}}")
        gen("")

    gen("# }}}")
    gen("")

    return args
def emit_multiple_assignment(self, codegen_state, insn):
    """Generate code for an instruction assigning the results of a function
    call to (possibly) several left-hand sides.

    The first result is assigned from the call's return value; each further
    left-hand side is passed to the call as an additional address-of
    (``&``) argument.

    :returns: an :class:`cgen.Assign`, or a :class:`cgen.ExpressionStatement`
        if the mangled function has no results. Calls mangled to
        ``loopy_make_tuple`` are forwarded to :meth:`emit_tuple_assignment`.
    :raises RuntimeError: if no function mangler knows the function.
    :raises LoopyError: if the type of a further left-hand side does not
        match the corresponding result type of the function.
    """
    ecm = codegen_state.expression_to_code_mapper

    from pymbolic import var
    from pymbolic.primitives import Variable
    from pymbolic.mapper.stringifier import PREC_NONE

    func_id = insn.expression.function
    parameters = insn.expression.parameters

    if isinstance(func_id, Variable):
        func_id = func_id.name

    assignee_var_descriptors = [
            codegen_state.kernel.get_var_descriptor(a)
            for a in insn.assignee_var_names()]

    par_dtypes = tuple(ecm.infer_type(par) for par in parameters)

    mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
    if mangle_result is None:
        raise RuntimeError("function '%s' unknown--"
                "maybe you need to register a function mangler?"
                % func_id)

    assert mangle_result.arg_dtypes is not None

    if mangle_result.target_name == "loopy_make_tuple":
        # This shortcut avoids actually having to emit a 'make_tuple' function.
        return self.emit_tuple_assignment(codegen_state, insn)

    from loopy.expression import dtype_to_type_context
    c_parameters = [
            ecm(par, PREC_NONE,
                dtype_to_type_context(self.target, tgt_dtype),
                tgt_dtype).expr
            for par, tgt_dtype in zip(
                parameters, mangle_result.arg_dtypes)]

    from loopy.codegen import SeenFunction
    codegen_state.seen_functions.add(
            SeenFunction(func_id,
                mangle_result.target_name,
                mangle_result.arg_dtypes))

    # The first left-hand side receives the return value (see below), so
    # this loop starts at the second one--hence 1-based numbering from 2.
    for lhs_number, (a, tgt_dtype) in enumerate(
            zip(insn.assignees[1:], mangle_result.result_dtypes[1:]), 2):
        if tgt_dtype != ecm.infer_type(a):
            raise LoopyError("type mismatch in %d'th (1-based) left-hand "
                    "side of instruction '%s'" % (lhs_number, insn.id))
        c_parameters.append(
                # TODO Yuck: The "where-at function": &(...)
                var("&")(
                    ecm(a, PREC_NONE,
                        dtype_to_type_context(self.target, tgt_dtype),
                        tgt_dtype).expr))

    result = var(mangle_result.target_name)(*c_parameters)

    # In case of no assignees, we are done
    if len(mangle_result.result_dtypes) == 0:
        from cgen import ExpressionStatement
        return ExpressionStatement(
                CExpression(self.get_c_expression_to_code_mapper(), result))

    result = ecm.wrap_in_typecast(
            mangle_result.result_dtypes[0],
            assignee_var_descriptors[0].dtype,
            result)

    lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)

    from cgen import Assign
    return Assign(
            lhs_code,
            CExpression(self.get_c_expression_to_code_mapper(), result))
def _fuse_two_kernels(knla, knlb):
    """Fuse *knla* and *knlb* into a single new kernel.

    Arguments and temporaries of *knlb* whose names are already taken are
    renamed, with the renames applied to the expressions of *knlb* before
    the instruction streams are combined.

    :returns: a tuple ``(fused_kernel, old_b_id_to_new_b_id)``, where the
        second entry is the instruction ID mapping returned by
        ``fuse_instruction_streams_with_unique_ids``.
    :raises LoopyError: if either kernel is not in the ``INITIAL`` state, or
        if the kernels disagree on a shared domain, a same-named argument
        or the assumptions.
    """
    from loopy.kernel import kernel_state
    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
        raise LoopyError("can only fuse kernels in INITIAL state")

    # {{{ fuse domains

    new_domains = knla.domains[:]

    for dom_b in knlb.domains:
        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
        if i_fuse is None:
            new_domains.append(dom_b)
        else:
            dom_a = new_domains[i_fuse]
            dom_a, dom_b = isl.align_two(dom_a, dom_b)

            shared_inames = list(
                    set(dom_a.get_var_dict(dim_type.set))
                    &
                    set(dom_b.get_var_dict(dim_type.set)))

            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
            # NOTE(review): this projects dom_a again, not dom_b, so the
            # agreement check below compares a set with itself. Presumably
            # dom_b was intended; since shared_inames is computed after
            # align_two(), confirm the intended semantics before changing.
            dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set])

            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
                raise LoopyError("kernels do not agree on domain of "
                        "inames '%s'" % (",".join(shared_inames)))

            new_domain = dom_a & dom_b

            new_domains[i_fuse] = new_domain

    # }}}

    vng = knla.get_var_name_generator()
    # maps names in knlb to the variables replacing them
    b_var_renames = {}

    # {{{ fuse args

    new_args = knla.args[:]
    for b_arg in knlb.args:
        if b_arg.name not in knla.arg_dict:
            new_arg_name = vng(b_arg.name)

            if new_arg_name != b_arg.name:
                b_var_renames[b_arg.name] = var(new_arg_name)

            new_args.append(b_arg.copy(name=new_arg_name))
        else:
            # Same-named arguments are shared and must be identical.
            if b_arg != knla.arg_dict[b_arg.name]:
                raise LoopyError(
                        "argument '{arg_name}' has inconsistent definition between "
                        "the two kernels being merged ({arg_a} <-> {arg_b})"
                        .format(
                            arg_name=b_arg.name,
                            arg_a=str(knla.arg_dict[b_arg.name]),
                            arg_b=str(b_arg)))

    # }}}

    # {{{ fuse temporaries

    new_temporaries = knla.temporary_variables.copy()
    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
        assert b_name == b_tv.name

        new_tv_name = vng(b_name)

        if new_tv_name != b_name:
            b_var_renames[b_name] = var(new_tv_name)

        assert new_tv_name not in new_temporaries
        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)

    # }}}

    knlb = _apply_renames_in_exprs(knlb, b_var_renames)

    from pymbolic.imperative.transform import \
            fuse_instruction_streams_with_unique_ids
    new_instructions, old_b_id_to_new_b_id = \
            fuse_instruction_streams_with_unique_ids(
                    knla.instructions, knlb.instructions)

    # {{{ fuse assumptions

    assump_a = knla.assumptions
    assump_b = knlb.assumptions
    assump_a, assump_b = isl.align_two(assump_a, assump_b)

    # NOTE(review): names are gathered from dim_type.set here but projected
    # with dim_type.param below--confirm which dimension type is intended.
    shared_param_names = list(
            set(assump_a.get_var_dict(dim_type.set))
            &
            set(assump_b.get_var_dict(dim_type.set)))

    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
    # NOTE(review): this projects assump_a again, not assump_b, so the check
    # below compares a set with itself. Presumably assump_b was intended;
    # confirm before changing.
    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])

    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
        raise LoopyError("assumptions do not agree on kernels to be merged")

    new_assumptions = (assump_a & assump_b).params()

    # }}}

    from loopy.kernel import LoopKernel
    return LoopKernel(
            domains=new_domains,
            instructions=new_instructions,
            args=new_args,
            name="%s_and_%s" % (knla.name, knlb.name),
            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
            preamble_generators=_ordered_merge_lists(
                knla.preamble_generators, knlb.preamble_generators),
            assumptions=new_assumptions,
            local_sizes=_merge_dicts(
                "local size", knla.local_sizes, knlb.local_sizes),
            temporary_variables=new_temporaries,
            iname_to_tag=_merge_dicts(
                "iname-to-tag mapping",
                knla.iname_to_tag,
                knlb.iname_to_tag),
            substitutions=_merge_dicts(
                "substitution",
                knla.substitutions,
                knlb.substitutions),
            function_manglers=_ordered_merge_lists(
                knla.function_manglers,
                knlb.function_manglers),
            symbol_manglers=_ordered_merge_lists(
                knla.symbol_manglers,
                knlb.symbol_manglers),

            iname_slab_increments=_merge_dicts(
                "iname slab increment",
                knla.iname_slab_increments,
                knlb.iname_slab_increments),
            loop_priority=_ordered_merge_lists(
                knla.loop_priority,
                knlb.loop_priority),
            silenced_warnings=_ordered_merge_lists(
                knla.silenced_warnings,
                knlb.silenced_warnings),
            applied_iname_rewrites=_ordered_merge_lists(
                knla.applied_iname_rewrites,
                knlb.applied_iname_rewrites),
            index_dtype=_merge_values(
                "index dtype",
                knla.index_dtype,
                knlb.index_dtype),
            target=_merge_values(
                "target",
                knla.target,
                knlb.target),
            # options are taken from knla only
            options=knla.options), old_b_id_to_new_b_id
def emit_assignment(self, codegen_state, insn):
    """Generate ISPC code for the single-assignee assignment *insn*.

    Instructions tagged ``"!streaming_store"`` are emitted as a call to
    ``streaming_store``; this requires that the local axis 0 iname occurs in
    the (single) subscript exactly once, as a bare summand. All other
    instructions become a plain assignment.

    :returns: a :class:`cgen.Statement` for streaming stores, otherwise a
        :class:`cgen.Assign`.
    :raises NotImplementedError: if *insn* is atomic.
    :raises LoopyError: if a streaming store is requested on an unsupported
        array type, without exactly one subscript, with a non-unit-stride
        use of the local axis 0 iname, or with a short-vector data type.
    """
    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    assignee_var_name, = insn.assignee_var_names()

    lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity:
        raise NotImplementedError("atomic ops in ISPC")

    from loopy.expression import dtype_to_type_context
    from pymbolic.mapper.stringifier import PREC_NONE

    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    rhs_code = ecm(insn.expression, prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype)

    lhs = insn.assignee

    # {{{ handle streaming stores

    if "!streaming_store" in insn.tags:
        ary = ecm.find_array(lhs)

        from loopy.kernel.array import get_access_info
        from pymbolic import evaluate

        from loopy.symbolic import simplify_using_aff
        index_tuple = tuple(
                simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)

        # Use the codegen_state passed to this method; nothing in this
        # method sets a 'codegen_state' attribute on self.
        access_info = get_access_info(kernel.target, ary, index_tuple,
                lambda expr: evaluate(expr, codegen_state.var_subst_map),
                codegen_state.vectorization_info)

        from loopy.kernel.data import GlobalArg, TemporaryVariable

        if not isinstance(ary, (GlobalArg, TemporaryVariable)):
            raise LoopyError("array type not supported in ISPC: %s"
                    % type(ary).__name__)

        if len(access_info.subscripts) != 1:
            raise LoopyError("streaming stores must have a subscript")
        subscript, = access_info.subscripts

        from pymbolic.primitives import Sum, flattened_sum, Variable
        if isinstance(subscript, Sum):
            terms = subscript.children
        else:
            # A subscript that is not a sum is its own, only term.
            terms = (subscript, )

        from loopy.kernel.data import LocalIndexTag
        from loopy.symbolic import get_dependencies

        def is_local_axis_0(iname):
            # True if *iname* is tagged as local hardware axis 0.
            tag = kernel.iname_to_tag.get(iname)
            return isinstance(tag, LocalIndexTag) and tag.axis == 0

        # terms of the subscript other than the bare local axis 0 iname
        new_terms = []

        saw_l0 = False
        for term in terms:
            if isinstance(term, Variable) and is_local_axis_0(term.name):
                if saw_l0:
                    raise LoopyError("streaming store must have stride 1 "
                            "in local index, got: %s" % subscript)
                saw_l0 = True
                continue
            else:
                # The local axis 0 iname may not occur inside any other term.
                for dep in get_dependencies(term):
                    if is_local_axis_0(dep):
                        raise LoopyError(
                                "streaming store must have stride 1 "
                                "in local index, got: %s" % subscript)

                new_terms.append(term)

        if not saw_l0:
            raise LoopyError("streaming store must have stride 1 in "
                    "local index, got: %s" % subscript)

        if access_info.vector_index is not None:
            raise LoopyError("streaming store may not use a short-vector "
                    "data type")

        rhs_has_programindex = any(
                is_local_axis_0(dep)
                for dep in get_dependencies(insn.expression))

        if not rhs_has_programindex:
            rhs_code = "broadcast(%s, 0)" % rhs_code

        from cgen import Statement
        return Statement(
                "streaming_store(%s + %s, %s)"
                % (
                    access_info.array_name,
                    ecm(flattened_sum(new_terms), PREC_NONE, 'i'),
                    rhs_code))

    # }}}

    from cgen import Assign
    return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
def __init__(self, name, dtype=None, shape=auto, address_space=None,
        dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
        base_indices=None, storage_shape=None,
        base_storage=None, initializer=None, read_only=False,
        _base_storage_access_may_be_aliasing=False, **kwargs):
    """
    :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
    :arg shape: :class:`loopy.auto` or a shape tuple
    :arg base_indices: :class:`loopy.auto` or a tuple of base indices
    :arg address_space: defaults to :class:`loopy.auto` if not given.
        The deprecated keyword argument *scope* is accepted as an alias.
    :arg initializer: *None* or a :class:`numpy.ndarray`. If given, *dtype*
        and *shape* are taken from it when left unspecified and must match
        it otherwise; *offset* must be 0 and *read_only* must be *True*.
    :arg order: defaults to ``"C"``.

    :raises ValueError: if both *scope* and *address_space* are given.
    :raises LoopyError: if the arguments are inconsistent with each other.
    """

    scope = kwargs.pop("scope", None)

    if scope is not None:
        warn("Passing 'scope' is deprecated. Use 'address_space' instead.",
                DeprecationWarning, stacklevel=2)

        if address_space is not None:
            raise ValueError("only one of 'scope' and 'address_space' "
                    "may be specified")
        else:
            address_space = scope

    del scope

    # address_space can no longer be None past this point.
    if address_space is None:
        address_space = auto

    if initializer is None:
        pass
    elif isinstance(initializer, np.ndarray):
        if offset != 0:
            raise LoopyError("temporary variable '%s': "
                    "offset must be 0 if initializer specified"
                    % name)

        from loopy.types import NumpyType, to_loopy_type
        if dtype is auto or dtype is None:
            dtype = NumpyType(initializer.dtype)
        elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype):
            raise LoopyError("temporary variable '%s': "
                    "dtype of initializer does not match "
                    "dtype of array."
                    % name)

        if shape is auto:
            shape = initializer.shape
        elif shape != initializer.shape:
            raise LoopyError("Shape of '{}' does not match that of the"
                    " initializer.".format(name))
    else:
        raise LoopyError("temporary variable '%s': "
                "initializer must be None or a numpy array"
                % name)

    if order is None:
        order = "C"

    if base_indices is None and shape is not auto:
        base_indices = (0, ) * len(shape)

    if not read_only and initializer is not None:
        raise LoopyError("temporary variable '%s': "
                "read-write variables with initializer "
                "are not currently supported "
                "(did you mean to set read_only=True?)"
                % name)

    if base_storage is not None and initializer is not None:
        raise LoopyError("temporary variable '%s': "
                "base_storage and initializer are "
                "mutually exclusive"
                % name)

    if base_storage is None and _base_storage_access_may_be_aliasing:
        raise LoopyError(
                "temporary variable '%s': "
                "_base_storage_access_may_be_aliasing option, but no "
                "base_storage given!"
                % name)

    ArrayBase.__init__(self, name=intern(name),
            dtype=dtype, shape=shape, strides=strides,
            dim_tags=dim_tags, offset=offset, dim_names=dim_names,
            order=order,
            base_indices=base_indices,
            address_space=address_space,
            storage_shape=storage_shape,
            base_storage=base_storage,
            initializer=initializer,
            read_only=read_only,
            _base_storage_access_may_be_aliasing=(
                _base_storage_access_may_be_aliasing),
            **kwargs)
def map_local_hw_index(self, expr, type_context):
    """Map a local hardware axis index to ISPC's ``programIndex``.

    :raises LoopyError: if *expr* refers to an axis other than 0.
    """
    if expr.axis != 0:
        raise LoopyError("ISPC only supports one local axis")

    index_ctype = self._get_index_ctype()
    return var("(varying %s) programIndex" % index_ctype)
def map_subscript(self, expr):
    """Check that the subscript *expr* stays within the bounds of the
    array it accesses.

    Nothing is checked if the array's shape is unknown, if the subscript or
    the shape depend on variables outside of ``self.domain``, or if the
    access range cannot be computed.

    :raises LoopyError: if the number of indices does not match the number
        of array axes, or if an out-of-bounds element is accessed.
    """
    WalkMapper.map_subscript(self, expr)

    from pymbolic.primitives import Variable
    assert isinstance(expr.aggregate, Variable)

    # {{{ look up the shape of the accessed array, if any

    knl = self.kernel
    ary_name = expr.aggregate.name

    if ary_name in knl.arg_dict:
        shape = knl.arg_dict[ary_name].shape
    elif ary_name in knl.temporary_variables:
        shape = knl.temporary_variables[ary_name].shape
    else:
        shape = None

    if shape is None:
        return

    # }}}

    indices = expr.index
    if not isinstance(indices, tuple):
        indices = (indices,)

    from loopy.symbolic import get_dependencies, get_access_range

    known_vars = set(self.domain.get_var_dict())

    shape_deps = set()
    for axis_len in shape:
        if axis_len is None:
            continue
        shape_deps.update(get_dependencies(axis_len))

    # Only check accesses that are fully described by the domain.
    if (not (get_dependencies(indices) <= known_vars)
            or not (shape_deps <= known_vars)):
        return

    if len(indices) != len(shape):
        raise LoopyError("subscript to '%s' in '%s' has the wrong "
                "number of indices (got: %d, expected: %d)" % (
                    expr.aggregate.name, expr,
                    len(indices), len(shape)))

    try:
        access_range = get_access_range(
                self.domain, indices, self.kernel.assumptions)
    except (isl.Error, TypeError):
        # Likely: index was non-linear, nothing we can do.
        return

    from loopy.isl_helpers import make_slab

    # Intersect one slab [0, axis_len) per axis of known length.
    shape_domain = isl.BasicSet.universe(access_range.get_space())
    for idim, axis_len in enumerate(shape):
        if axis_len is None:
            continue

        shape_domain = shape_domain.intersect(
                make_slab(
                    shape_domain.get_space(), (dim_type.in_, idim),
                    0, axis_len))

    if access_range.is_subset(shape_domain):
        return

    raise LoopyError("'%s' in instruction '%s' "
            "accesses out-of-bounds array element"
            % (expr, self.insn_id))
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build the generated-code statements that pass value (non-array)
    arguments to the OpenCL kernel object ``_lpy_knl``.

    :arg devices: the target devices; entries may be *None*, in which case
        a warning is issued and no device-specific workaround is enabled
        for that entry.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)``: the
        generated statements as a :class:`genpy.Suite`, a mapping from the
        index in *implemented_data_info* to the OpenCL argument index at
        which that argument starts, and the total number of OpenCL
        arguments.
    :raises TypeError: for complex types other than complex64/complex128.
    :raises LoopyError: for value arguments of an unsupported type.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        count_bug_per_dev = [False] * len(devices)

    else:
        count_bug_per_dev = [
            has_struct_arg_count_bug(dev) if dev is not None else False
            for dev in devices
        ]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
             "workarounds for broken OpenCL implementations "
             "(such as those relating to complex numbers) "
             "may not be enabled when needed".format(knl_name=kernel.name))

    # The workaround is only applied if *all* devices need it.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # number of floating point values passed so far
    fp_arg_count = 0

    from genpy import (Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(
                If(
                    "%s is None" % idi.name,
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(
                Comment("cast to Python int to avoid trouble "
                        "with struct packing or Boost.Python"))
            if sys.version_info < (3, ):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                     "some (but not all) of the target devices mishandle "
                     "struct kernel arguments (hence the workaround is "
                     "disabled)".format(knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: real and imaginary part are passed as two
                # separate OpenCL arguments.
                gen(
                    Assign(
                        "_lpy_buf",
                        "_lpy_pack('{arg_char}', {arg_var}.real)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(
                    Assign(
                        "_lpy_buf",
                        "_lpy_pack('{arg_char}', {arg_var}.imag)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                gen(
                    Assign(
                        "_lpy_buf", "_lpy_pack('{arg_char}{arg_char}', "
                        "{arg_var}.real, {arg_var}.imag)".format(
                            arg_char=arg_char, arg_var=idi.name)))
                gen(
                    S("_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)".format(
                        cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(
                S("_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))" %
                  (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'" %
                             idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
def check_implemented_domains(kernel, implemented_domains, code=None):
    """Check, instruction by instruction, that the domain on which code was
    generated agrees with the domain the kernel asks for.

    :arg kernel: the kernel supplying instructions, inames, iname tags and
        assumptions.
    :arg implemented_domains: a mapping from instruction ID to a non-empty
        sequence of sets. Their union is taken as the domain on which the
        instruction was implemented.
    :arg code: if not *None*, this is printed (after being passed through
        :func:`loopy.compiled.get_highlighted_cl_code`) before a mismatch
        is reported.
    :returns: *True* if no mismatch was found.
    :raises LoopyError: for the first instruction whose implemented and
        desired domains differ. The message contains both domains and,
        for each direction of the difference, a sample point and a gist.
    """
    from islpy import align_two, dim_type
    from loopy.kernel.data import LocalIndexTag
    from loopy.kernel.instruction import BarrierInstruction

    prev_idomains = None
    prev_inames = None

    for insn_id, idomains in six.iteritems(implemented_domains):
        insn = kernel.id_to_insn[insn_id]
        assert idomains
        insn_inames = kernel.insn_inames(insn)

        # {{{ same domains and inames as the previous instruction: skip

        if (prev_idomains is not None and prev_inames is not None
                and idomains == prev_idomains
                and insn_inames == prev_inames):
            continue

        prev_idomains = idomains
        prev_inames = insn_inames

        # }}}

        is_barrier = isinstance(insn, BarrierInstruction)

        # {{{ implemented domain

        impl_domain = idomains[0]
        for piece in idomains[1:]:
            impl_domain = impl_domain | piece

        param_assumptions = isl.BasicSet.from_params(kernel.assumptions)
        aligned_assumptions, impl_domain = align_two(
                param_assumptions, impl_domain)
        impl_domain = impl_domain & aligned_assumptions
        impl_domain = impl_domain.project_out_except(
                insn_inames, [dim_type.set])

        if is_barrier:
            # project out local-id-mapped inames, solves #94 on gitlab
            non_lid_inames = frozenset(
                    iname for iname in insn_inames
                    if not isinstance(
                        kernel.iname_to_tag.get(iname), LocalIndexTag))
            impl_domain = impl_domain.project_out_except(
                    non_lid_inames, [dim_type.set])

        # }}}

        # {{{ desired domain

        insn_domain = kernel.get_inames_domain(insn_inames)
        insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
        aligned_assumptions, insn_domain = align_two(
                param_assumptions, insn_domain)

        desired_domain = insn_domain & aligned_assumptions
        desired_domain = desired_domain.project_out_except(
                insn_inames, [dim_type.set])
        desired_domain = desired_domain.project_out_except(
                insn_parameters, [dim_type.param])

        if is_barrier:
            # project out local-id-mapped inames, solves #94 on gitlab
            desired_domain = desired_domain.project_out_except(
                    non_lid_inames, [dim_type.set])

        # }}}

        # Restrict both sides to the same parameters before comparing.
        impl_domain = impl_domain.project_out_except(
                insn_parameters, [dim_type.param])
        impl_domain, desired_domain = align_two(impl_domain, desired_domain)

        if impl_domain != desired_domain:
            # {{{ assemble a diagnostic and bail out

            parameter_inames = {
                    insn_domain.get_dim_name(dim_type.param, i)
                    for i in range(impl_domain.dim(dim_type.param))}

            comparisons = [
                    ("implemented", "desired",
                        impl_domain - desired_domain,
                        desired_domain.gist(impl_domain)),
                    ("desired", "implemented",
                        desired_domain - impl_domain,
                        impl_domain.gist(desired_domain)),
                    ]

            lines = []
            for bigger, smaller, diff_set, gist_domain in comparisons:
                if diff_set.is_empty():
                    continue

                pt = diff_set.coalesce().sample_point()
                assert not pt.is_void()

                iname_to_dim = pt.get_space().get_var_dict()
                point_axes = []
                for iname in kernel.insn_inames(insn) | parameter_inames:
                    tp, dim = iname_to_dim[iname]
                    coordinate = pt.get_coordinate_val(tp, dim).to_python()
                    point_axes.append("%s=%d" % (iname, coordinate))

                lines.append(
                        "sample point in %s but not %s: %s" % (
                            bigger, smaller, ", ".join(point_axes)))
                lines.append(
                        "gist of constraints in %s but not %s: %s" % (
                            smaller, bigger, gist_domain))

            if code is not None:
                rule = 79*"-"
                print(rule)
                print("CODE:")
                print(rule)
                from loopy.compiled import get_highlighted_cl_code
                print(get_highlighted_cl_code(code))
                print(rule)

            raise LoopyError("sanity check failed--implemented and desired "
                    "domain for instruction '%s' do not match\n\n"
                    "implemented: %s\n\n"
                    "desired:%s\n\n%s"
                    % (insn_id, impl_domain, desired_domain,
                        "\n".join(lines)))

            # }}}

    # placate the assert at the call site
    return True
def parse_match(expr):
    """Turn a match expression into a match object.

    Syntax examples::

    * ``id:yoink and writes:a_temp``
    * ``id:yoink and (not writes:a_temp or tag:input)``

    :arg expr: one of

        * a false-ish value, for which ``All()`` is returned,
        * an instance of :class:`MatchExpressionBase`, which is returned
          unchanged, or
        * a string in the syntax shown above.

    :raises LoopyError: if *expr* contains a token the lexer rejects.
        Structural problems (premature end, leftover input, a missing
        terminal) are reported through the lexer iterator's own
        error-raising methods.
    """
    if not expr:
        return All()

    if isinstance(expr, MatchExpressionBase):
        return expr

    def parse_terminal(pstate):
        # Each terminal token carries its argument in group 1 of its
        # match object.
        next_tag = pstate.next_tag()

        terminal_classes = (
                (_id, Id),
                (_tag, Tagged),
                (_writes, Writes),
                (_reads, Reads),
                (_in_kernel, InKernel),
                (_iname, Iname),
                )

        for tag, match_cls in terminal_classes:
            if next_tag is tag:
                terminal = match_cls(pstate.next_match_obj().group(1))
                pstate.advance()
                return terminal

        pstate.expected("terminal")

    def inner_parse(pstate, min_precedence=0):
        pstate.expect_not_end()

        # {{{ prefix: negation, parenthesized subexpression or terminal

        if pstate.is_next(_not):
            pstate.advance()
            query = Not(inner_parse(pstate, _PREC_NOT))
        elif pstate.is_next(_openpar):
            pstate.advance()
            query = inner_parse(pstate)
            pstate.expect(_closepar)
            pstate.advance()
        else:
            query = parse_terminal(pstate)

        # }}}

        # {{{ absorb binary operators binding tighter than min_precedence

        while not pstate.is_at_end():
            next_tag = pstate.next_tag()

            if next_tag is _and and _PREC_AND > min_precedence:
                pstate.advance()
                query = And((query, inner_parse(pstate, _PREC_AND)))
            elif next_tag is _or and _PREC_OR > min_precedence:
                pstate.advance()
                query = Or((query, inner_parse(pstate, _PREC_OR)))
            else:
                break

        # }}}

        return query

    from pytools.lex import LexIterator, lex, InvalidTokenError

    try:
        tokens = [
                (tag, s, idx, matchobj)
                for (tag, s, idx, matchobj) in lex(
                    _LEX_TABLE, expr, match_objects=True)
                if tag is not _whitespace]
        pstate = LexIterator(tokens, expr)
    except InvalidTokenError as e:
        from loopy.diagnostic import LoopyError
        raise LoopyError(
                "invalid match expression: '{match_expr}' ({err_type}: {err_str})".
                format(
                    match_expr=expr,
                    err_type=type(e).__name__,
                    err_str=str(e)))

    if pstate.is_at_end():
        pstate.raise_parse_error("unexpected end of input")

    result = inner_parse(pstate)

    if not pstate.is_at_end():
        pstate.raise_parse_error("leftover input after completed parse")

    return result
def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
        lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
    """Generate a compare-and-exchange retry loop that updates *lhs_expr*
    with *rhs_expr*.

    The emitted code declares an "old value" and a "new value" variable
    and runs a do-while loop. The loop body reads the left-hand side into
    the old value and evaluates the right-hand side (with *lhs_expr*
    replaced by the old value) into the new value. The loop repeats while
    the exchange function returns something other than the old value.

    :arg codegen_state: supplies the variable name generator and the
        expression-to-code mapper.
    :arg lhs_atomicity: not used by this implementation.
    :arg lhs_var: the variable being updated. Only consulted for
        floating-point data, to choose the address-space qualifier of the
        pointer cast.
    :arg lhs_expr: the expression being assigned to.
    :arg rhs_expr: the expression supplying the new value.
    :arg lhs_dtype: the type of the left-hand side. Must be a
        :class:`NumpyType` wrapping a 32- or 64-bit integer or float.
    :arg rhs_type_context: passed on as the type context when generating
        code for the right-hand side.
    :returns: a :class:`cgen.Block`.
    :raises NotImplementedError: if *lhs_dtype* is not one of the
        supported types.
    :raises LoopyError: if the item size is neither 4 nor 8 bytes, or if
        a floating-point *lhs_var* is not a global or local array argument
        or temporary.
    """
    from pymbolic.mapper.stringifier import PREC_NONE

    # FIXME: Could detect operations, generate atomic_{add,...} when
    # appropriate.

    is_supported = (
            isinstance(lhs_dtype, NumpyType)
            and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64])
    if not is_supported:
        raise NotImplementedError("atomic update for '%s'" % lhs_dtype)

    from cgen import Assign, Block, DoWhile
    from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable
    from loopy.symbolic import SubstitutionMapper
    from loopy.target.c import POD
    from pymbolic import var
    from pymbolic.mapper.substitutor import make_subst_func

    old_val_var = codegen_state.var_name_generator("loopy_old_val")
    new_val_var = codegen_state.var_name_generator("loopy_new_val")

    # {{{ generate code for both sides of the update

    scratch_vars = {
            name: TemporaryVariable(name, lhs_dtype, shape=())
            for name in (old_val_var, new_val_var)}
    ecm = codegen_state.expression_to_code_mapper.with_assignments(
            scratch_vars)

    lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

    # The right-hand side must see the value captured in the old-value
    # variable rather than re-reading the left-hand side.
    use_old_value = SubstitutionMapper(
            make_subst_func({lhs_expr: var(old_val_var)}))
    rhs_expr_code = ecm(
            use_old_value(rhs_expr),
            prec=PREC_NONE,
            type_context=rhs_type_context,
            needed_dtype=lhs_dtype)

    # }}}

    np_dtype = lhs_dtype.numpy_dtype

    func_name = {
            4: "atomic_cmpxchg",
            8: "atom_cmpxchg",
            }.get(np_dtype.itemsize)
    if func_name is None:
        raise LoopyError("unexpected atomic size")

    cast_str = ""
    old_val = old_val_var
    new_val = new_val_var

    if np_dtype.kind == "f":
        # {{{ floats are exchanged through same-sized integer pointers

        if np_dtype == np.float32:
            ctype = "int"
        elif np_dtype == np.float64:
            ctype = "long"
        else:
            assert False

        is_array_or_temp = isinstance(lhs_var, (ArrayArg, TemporaryVariable))
        if (is_array_or_temp
                and lhs_var.address_space == AddressSpace.GLOBAL):
            var_kind = "__global"
        elif (is_array_or_temp
                and lhs_var.address_space == AddressSpace.LOCAL):
            var_kind = "__local"
        else:
            raise LoopyError("unexpected kind of variable '%s' in "
                    "atomic operation: '%s'"
                    % (lhs_var.name, type(lhs_var).__name__))

        old_val = "*(%s *) &" % ctype + old_val
        new_val = "*(%s *) &" % ctype + new_val
        cast_str = f"({var_kind} {ctype} *) "

        # }}}

    retry_condition = (
            "%(func_name)s("
            "%(cast_str)s&(%(lhs_expr)s), "
            "%(old_val)s, "
            "%(new_val)s"
            ") != %(old_val)s"
            % {
                "func_name": func_name,
                "cast_str": cast_str,
                "lhs_expr": lhs_expr_code,
                "old_val": old_val,
                "new_val": new_val,
                })

    declarations = [
            POD(self, NumpyType(lhs_dtype.dtype, target=self.target), name)
            for name in (old_val_var, new_val_var)]

    retry_loop = DoWhile(
            retry_condition,
            Block([
                Assign(old_val_var, lhs_expr_code),
                Assign(new_val_var, rhs_expr_code),
                ]))

    return Block(declarations + [retry_loop])
def alias_temporaries(kernel, names, base_name_prefix=None,
        synchronize_for_exclusive_use=True):
    """Sets all temporaries given by *names* to be backed by a single piece
    of storage.

    :arg kernel: the kernel to transform. It is not modified; an updated
        copy is returned.
    :arg names: an iterable of names of temporary variables. Any iterable
        is accepted (including one-shot iterators); it is converted to a
        list on entry. Names that do not occur among the kernel's
        temporaries have no effect on storage assignment.
    :arg synchronize_for_exclusive_use: A :class:`bool`. If ``True``, this
        also introduces ordering structures ("groups") which ensure that
        the live ranges (i.e. the regions of code where each of the
        temporaries is used) do not overlap. This allows two (or more)
        temporaries to share the same storage space as long as their live
        ranges do not need to be concurrent.
    :arg base_name_prefix: an identifier to be used for the common storage
        area. Defaults to ``"temp_storage"``.
    :returns: a copy of *kernel* with updated instructions and temporary
        variables.
    :raises LoopyError: if *synchronize_for_exclusive_use* is set and a
        single instruction refers to more than one of the temporaries, or
        if one of the temporaries already has a ``base_storage``.

    .. versionchanged:: 2016.3

        Added *synchronize_for_exclusive_use* flag.
        ``synchronize_for_exclusive_use=True`` was the previous default
        behavior.
    """
    # *names* is traversed several times below (group names, membership
    # set, index lookup). Materialize it so that one-shot iterables are
    # not silently exhausted after the first pass.
    names = list(names)

    gng = kernel.get_group_name_generator()
    group_names = [gng("tmpgrp_" + name) for name in names]

    if base_name_prefix is None:
        base_name_prefix = "temp_storage"

    vng = kernel.get_var_name_generator()
    base_name = vng(base_name_prefix)

    names_set = set(names)

    if synchronize_for_exclusive_use:
        new_insns = []
        for insn in kernel.instructions:
            temp_deps = insn.dependency_names() & names_set

            if not temp_deps:
                new_insns.append(insn)
                continue

            if len(temp_deps) > 1:
                # Sorted so that the message does not depend on set
                # iteration order.
                raise LoopyError(
                        "Instruction {insn} refers to multiple of the "
                        "temporaries being aliased, namely '{temps}'. "
                        "Cannot alias."
                        .format(
                            insn=insn.id,
                            temps=", ".join(sorted(temp_deps))))

            temp_name, = temp_deps
            temp_idx = names.index(temp_name)

            # The instruction joins the group of the temporary it touches
            # and conflicts with the groups of all other temporaries.
            group_name = group_names[temp_idx]
            other_group_names = (
                    frozenset(group_names[:temp_idx])
                    | frozenset(group_names[temp_idx + 1:]))

            new_insns.append(
                    insn.copy(
                        groups=insn.groups | frozenset([group_name]),
                        conflicts_with_groups=(
                            insn.conflicts_with_groups | other_group_names)))
    else:
        new_insns = kernel.instructions

    new_temporary_variables = {}
    for tv in kernel.temporary_variables.values():
        if tv.name in names_set:
            if tv.base_storage is not None:
                raise LoopyError(
                        "temporary variable '{tv}' already has "
                        "a defined storage array -- cannot alias".format(
                            tv=tv.name))

            new_temporary_variables[tv.name] = \
                    tv.copy(base_storage=base_name)
        else:
            new_temporary_variables[tv.name] = tv

    return kernel.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables)