def count(kernel, bset):
    try:
        return bset.card()
    except AttributeError:
        pass

    if not bset.is_box():
        from loopy.diagnostic import warn
        warn(kernel, "count_overestimate",
                "Barvinok wrappers are not installed. "
                "Counting routines may overestimate the "
                "number of integer points in your loop "
                "domain.")

    result = None

    for i in range(bset.dim(isl.dim_type.set)):
        dmax = bset.dim_max(i)
        dmin = bset.dim_min(i)

        length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + 1)

        if result is None:
            result = length
        else:
            result = result * length

    return result
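# Usage sketch (hypothetical; "knl" is assumed to be an existing loopy kernel
# and "isl" the islpy module used above). For a box domain such as
# { [i, j] : 0 <= i < 10 and 0 <= j < 4 }, the fallback path above multiplies
# the per-dimension lengths (10 and 4):
#
#     bset = isl.BasicSet("{ [i, j] : 0 <= i < 10 and 0 <= j < 4 }")
#     n = count(knl, bset)   # a PwQPolynomial equal to 40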
def get_DRAM_access_poly(knl):
    from warnings import warn
    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
            DeprecationWarning, stacklevel=2)
    return get_gmem_access_poly(knl)
def generate_vectorize_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)

    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_code(kernel, device=None):
    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    codegen_result = generate_code_v2(kernel)

    if len(codegen_result.device_programs) > 1:
        raise LoopyError("kernel passed to generate_code yielded multiple "
                "device programs. Use generate_code_v2.")

    return codegen_result.device_code(), codegen_result.implemented_data_info
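# Usage sketch (hypothetical; "knl" is assumed to be a fully constructed loopy
# kernel). The deprecated entry point still returns a (code, arg_info) pair:
#
#     code, impl_arg_info = generate_code(knl)
#     print(code)
#
# New code should call generate_code_v2(knl) directly and work with the
# returned code-generation result, as the error message above suggests.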
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.Set` instances capturing which indices of the array
    *var_name* are read/written (where *direction* is either ``read`` or
    ``write``).

    :arg ignore_uncountable: If *False*, an error will be raised for accesses
        on which the footprint cannot be determined (e.g. data-dependent or
        nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(assignee))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
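# Usage sketch (hypothetical; "knl" is assumed to be an existing loopy
# kernel). The returned dictionary is keyed by (variable name, direction):
#
#     footprints = gather_access_footprints(knl)
#     for (vname, direction), footprint in footprints.items():
#         print(vname, direction, footprint)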
def add_default_dependencies(kernel):
    logger.debug("%s: default deps" % kernel.name)

    writer_map = kernel.writer_map()

    arg_names = set(arg.name for arg in kernel.args)

    var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))

    dep_map = dict(
            (insn.id, insn.read_dependency_names() & var_names)
            for insn in kernel.instructions)

    new_insns = []
    for insn in kernel.instructions:
        if not insn.insn_deps_is_final:
            auto_deps = set()

            # {{{ add automatic dependencies

            all_my_var_writers = set()
            for var in dep_map[insn.id]:
                var_writers = writer_map.get(var, set())
                all_my_var_writers |= var_writers

                if not var_writers and var not in arg_names:
                    warn(kernel, "read_no_write(%s)" % var,
                            "temporary variable '%s' is read, but never "
                            "written." % var)

                if len(var_writers) == 1:
                    auto_deps.update(var_writers - set([insn.id]))

            # }}}

            insn_deps = insn.insn_deps
            if insn_deps is None:
                insn_deps = frozenset()

            insn = insn.copy(insn_deps=frozenset(auto_deps) | insn_deps)

        new_insns.append(insn)

    return kernel.copy(instructions=new_insns)
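# Illustrative sketch (hypothetical kernel; the instruction ids are chosen
# only for this example): given
#
#     knl = lp.make_kernel(
#             "{ [i] : 0 <= i < n }",
#             """
#             t[i] = a[i]        {id=write_t}
#             out[i] = 2*t[i]    {id=read_t}
#             """)
#
# "read_t" reads the variable "t", whose only writer is "write_t", so
# add_default_dependencies gives "read_t" an automatic dependency on
# "write_t" (unless the instruction's insn_deps are marked final).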
def try_vectorized(self, what, func):
    """If *self* is in a vectorizing state (:attr:`vectorization_info` is
    not None), tries to call func (which must be a callable accepting a
    single :class:`CodeGenerationState` argument). If this fails with
    :exc:`Unvectorizable`, it unrolls the vectorized loop instead.

    *func* should return a :class:`GeneratedCode` instance.

    :returns: :class:`GeneratedCode`
    """

    if self.vectorization_info is None:
        return func(self)

    try:
        return func(self)
    except Unvectorizable as e:
        warn(self.kernel, "vectorize_failed",
                "Vectorization of '%s' failed because '%s'"
                % (what, e))

        return self.unvectorize(func)
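# Usage sketch (hypothetical; "gen_insn_code" stands in for whatever
# per-instruction code generator the caller uses):
#
#     result = codegen_state.try_vectorized(
#             "instruction '%s'" % insn.id,
#             lambda cgs: gen_insn_code(cgs, insn))
#
# If the callable raises Unvectorizable, the "vectorize_failed" warning above
# is emitted and the loop is unrolled via unvectorize() instead.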
def count(kernel, set):
    try:
        return set.card()
    except AttributeError:
        pass

    count = isl.PwQPolynomial.zero(
            set.space
            .drop_dims(dim_type.set, 0, set.dim(dim_type.set))
            .add_dims(dim_type.set, 1))

    set = set.make_disjoint()

    from loopy.isl_helpers import get_simple_strides

    for bset in set.get_basic_sets():
        bset_count = None
        bset_rebuilt = bset.universe(bset.space)

        bset_strides = get_simple_strides(bset, key_by="index")

        for i in range(bset.dim(isl.dim_type.set)):
            dmax = bset.dim_max(i)
            dmin = bset.dim_min(i)

            stride = bset_strides.get((dim_type.set, i))
            if stride is None:
                stride = 1

            length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride)
            length = length.scale_down_val(stride)

            if bset_count is None:
                bset_count = length
            else:
                bset_count = bset_count * length

            # {{{ rebuild check domain

            zero = isl.Aff.zero_on_domain(
                    isl.LocalSpace.from_space(bset.space))
            iname = isl.PwAff.from_aff(
                    zero.set_coefficient_val(isl.dim_type.in_, i, 1))
            dmin_matched = dmin.insert_dims(
                    dim_type.in_, 0, bset.dim(isl.dim_type.set))
            dmax_matched = dmax.insert_dims(
                    dim_type.in_, 0, bset.dim(isl.dim_type.set))
            for idx in range(bset.dim(isl.dim_type.set)):
                if bset.has_dim_id(isl.dim_type.set, idx):
                    dim_id = bset.get_dim_id(isl.dim_type.set, idx)
                    dmin_matched = dmin_matched.set_dim_id(
                            isl.dim_type.in_, idx, dim_id)
                    dmax_matched = dmax_matched.set_dim_id(
                            isl.dim_type.in_, idx, dim_id)

            bset_rebuilt = (
                    bset_rebuilt
                    & iname.le_set(dmax_matched)
                    & iname.ge_set(dmin_matched)
                    & (iname-dmin_matched).mod_val(stride).eq_set(zero))

            # }}}

        if bset_count is not None:
            count += bset_count

        is_subset = bset <= bset_rebuilt
        is_superset = bset >= bset_rebuilt

        if not (is_subset and is_superset):
            if is_subset:
                from loopy.diagnostic import warn
                warn(kernel, "count_overestimate",
                        "Barvinok wrappers are not installed. "
                        "Counting routines have overestimated the "
                        "number of integer points in your loop "
                        "domain.")
            elif is_superset:
                from loopy.diagnostic import warn
                warn(kernel, "count_underestimate",
                        "Barvinok wrappers are not installed. "
                        "Counting routines have underestimated the "
                        "number of integer points in your loop "
                        "domain.")
            else:
                from loopy.diagnostic import warn
                warn(kernel, "count_misestimate",
                        "Barvinok wrappers are not installed. "
                        "Counting routines have misestimated the "
                        "number of integer points in your loop "
                        "domain.")

    return count
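# Illustrative sketch (hypothetical; "knl" is assumed to be an existing loopy
# kernel). For a strided set, the per-dimension length is divided by the
# detected stride, e.g. { [i] : 0 <= i < 10 and i mod 2 = 0 } has
# dmin = 0, dmax = 8 and stride 2, so (8 - 0 + 2)/2 = 5 integer points:
#
#     s = isl.Set("{ [i] : 0 <= i < 10 and i mod 2 = 0 }")
#     n = count(knl, s)   # a PwQPolynomial equal to 5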
def generate_code(kernel, device=None):
    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.info("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    impl_arg_info = []

    for arg in kernel.args:
        if isinstance(arg, ArrayBase):
            impl_arg_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=arg.name in kernel.get_written_variables(),
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            impl_arg_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                cgen_declarator=arg.get_arg_decl(kernel.target),
                arg_class=ValueArg))

        else:
            raise ValueError("argument type not understood: '%s'"
                    % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.kind == "c":
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            var_subst_map={},
            allow_complex=allow_complex)

    code_str, implemented_domains = kernel.target.generate_code(
            kernel, codegen_state, impl_arg_info)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains,
            code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)

    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_generators = (kernel.preamble_generators
            + kernel.target.preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions))

    seen_preamble_tags = set()
    dedup_preambles = []

    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag in seen_preamble_tags:
            continue

        seen_preamble_tags.add(tag)
        dedup_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(lines) + "\n"
            for lines in dedup_preambles]

    code_str = "".join(preamble_codes) + code_str

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    result = code_str, impl_arg_info

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = result

    return result
def get_auto_axis_iname_ranking_by_stride(kernel, insn):
    from loopy.kernel.data import ImageArg, ValueArg

    approximate_arg_values = {}
    for arg in kernel.args:
        if isinstance(arg, ValueArg):
            if arg.approximately is not None:
                approximate_arg_values[arg.name] = arg.approximately
            else:
                raise LoopyError("No approximate arg value specified for '%s'"
                        % arg.name)

    # {{{ find all array accesses in insn

    from loopy.symbolic import ArrayAccessFinder
    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))

    from pymbolic.primitives import Subscript

    if isinstance(insn.assignee, Subscript):
        ary_acc_exprs.append(insn.assignee)

    # }}}

    # {{{ filter array accesses to only the global ones

    global_ary_acc_exprs = []

    for aae in ary_acc_exprs:
        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)
        if arg is None:
            continue

        if isinstance(arg, ImageArg):
            continue

        global_ary_acc_exprs.append(aae)

    # }}}

    # {{{ figure out automatic-axis inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    auto_axis_inames = set(
            iname
            for iname in kernel.insn_inames(insn)
            if isinstance(kernel.iname_to_tag.get(iname),
                AutoLocalIndexTagBase))

    # }}}

    # {{{ figure out which iname should get mapped to local axis 0

    # maps inames to "aggregate stride"
    aggregate_strides = {}

    from loopy.symbolic import CoefficientCollector
    from pymbolic.primitives import Variable

    for aae in global_ary_acc_exprs:
        index_expr = aae.index
        if not isinstance(index_expr, tuple):
            index_expr = (index_expr,)

        ary_name = aae.aggregate.name
        arg = kernel.arg_dict.get(ary_name)

        if arg.dim_tags is None:
            from warnings import warn
            warn("Strides for '%s' are not known. Local axis assignment "
                    "is likely suboptimal." % arg.name)
            ary_strides = [1] * len(index_expr)
        else:
            ary_strides = []
            from loopy.kernel.array import FixedStrideArrayDimTag
            for dim_tag in arg.dim_tags:
                if isinstance(dim_tag, FixedStrideArrayDimTag):
                    ary_strides.append(dim_tag.stride)

        # {{{ construct iname_to_stride_expr

        iname_to_stride_expr = {}
        for iexpr_i, stride in zip(index_expr, ary_strides):
            if stride is None:
                continue

            coeffs = CoefficientCollector()(iexpr_i)
            for var, coeff in six.iteritems(coeffs):
                if (isinstance(var, Variable)
                        and var.name in auto_axis_inames):
                    # excludes '1', i.e. the constant
                    new_stride = coeff*stride
                    old_stride = iname_to_stride_expr.get(var.name, None)
                    if old_stride is None or new_stride < old_stride:
                        iname_to_stride_expr[var.name] = new_stride

        # }}}

        from pymbolic import evaluate
        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
            stride = evaluate(stride_expr, approximate_arg_values)
            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride

    if aggregate_strides:
        very_large_stride = np.iinfo(np.int32).max

        return sorted((iname for iname in kernel.insn_inames(insn)),
                key=lambda iname: aggregate_strides.get(
                    iname, very_large_stride))
    else:
        return None
def mark_local_temporaries(kernel):
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import LocalIndexTagBase
    import loopy as lp

    writers = kernel.writer_map()

    from loopy.symbolic import get_dependencies

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.is_local is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        wants_to_be_local_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = set(iname
                    for iname in kernel.insn_inames(insn_id)
                    if isinstance(kernel.iname_to_tag.get(iname),
                        LocalIndexTagBase))

            locparallel_assignee_inames = set(iname
                    for _, assignee_indices in insn.assignees_and_indices()
                    for iname in get_dependencies(assignee_indices)
                        & kernel.all_inames()
                    if isinstance(kernel.iname_to_tag.get(iname),
                        LocalIndexTagBase))

            assert locparallel_assignee_inames <= locparallel_compute_inames

            if (locparallel_assignee_inames != locparallel_compute_inames
                    and bool(locparallel_assignee_inames)):
                warn(kernel, "write_race_local(%s)" % insn_id,
                        "instruction '%s' looks invalid: "
                        "it assigns to indices based on local IDs, but "
                        "its temporary '%s' cannot be made local because "
                        "a write race across the iname(s) '%s' would emerge. "
                        "(Do you need to add an extra iname to your prefetch?)"
                        % (insn_id, temp_var.name, ", ".join(
                            locparallel_compute_inames
                            - locparallel_assignee_inames)),
                        WriteRaceConditionWarning)

            wants_to_be_local_per_insn.append(
                    locparallel_assignee_inames == locparallel_compute_inames

                    # doesn't want to be local if there aren't any
                    # parallel inames:
                    and bool(locparallel_compute_inames))

        if not wants_to_be_local_per_insn:
            warn(kernel, "temp_to_write(%s)" % temp_var.name,
                    "temporary variable '%s' never written, eliminating"
                    % temp_var.name, LoopyAdvisory)

            continue

        is_local = any(wants_to_be_local_per_insn)

        from pytools import all
        if not all(wtbl == is_local for wtbl in wants_to_be_local_per_insn):
            raise LoopyError("not all instructions agree on whether "
                    "temporary '%s' should be in local memory"
                    % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(is_local=is_local)

    return kernel.copy(temporary_variables=new_temp_vars)
def preprocess_kernel(kernel, device=None):
    if device is not None:
        from warnings import warn
        warn("passing 'device' to preprocess_kernel() is deprecated",
                DeprecationWarning, stacklevel=2)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.INITIAL:
        raise LoopyError("cannot re-preprocess an already preprocessed "
                "kernel")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED
    if CACHING_ENABLED:
        input_kernel = kernel

        try:
            result = preprocess_cache[kernel]
            logger.info("%s: preprocess cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    logger.info("%s: preprocess start" % kernel.name)

    from loopy.subst import expand_subst
    kernel = expand_subst(kernel)

    # Ordering restriction:
    # Type inference doesn't handle substitutions. Get them out of the
    # way.

    kernel = infer_unknown_types(kernel, expect_completion=False)

    kernel = add_default_dependencies(kernel)

    # Ordering restrictions:
    #
    # - realize_reduction must happen after type inference because it needs
    #   to be able to determine the types of the reduced expressions.
    #
    # - realize_reduction must happen after default dependencies are added
    #   because it manipulates the insn_deps field, which could prevent
    #   defaults from being applied.

    kernel = realize_reduction(kernel)

    # Ordering restriction:
    # duplicate_private_temporaries_for_ilp must come next because reduction
    # accumulators need to be duplicated by it.

    kernel = duplicate_private_temporaries_for_ilp_and_vec(kernel)
    kernel = mark_local_temporaries(kernel)
    kernel = assign_automatic_axes(kernel)
    kernel = find_boostability(kernel)
    kernel = limit_boostability(kernel)

    kernel = kernel.target.preprocess(kernel)

    logger.info("%s: preprocess done" % kernel.name)

    kernel = kernel.copy(
            state=kernel_state.PREPROCESSED)

    # {{{ prepare for caching

    # PicklableDtype instances for example need to know the target they're
    # working towards in order to pickle and unpickle them. This is the first
    # pass that uses caching, so we need to be ready to pickle. This means
    # propagating this target information.

    if CACHING_ENABLED:
        input_kernel = prepare_for_caching(input_kernel)

    kernel = prepare_for_caching(kernel)

    # }}}

    if CACHING_ENABLED:
        preprocess_cache[input_kernel] = kernel

    return kernel
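# Usage sketch (hypothetical; "knl" is assumed to be a kernel in the INITIAL
# state):
#
#     knl = preprocess_kernel(knl)
#
# The result carries state PREPROCESSED; passing an already-preprocessed
# kernel back in raises the LoopyError above.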
def check_for_write_races(kernel):
    from loopy.symbolic import DependencyMapper
    from loopy.kernel.data import ParallelTag, GroupIndexTag, LocalIndexTagBase
    depmap = DependencyMapper(composite_leaves=False)

    iname_to_tag = kernel.iname_to_tag.get
    for insn in kernel.instructions:
        for assignee_name, assignee_indices in insn.assignees_and_indices():
            assignee_indices = depmap(assignee_indices)

            def strip_var(expr):
                from pymbolic.primitives import Variable
                assert isinstance(expr, Variable)
                return expr.name

            assignee_indices = set(strip_var(index)
                    for index in assignee_indices)

            assignee_inames = assignee_indices & kernel.all_inames()
            if not assignee_inames <= kernel.insn_inames(insn):
                raise LoopyError(
                        "assignee of instruction '%s' references "
                        "iname that the instruction does not depend on"
                        % insn.id)

            if assignee_name in kernel.arg_dict:
                # Any parallel tags that are not depended upon by the assignee
                # will cause write races.

                raceable_parallel_insn_inames = set(
                        iname
                        for iname in kernel.insn_inames(insn)
                        if isinstance(iname_to_tag(iname), ParallelTag))

            elif assignee_name in kernel.temporary_variables:
                temp_var = kernel.temporary_variables[assignee_name]
                if temp_var.is_local is True:
                    raceable_parallel_insn_inames = set(
                            iname
                            for iname in kernel.insn_inames(insn)
                            if isinstance(iname_to_tag(iname), ParallelTag)
                            and not isinstance(iname_to_tag(iname),
                                GroupIndexTag))

                elif temp_var.is_local is False:
                    raceable_parallel_insn_inames = set(
                            iname
                            for iname in kernel.insn_inames(insn)
                            if isinstance(iname_to_tag(iname), ParallelTag)
                            and not isinstance(iname_to_tag(iname),
                                GroupIndexTag)
                            and not isinstance(iname_to_tag(iname),
                                LocalIndexTagBase))

                else:
                    raise LoopyError("temp var '%s' hasn't decided on "
                            "whether it is local" % temp_var.name)

            else:
                raise LoopyError("invalid assignee name in instruction '%s'"
                        % insn.id)

            race_inames = \
                    raceable_parallel_insn_inames - assignee_inames

            if race_inames:
                warn(kernel, "write_race(%s)" % insn.id,
                        "instruction '%s' contains a write race: "
                        "instruction will be run across parallel iname(s) "
                        "'%s', which is/are not referenced in the lhs index"
                        % (insn.id, ",".join(race_inames)),
                        WriteRaceConditionWarning)