def process(func, state):
    """
    Apply ``func`` to every IET in ``state._efuncs`` and fold the metadata it
    returns back into ``state``.

    ``func`` is called as ``func(iet)`` and must return an ``(iet, metadata)``
    pair. The ``metadata`` keys consumed here are ``'dimensions'``,
    ``'includes'``, ``'efuncs'`` and ``'args'``.
    """
    # Build the Call graph starting from 'root'. Edges are added as
    # (callee, caller); the graph is used further below to propagate any
    # change to an efunc signature to its call sites
    dag = DAG(nodes=['root'])
    pending = ['root']
    while pending:
        caller = pending.pop(0)
        found = FindNodes(Call).visit(state._efuncs[caller])
        for name in filter_ordered([call.name for call in found]):
            if name not in state._efuncs:
                # Foreign Call (e.g., an MPI call): not part of the graph
                continue
            try:
                dag.add_node(name)
                pending.append(name)
            except KeyError:
                # `name` is already a node of `dag`
                pass
            dag.add_edge(name, caller)
    assert dag.size == len(state._efuncs)

    # Apply `func` to each node of the Call graph
    for node in dag.topological_sort():
        state._efuncs[node], metadata = func(state._efuncs[node])

        # New Dimensions introduced by `func`
        state._dimensions.extend(list(metadata.get('dimensions', [])))

        # New #include required by `func`, without duplicates
        state._includes.extend(list(metadata.get('includes', [])))
        state._includes = filter_ordered(state._includes)

        # New ElementalFunctions
        new_efuncs = OrderedDict()
        for efunc in metadata.get('efuncs', []):
            new_efuncs[efunc.name] = efunc
        state._efuncs.update(new_efuncs)

        # If `func` introduced new `args`, the call sites must be updated as
        # well, since the arguments dropped down to the efunc have increased
        args = as_tuple(metadata.get('args'))
        if not args:
            continue

        def extend(values):
            # Append only the missing `args`, so that several children asking
            # for the same input argument do not produce duplicates
            return list(values) + [a for a in args if a not in values]

        affected = [node] + dag.all_downstreams(node)
        for name in affected:
            efunc = state._efuncs[name]
            mapper = {}
            for call in FindNodes(Call).visit(efunc):
                if call.name in affected:
                    mapper[call] = call._rebuild(arguments=extend(call.arguments))
            efunc = Transformer(mapper).visit(efunc)
            if efunc.is_Callable:
                efunc = efunc._rebuild(parameters=extend(efunc.parameters))
            state._efuncs[name] = efunc
def reorder(cls, items, relations):
    """
    Return ``items`` sorted according to ``relations``.

    Raises
    ------
    ValueError
        If any of ``items`` is not an AbstractInterval.
    """
    if any(not isinstance(item, AbstractInterval) for item in items):
        found = ', '.join(str(type(item)) for item in items)
        raise ValueError(
            "Cannot create an IntervalGroup from objects of type [%s]" % found)

    # The relations are between dimensions, not intervals, so the ordering is
    # computed over the dimensions and then used to rank the intervals
    ordering = filter_ordered(toposort(relations) +
                              [item.dim for item in items])

    def rank(item):
        return ordering.index(item.dim)

    return sorted(items, key=rank)
def __init__(self, expr, dtype=None):
    """
    Store ``expr`` and ``dtype`` and derive the ``reads``, ``functions`` and
    ``dimensions`` attributes from the expression.

    ``expr`` must be an ``Eq`` whose LHS is a ``Symbol`` or an ``Indexed``.
    """
    assert isinstance(expr, Eq)
    assert isinstance(expr.lhs, (Symbol, Indexed))

    self.expr = expr
    self.dtype = dtype

    # Traverse the expression to determine meta information
    # Note: at this point, expressions have already been indexified
    terminals = retrieve_terminals(self.expr.rhs)
    self.reads = filter_ordered(
        [t for t in terminals if isinstance(t, (types.Indexed, types.Symbol))])

    # The written object comes first, followed by the objects being read
    accessed = [self.write]
    accessed.extend(r.base.function for r in self.reads)
    self.functions = filter_ordered(accessed)

    # Unique dimensions across all collected functions
    self.dimensions = filter_ordered(flatten(f.indices for f in self.functions))
def default_rules(obj, functions):
    """
    Build substitution rules for the coefficient terms found in ``obj`` that
    are not already covered by ``obj.substitutions``.

    Returns
    -------
    dict
        Mapping from coefficient symbols (as produced by
        ``function._coeff_symbol``) to the weights computed by
        ``sympy.finite_diff_weights``.

    Raises
    ------
    TypeError
        If the dimension of a missing term is neither a time nor a space
        dimension.
    """

    def generate_subs(deriv_order, function, index):
        """Compute the weights for one (deriv_order, function, index) triple."""
        dim = retrieve_dimensions(index)[0]

        if dim.is_Time:
            fd_order = function.time_order
        elif dim.is_Space:
            fd_order = function.space_order
        else:
            # Shouldn't arrive here
            raise TypeError("Dimension type not recognised")

        mapper = {dim: index}
        indices, x0 = generate_indices(function, dim, fd_order, side=None,
                                       x0=mapper)

        coeffs = sympy.finite_diff_weights(deriv_order, indices, x0)[-1][-1]

        return {function._coeff_symbol(indices[j], deriv_order, function, index): c
                for j, c in enumerate(coeffs)}

    # Determine which 'rules' are missing
    sym = get_sym(functions)
    terms = obj.find(sym)
    args_present = filter_ordered(term.args[1:] for term in terms)

    subs = obj.substitutions
    if subs:
        args_provided = [(i.deriv_order, i.function, i.index)
                         for i in subs.coefficients]
    else:
        args_provided = []

    # NOTE: Do we want to throw a warning if the same arg has
    # been provided twice?
    # The lookup set is built once, rather than once per membership test
    provided = frozenset(args_provided)
    not_provided = [i for i in args_present if i not in provided]

    # Accumulate in place instead of copying the whole dict at each step
    rules = {}
    for i in not_provided:
        rules.update(generate_subs(*i))

    return rules
def generate_nthreads(nthreads, args, level):
    """
    Return the list of thread-count attempts, without duplicates.

    Each attempt is a 1-tuple ``((name, value),)``. The first attempts use the
    values found in ``args``; more are added when ``level`` is 'aggressive'
    and the configured platform is 'knl'.
    """
    attempts = [((nt.name, args[nt.name]),) for nt in nthreads]

    # On the KNL, also try running with a different number of hyperthreads
    if level == 'aggressive' and configuration['platform'] == 'knl':
        for divisor in (1, 2, 4):
            attempts.extend([((nt.name, psutil.cpu_count() // divisor),)
                             for nt in nthreads])

    return filter_ordered(attempts)
def _dist_datamap(self):
    """
    Mapper ``M : MPI rank -> required sparse data``.

    The values are lists of positions into ``self._support``, in order of
    first appearance and without duplicates.
    """
    required = {}
    for point, support in enumerate(self._support):
        # Sparse point `point` is "required" by the following ranks
        ranks = self.grid.distributor.glb_to_rank(support)
        for rank in ranks:
            required.setdefault(rank, []).append(point)

    return {rank: filter_ordered(points) for rank, points in required.items()}
def _dist_datamap(self):
    """
    Mapper ``M : MPI rank -> required sparse data``.

    The values are lists of positions into ``self._support``, in order of
    first appearance and without duplicates.
    """
    by_rank = {}
    for n, support in enumerate(self._support):
        # The ranks that "require" the n-th sparse point
        owners = self.grid.distributor.glb_to_rank(support)
        for rank in owners:
            by_rank.setdefault(rank, []).append(n)

    datamap = {}
    for rank, points in by_rank.items():
        datamap[rank] = filter_ordered(points)
    return datamap
def init_configuration(configuration=configuration, env_vars_mapper=env_vars_mapper,
                       env_vars_deprecated=env_vars_deprecated):
    """
    Populate ``configuration`` from the environment and initialize it.

    Parameters
    ----------
    configuration
        The object to update; it must provide ``_defaults``, ``update`` and
        ``initialize``.
    env_vars_mapper
        Mapping from environment variable names to configuration keys.
    env_vars_deprecated
        Mapping from deprecated environment variable names to
        ``(new_name, message)`` pairs.

    Raises
    ------
    NotImplementedError
        If the ``DEVITO_CONFIG`` environment variable is set.
    """
    # Populate `configuration` with user-provided options
    if environ.get('DEVITO_CONFIG') is None:
        # It is important to configure `platform`, `compiler` and `backend` in this order
        process_order = filter_ordered(['platform', 'compiler', 'backend'] +
                                       list(env_vars_mapper.values()))
        queue = sorted(env_vars_mapper.items(), key=lambda i: process_order.index(i[1]))
        unprocessed = OrderedDict([(v, environ.get(k, configuration._defaults[v]))
                                   for k, v in queue])

        # Handle deprecated env vars
        mapper = dict(queue)
        for k, (v, msg) in env_vars_deprecated.items():
            if environ.get(k):
                warning("`%s` is deprecated. %s" % (k, msg))
                if environ.get(v):
                    warning("Both `%s` and `%s` set. Ignoring `%s`" % (k, v, k))
                else:
                    warning("Setting `%s=%s`" % (v, environ[k]))
                    unprocessed[mapper[v]] = environ[k]
    else:
        # Attempt reading from the specified configuration file
        raise NotImplementedError("Devito doesn't support configuration via file yet.")

    # Parameters validation
    for k, v in unprocessed.items():
        try:
            items = v.split(';')
            # Env variable format: 'var=k1:v1;k2:v2;k3:v3;...'
            keys, values = zip(*[i.split(':') for i in items])
            # Casting
            # NOTE(security): `eval` runs arbitrary code taken from the
            # environment; only safe as long as the environment is trusted
            values = [eval(i) for i in values]
        except AttributeError:
            # Env variable format: 'var=v', 'v' is not a string
            keys = [v]
            values = []
        except ValueError:
            # Env variable format: 'var=k1;k2:v2...' or even just 'var=v'
            keys = [i.split(':')[0] for i in items]
            values = []

        # `zip` produces tuples, which do not support the item assignment
        # performed below, so a mutable copy is needed
        keys = list(keys)

        # Cast to integer
        for i, j in enumerate(list(keys)):
            try:
                keys[i] = int(j)
            except (TypeError, ValueError):
                keys[i] = j

        if len(keys) == len(values):
            configuration.update(k, dict(zip(keys, values)))
        elif len(keys) == 1:
            configuration.update(k, keys[0])
        else:
            configuration.update(k, keys)

    configuration.initialize()
def mark_parallel(analysis):
    """
    Update the ``analysis`` detecting the ``SEQUENTIAL`` and ``PARALLEL``
    Iterations within ``analysis.iet``.

    An Iteration may appear in several trees; one candidate property is
    collected per analyzed occurrence and the most restrictive one wins
    (``SEQUENTIAL`` over ``PARALLEL_IF_ATOMIC`` over ``PARALLEL``).
    """
    properties = OrderedDict()
    for tree in analysis.trees:
        for depth, i in enumerate(tree):
            # Speed-up analysis: once an Iteration is known to be SEQUENTIAL,
            # the final reduction cannot yield anything else. The collected
            # properties are lists, so the list itself must be searched
            if any(p is SEQUENTIAL for p in properties.get(i, ())):
                continue

            if i.uindices:
                # Only ++/-- increments of iteration variables are supported
                properties.setdefault(i, []).append(SEQUENTIAL)
                continue

            # Get all dimensions up to and including Iteration /i/, grouped by Iteration
            dims = [filter_ordered(j.dimensions) for j in tree[:depth + 1]]
            # Get all dimensions up to and including Iteration /i-1/
            prev = flatten(dims[:-1])
            # Get all dimensions up to and including Iteration /i/
            dims = flatten(dims)

            # The i-th Iteration is PARALLEL if for all dependences (d_1, ..., d_n):
            # test0 := (d_1, ..., d_{i-1}) > 0, OR
            # test1 := (d_1, ..., d_i) = 0
            is_parallel = True

            # The i-th Iteration is PARALLEL_IF_ATOMIC if for all dependences:
            # test0 OR test1 OR the write is an associative and commutative increment
            is_atomic_parallel = True

            for dep in analysis.scopes[i].d_all:
                test0 = len(prev) > 0 and any(dep.is_carried(d) for d in prev)
                test1 = all(dep.is_indep(d) for d in dims)
                test2 = all(dep.is_reduce_atmost(d) for d in prev) and dep.is_indep(i.dim)
                if not (test0 or test1 or test2):
                    is_parallel = False
                    if not dep.is_increment:
                        # Nothing left to learn from the remaining dependences
                        is_atomic_parallel = False
                        break

            if is_parallel:
                properties.setdefault(i, []).append(PARALLEL)
            elif is_atomic_parallel:
                properties.setdefault(i, []).append(PARALLEL_IF_ATOMIC)
            else:
                properties.setdefault(i, []).append(SEQUENTIAL)

    # Reduction (e.g, SEQUENTIAL takes priority over PARALLEL)
    priorities = {PARALLEL: 0, PARALLEL_IF_ATOMIC: 1, SEQUENTIAL: 2}
    properties = OrderedDict([(k, max(v, key=lambda i: priorities[i]))
                              for k, v in properties.items()])

    analysis.update(properties)
def _make_clauses(cls, **kwargs):
    """
    Build the clauses via the parent class, with ``chunk_size`` forced to
    False, then append an ``is_device_ptr`` clause naming the indexeds in
    ``kwargs['nodes']`` whose function has ``_mem_local`` set.
    """
    kwargs['chunk_size'] = False
    clauses = super()._make_clauses(**kwargs)

    names = [i.name for i in FindSymbols('indexeds').visit(kwargs['nodes'])
             if i.function._mem_local]
    deviceptrs = filter_ordered(names)
    if not deviceptrs:
        return clauses

    clauses.append("is_device_ptr(%s)" % ",".join(deviceptrs))
    return clauses
def _dist_gather_mask(self):
    """
    A mask to index into the ``data`` received upon returning from
    ``self._dist_alltoall``. This mask creates a new data array in which
    duplicate sparse data values have been discarded. The resulting data
    array can thus be used to populate ``self.data``.

    The mask is a copy of ``self._dist_scatter_mask`` in which the entry at
    ``self._sparse_position`` is replaced by the position of the first
    occurrence of each unique value of that entry.
    """
    ret = list(self._dist_scatter_mask)
    mask = ret[self._sparse_position]
    # Convert once, rather than once per unique element
    values = mask.tolist()
    ret[self._sparse_position] = [values.index(i) for i in filter_ordered(mask)]
    return tuple(ret)
def _dist_gather_mask(self):
    """
    A mask to index into the ``data`` received upon returning from
    ``self._dist_alltoall``. This mask creates a new data array in which
    duplicate sparse data values have been discarded. The resulting data
    array can thus be used to populate ``self.data``.

    The mask is a copy of ``self._dist_scatter_mask`` in which the entry at
    ``self._sparse_position`` is replaced by the position of the first
    occurrence of each unique value of that entry.
    """
    ret = list(self._dist_scatter_mask)
    mask = ret[self._sparse_position]
    # Convert once, rather than once per unique element
    values = mask.tolist()
    ret[self._sparse_position] = [values.index(i) for i in filter_ordered(mask)]
    return tuple(ret)
def __init__(self, exprs, **kwargs):
    """
    Build a TemporariesGraph out of ``exprs``.

    Each expression becomes a Temporary, keyed by its LHS and tracking which
    other temporaries it reads and is read by. ``space_indices`` and
    ``time_indices`` are then derived from the tensor temporaries.

    Raises
    ------
    DSEException
        If two expressions share the same LHS.
    """
    # Check input legality. A dict cannot hold duplicate keys, so redundant
    # nodes show up as a mapper smaller than the input
    exprs = list(exprs)
    mapper = OrderedDict([(i.lhs, i) for i in exprs])
    if len(mapper) != len(exprs):
        raise DSEException("Found redundant node, cannot build TemporariesGraph.")

    # Construct Temporaries, tracking reads and readby
    tensor_map = DefaultOrderedDict(list)
    for i in mapper:
        tensor_map[as_symbol(i)].append(i)
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for k, v in mapper.items():
        handle = retrieve_terminals(v.rhs)
        # Also account for the terminals appearing within the indices
        for i in list(handle):
            if i.is_Indexed:
                for idx in i.indices:
                    handle |= retrieve_terminals(idx)
        reads[k].update(set(flatten([tensor_map.get(as_symbol(i), [])
                                     for i in handle])))
        for i in reads[k]:
            readby[i].add(k)

    # Make sure read-after-writes are honored for scalar temporaries
    processed = [i for i in mapper if i.is_Indexed]
    queue = [i for i in mapper if i not in processed]
    while queue:
        k = queue.pop(0)
        if not readby[k]:
            processed.insert(0, k)
        elif all(i in processed for i in readby[k]):
            # Place `k` right before its first reader
            index = min(processed.index(i) for i in readby[k])
            processed.insert(index, k)
        else:
            # Some readers are still pending; retry later
            queue.append(k)

    # Build up the TemporariesGraph
    temporaries = [(i, Temporary(*mapper[i].args, inc=q_inc(mapper[i]),
                                 reads=reads[i], readby=readby[i]))
                   for i in processed]
    super(TemporariesGraph, self).__init__(temporaries, **kwargs)

    # Determine indices along the space and time dimensions
    terms = [v for k, v in self.items() if v.is_tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([i.function.indices for i in terms]))
    self.space_indices = tuple(i for i in indices if i.is_Space)
    self.time_indices = tuple(i for i in indices if i.is_Time)
def __init__(self, exprs, **kwargs):
    """
    Build a FlowGraph out of ``exprs``, which are first converted to SSA.

    Each expression becomes a Node, keyed by its LHS and tracking which other
    nodes it reads and is read by. ``space_indices`` and ``time_indices`` are
    then derived from the tensor nodes.
    """
    # Always convert to SSA
    exprs = convert_to_SSA(exprs)
    mapper = OrderedDict([(e.lhs, e) for e in exprs])
    assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

    # Group the written objects by their underlying symbol
    tensor_map = DefaultOrderedDict(list)
    for lhs in mapper:
        tensor_map[as_symbol(lhs)].append(lhs)

    # Construct the Nodes, tracking reads and readby
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for lhs, expr in mapper.items():
        terminals = retrieve_terminals(expr.rhs)
        # Also account for the terminals appearing within the indices
        for t in list(terminals):
            if not t.is_Indexed:
                continue
            for idx in t.indices:
                terminals |= retrieve_terminals(idx)
        written = flatten([tensor_map.get(as_symbol(t), []) for t in terminals])
        reads[lhs].update(set(written))
        for r in reads[lhs]:
            readby[r].add(lhs)

    # Make sure read-after-writes are honored for scalar temporaries
    processed = [lhs for lhs in mapper if lhs.is_Indexed]
    queue = [lhs for lhs in mapper if lhs not in processed]
    while queue:
        k = queue.pop(0)
        readers = readby[k]
        if not readers:
            processed.insert(0, k)
        elif all(r in processed for r in readers):
            # Place `k` right before its first reader
            processed.insert(min(processed.index(r) for r in readers), k)
        else:
            # Some readers are still pending; retry later
            queue.append(k)

    # Build up the FlowGraph
    temporaries = []
    for lhs in processed:
        expr = mapper[lhs]
        node = Node(*expr.args, inc=q_inc(expr), reads=reads[lhs], readby=readby[lhs])
        temporaries.append((lhs, node))
    super(FlowGraph, self).__init__(temporaries, **kwargs)

    # Determine indices along the space and time dimensions
    terms = [v for k, v in self.items() if v.is_tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([t.function.indices for t in terms]))
    self.space_indices = tuple(d for d in indices if d.is_Space)
    self.time_indices = tuple(d for d in indices if d.is_Time)
def __init__(self, intervals, sub_iterators=None, directions=None):
    """
    Initialize the IterationSpace from ``intervals``.

    ``sub_iterators`` is stored as a frozendict whose values are tuples
    without duplicates. ``directions`` defaults to ``Any`` for the dimension
    of each interval.
    """
    super(IterationSpace, self).__init__(intervals)

    # Normalize sub-iterators
    normalized = []
    for dim, iterators in (sub_iterators or {}).items():
        normalized.append((dim, tuple(filter_ordered(as_tuple(iterators)))))
    self._sub_iterators = frozendict(normalized)

    # Normalize directions
    if directions is not None:
        self._directions = frozendict(directions)
    else:
        self._directions = frozendict([(i.dim, Any) for i in self.intervals])
def _make_waitprefetch(self, iet, sync_ops, pieces, *args):
    """
    Return a List in which ``iet`` is preceded by one BusyWait for each
    unique ``(sdata, threads)`` pair that ``pieces.objs`` associates with
    ``sync_ops``.
    """
    ff = SharedData._field_flag

    objs = filter_ordered(pieces.objs.get(s) for s in sync_ops)
    # Each wait is built on the condition "flag of the thread != 1"
    waits = [BusyWait(CondNe(FieldFromComposite(ff, sdata[threads.index]), 1))
             for sdata, threads in objs]

    header = c.Comment("Wait for the arrival of prefetched data")
    return List(header=header, body=waits + [BlankLine, iet])
def mark_parallel(analysis):
    """
    Update the ``analysis`` detecting the ``SEQUENTIAL`` and ``PARALLEL``
    Iterations within ``analysis.iet``.

    Each Iteration is classified once, the first time it is met while
    visiting ``analysis.trees``.
    """
    properties = OrderedDict()
    for tree in analysis.trees:
        for depth, iteration in enumerate(tree):
            if iteration in properties:
                continue

            if iteration.uindices:
                # Only ++/-- increments of iteration variables are supported
                properties[iteration] = SEQUENTIAL
                continue

            # Dimensions up to and including /iteration/, grouped by Iteration
            grouped = [filter_ordered(j.dimensions) for j in tree[:depth + 1]]
            # Dimensions of the enclosing Iterations only
            prev = flatten(grouped[:-1])
            # Dimensions up to and including /iteration/
            dims = flatten(grouped)

            # The Iteration is PARALLEL if, for all dependences, at least one
            # of test0, test1, test2 holds. It is PARALLEL_IF_ATOMIC if each
            # dependence failing all three tests is an increment
            is_parallel = True
            is_atomic_parallel = True
            for dep in analysis.scopes[iteration].d_all:
                test0 = len(prev) > 0 and any(dep.is_carried(d) for d in prev)
                test1 = all(dep.is_indep(d) for d in dims)
                test2 = (all(dep.is_reduce_atmost(d) for d in prev) and
                         dep.is_indep(iteration.dim))
                if test0 or test1 or test2:
                    continue
                is_parallel = False
                if not dep.is_increment:
                    # Nothing left to learn from the remaining dependences
                    is_atomic_parallel = False
                    break

            if is_parallel:
                verdict = PARALLEL
            elif is_atomic_parallel:
                verdict = PARALLEL_IF_ATOMIC
            else:
                verdict = SEQUENTIAL
            properties[iteration] = verdict

    analysis.update(properties)
def init_configuration(configuration=configuration, env_vars_mapper=env_vars_mapper):
    """
    Populate ``configuration`` from the environment and initialize it.

    Parameters
    ----------
    configuration
        The object to update; it must provide ``_defaults``, ``update`` and
        ``initialize``.
    env_vars_mapper
        Mapping from environment variable names to configuration keys.

    Raises
    ------
    NotImplementedError
        If the ``DEVITO_CONFIG`` environment variable is set.
    """
    # Populate /configuration/ with user-provided options
    if environ.get('DEVITO_CONFIG') is None:
        # At init time, it is important to first configure the compiler, then
        # the backend (which is impacted by the compiler), finally everything
        # else in any arbitrary order
        process_order = filter_ordered(['compiler', 'backend'] +
                                       list(env_vars_mapper.values()))
        queue = sorted(env_vars_mapper.items(), key=lambda i: process_order.index(i[1]))
        unprocessed = OrderedDict([(v, environ.get(k, configuration._defaults[v]))
                                   for k, v in queue])
    else:
        # Attempt reading from the specified configuration file
        raise NotImplementedError("Devito doesn't support configuration via file yet.")

    # Parameters validation
    for k, v in unprocessed.items():
        try:
            items = v.split(';')
            # Env variable format: 'var=k1:v1;k2:v2;k3:v3;...'
            keys, values = zip(*[i.split(':') for i in items])
            # Casting
            # NOTE(security): `eval` runs arbitrary code taken from the
            # environment; only safe as long as the environment is trusted
            values = [eval(i) for i in values]
        except AttributeError:
            # Env variable format: 'var=v', 'v' is not a string
            keys = [v]
            values = []
        except ValueError:
            # Env variable format: 'var=k1;k2:v2...' or even just 'var=v'
            keys = [i.split(':')[0] for i in items]
            values = []

        # `zip` produces tuples, which do not support the item assignment
        # performed below, so a mutable copy is needed
        keys = list(keys)

        # Cast to integer
        for i, j in enumerate(list(keys)):
            try:
                keys[i] = int(j)
            except (TypeError, ValueError):
                keys[i] = j

        if len(keys) == len(values):
            configuration.update(k, dict(zip(keys, values)))
        elif len(keys) == 1:
            configuration.update(k, keys[0])
        else:
            configuration.update(k, keys)

    configuration.initialize()
def visit_Iteration(self, o, subs=None, offsets=None):
    """
    Visit the children of the Iteration ``o`` and rebuild it.

    For a buffered dimension, one UnboundedIndex is created for each offset
    collected in ``offsets[o.dim]`` (e.g. t+1 => t1), and the corresponding
    substitution rules are recorded in ``subs``.

    Parameters
    ----------
    o
        The Iteration being visited.
    subs : dict, optional
        Updated in place with the substitution rules. A fresh dict is used
        when omitted.
    offsets : defaultdict(set), optional
        Mapper from dimensions to offsets, handed down to the children. A
        fresh mapper is used when omitted.
    """
    # Mutable defaults would be shared, and mutated, across unrelated calls
    if subs is None:
        subs = {}
    if offsets is None:
        offsets = defaultdict(set)

    nodes = self.visit(o.children, subs=subs, offsets=offsets)
    if o.dim.is_Buffered:
        # For buffered dimensions insert the explicit
        # definition of buffered variables, eg. t+1 => t1
        init = []
        for i, off in enumerate(filter_ordered(offsets[o.dim])):
            vname = Symbol("%s%d" % (o.dim.name, i))
            value = (o.dim.parent + off) % o.dim.modulo
            init.append(UnboundedIndex(vname, value, value))
            subs[o.dim + off] = LoweredDimension(vname.name, o.dim, off)
        # Always lower to symbol
        subs[o.dim.parent] = Symbol(o.dim.parent.name)
        return o._rebuild(index=o.dim.parent.name, uindices=init)
    else:
        return o._rebuild(*nodes)
def _index_matrix(self, offset):
    """
    Return ``(index_matrix, points)``: the indirection indices of all
    adjacent grid points, shifted by ``offset``, and an OrderedDict mapping
    each unique indirection index to a Symbol named ``ii_<name>_<n>``.
    """
    # Note about the use of *memoization*
    # Since this method is called by `_interpolation_indices`, using
    # memoization avoids a proliferation of symbolically identical
    # ConditionalDimensions for a given set of indirection indices

    # One row of indirection indices for each adjacent grid point
    index_matrix = []
    for inc in self._point_increments:
        row = tuple(idx + ii + offset
                    for ii, idx in zip(inc, self._coordinate_indices))
        index_matrix.append(row)

    # A unique symbol for each indirection index
    points = OrderedDict()
    for n, p in enumerate(filter_ordered(flatten(index_matrix))):
        points[p] = Symbol(name='ii_%s_%d' % (self.name, n))

    return index_matrix, points
def _index_matrix(self, offset):
    """
    Return ``(index_matrix, points)``: the indirection indices of all
    adjacent grid points, shifted by ``offset``, and an OrderedDict mapping
    each unique indirection index to a Symbol named ``ii_<name>_<n>``.
    """
    # Note about the use of *memoization*
    # Since this method is called by `_interpolation_indices`, using
    # memoization avoids a proliferation of symbolically identical
    # ConditionalDimensions for a given set of indirection indices

    def shifted(increments):
        # Indirection indices of a single adjacent grid point
        return tuple(idx + ii + offset
                     for ii, idx in zip(increments, self._coordinate_indices))

    index_matrix = [shifted(inc) for inc in self._point_increments]

    # A unique symbol for each indirection index
    unique = filter_ordered(flatten(index_matrix))
    points = OrderedDict()
    for n, p in enumerate(unique):
        points[p] = Symbol(name='ii_%s_%d' % (self.name, n))

    return index_matrix, points
def __init__(self, exprs, **kwargs):
    """
    Build a FlowGraph out of ``exprs``, which are first converted to SSA.

    Each expression becomes a Node, keyed by its LHS and tracking which other
    nodes it reads and is read by. ``space_indices`` and ``time_indices`` are
    then derived from the tensor nodes.
    """
    # Always convert to SSA
    exprs = makeit_ssa(exprs)
    mapper = OrderedDict([(e.lhs, e) for e in exprs])
    assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

    # Group the written objects by their underlying symbol
    tensor_map = DefaultOrderedDict(list)
    for lhs in mapper:
        tensor_map[as_symbol(lhs)].append(lhs)

    # Construct the Nodes, tracking reads and readby
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for lhs, expr in mapper.items():
        terminals = retrieve_terminals(expr.rhs)
        # Also account for the terminals appearing within the indices
        for t in list(terminals):
            if not t.is_Indexed:
                continue
            for idx in t.indices:
                terminals |= retrieve_terminals(idx)
        written = flatten([tensor_map.get(as_symbol(t), []) for t in terminals])
        reads[lhs].update(set(written))
        for r in reads[lhs]:
            readby[r].add(lhs)

    # Make sure read-after-writes are honored for scalar nodes
    processed = [lhs for lhs in mapper if lhs.is_Indexed]
    queue = [lhs for lhs in mapper if lhs not in processed]
    while queue:
        k = queue.pop(0)
        if not readby[k] or k in readby[k]:
            # Unread, or read by itself: goes to the front
            processed.insert(0, k)
        elif all(r in processed for r in readby[k]):
            # Place `k` right before its first reader
            processed.insert(min(processed.index(r) for r in readby[k]), k)
        else:
            # Some readers are still pending; retry later
            queue.append(k)

    # Build up the FlowGraph
    nodes = []
    for lhs in processed:
        nodes.append((lhs, Node(mapper[lhs], reads=reads[lhs], readby=readby[lhs])))
    super(FlowGraph, self).__init__(nodes, **kwargs)

    # Determine indices along the space and time dimensions
    terms = [v for k, v in self.items() if v.is_Tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([t.function.indices for t in terms]))
    self.space_indices = tuple(d for d in indices if d.is_Space)
    self.time_indices = tuple(d for d in indices if d.is_Time)
def union(self, others):
    """
    Create a new HaloScheme representing the union of ``self`` with other
    HaloSchemes.

    Raises
    ------
    ValueError
        If the same key has different ``loc_indices`` in two of the
        HaloSchemes.
    """
    fmapper = dict(self.fmapper)
    for other in as_tuple(others):
        for f, entry in other.fmapper.items():
            current = fmapper.setdefault(f, entry)

            # At this point, the `loc_indices` must match
            if current.loc_indices != entry.loc_indices:
                raise ValueError("Cannot compute the union of one or more HaloScheme "
                                 "when the `loc_indices` differ")

            merged = tuple(filter_ordered(current.halos + entry.halos))
            fmapper[f] = HaloSchemeEntry(current.loc_indices, merged)

    return HaloScheme(fmapper=fmapper)
def generate_block_shapes(blockable, args, level):
    """
    Return the list of block shapes to attempt, without duplicates.

    A block shape is a tuple of ``(step name, size)`` pairs, one for each
    dimension in ``blockable``.

    Raises
    ------
    ValueError
        If ``blockable`` is empty.
    """
    if not blockable:
        raise ValueError

    # Max attemptable block shape
    max_bs = tuple((d.step.name, d.max_step.subs(args)) for d in blockable)

    # Attempted block shapes:
    # 1) Defaults (basic mode)
    shapes = [tuple((d.step.name, size) for d in blockable)
              for size in options['blocksize']]
    # 2) Always try the entire iteration space (degenerate block)
    shapes.append(max_bs)
    # 3) More attempts if auto-tuning in aggressive mode
    if level in ['aggressive', 'max']:
        # Ramp up to larger block shapes
        current = tuple((name, options['blocksize'][-1]) for name, _ in shapes[0])
        for _ in range(3):
            doubled = tuple((name, size * 2) for name, size in current)
            shapes.insert(shapes.index(current) + 1, doubled)
            current = doubled

        # Extended shuffling for the smaller block shapes
        extra = [bs[:-1] + (other[-1],) for bs in shapes[:4] for other in shapes]
        # Some more shuffling for all block shapes
        for bs in list(shapes):
            for nchosen in range(1, len(bs) + 1):
                for chosen in combinations(dict(bs), nchosen):
                    extra.append(tuple((name, size * 2 if name in chosen else size)
                                       for name, size in bs))
        shapes.extend(extra)

    def fits(shape):
        sizes = dict(shape)
        return all(sizes[name] <= limit for name, limit in max_bs)

    # Drop unnecessary attempts:
    # 1) Block shapes exceeding the iteration space extent
    shapes = [bs for bs in shapes if fits(bs)]
    # 2) Redundant block shapes
    return filter_ordered(shapes)
def iet_make(stree):
    """
    Create an Iteration/Expression tree (IET) from a :class:`ScheduleTree`.

    The IET nodes built for each ScheduleTree node are queued under its
    parent, and consumed when the parent is visited.
    """
    nsections = 0
    queues = OrderedDict()
    for node in stree.visit():
        if node == stree:
            # We hit this handle at the very end of the visit
            return List(body=queues.pop(node))

        elif node.is_Exprs:
            body = [ExpressionBundle(node.shape, node.ops, node.traffic,
                                     body=[Expression(e) for e in node.exprs])]

        elif node.is_Conditional:
            body = [Conditional(node.guard, queues.pop(node))]

        elif node.is_Iteration:
            # Generate `uindices`
            uindices = []
            for d, offs in node.sub_iterators:
                modulo = len(offs)
                for n, o in enumerate(filter_ordered(offs)):
                    value = (node.dim + o) % modulo
                    symbol = Scalar(name="%s%d" % (d.name, n), dtype=np.int32)
                    uindices.append(UnboundedIndex(symbol, value, value, d, d + o))

            # Generate Iteration
            iteration = Iteration(queues.pop(node), node.dim, node.dim.limits,
                                  offsets=node.limits, direction=node.direction,
                                  uindices=uindices)
            body = [iteration]

        elif node.is_Section:
            body = [Section('section%d' % nsections, body=queues.pop(node))]
            nsections += 1

        queues.setdefault(node.parent, []).extend(body)

    # The loop is expected to return upon reaching the root
    assert False
def init_configuration(configuration=configuration, env_vars_mapper=env_vars_mapper):
    """
    Populate ``configuration`` from the environment and initialize it.

    Parameters
    ----------
    configuration
        The object to update; it must provide ``_defaults``, ``update`` and
        ``initialize``.
    env_vars_mapper
        Mapping from environment variable names to configuration keys.

    Raises
    ------
    NotImplementedError
        If the ``DEVITO_CONFIG`` environment variable is set.
    """
    # Populate /configuration/ with user-provided options
    if environ.get('DEVITO_CONFIG') is None:
        # At init time, it is important to configure `platform`, `compiler` and `backend`
        # in this order
        process_order = filter_ordered(['platform', 'compiler', 'backend'] +
                                       list(env_vars_mapper.values()))
        queue = sorted(env_vars_mapper.items(), key=lambda i: process_order.index(i[1]))
        unprocessed = OrderedDict([(v, environ.get(k, configuration._defaults[v]))
                                   for k, v in queue])
    else:
        # Attempt reading from the specified configuration file
        raise NotImplementedError("Devito doesn't support configuration via file yet.")

    # Parameters validation
    for k, v in unprocessed.items():
        try:
            items = v.split(';')
            # Env variable format: 'var=k1:v1;k2:v2;k3:v3;...'
            keys, values = zip(*[i.split(':') for i in items])
            # Casting
            # NOTE(security): `eval` runs arbitrary code taken from the
            # environment; only safe as long as the environment is trusted
            values = [eval(i) for i in values]
        except AttributeError:
            # Env variable format: 'var=v', 'v' is not a string
            keys = [v]
            values = []
        except ValueError:
            # Env variable format: 'var=k1;k2:v2...' or even just 'var=v'
            keys = [i.split(':')[0] for i in items]
            values = []

        # `zip` produces tuples, which do not support the item assignment
        # performed below, so a mutable copy is needed
        keys = list(keys)

        # Cast to integer
        for i, j in enumerate(list(keys)):
            try:
                keys[i] = int(j)
            except (TypeError, ValueError):
                keys[i] = j

        if len(keys) == len(values):
            configuration.update(k, dict(zip(keys, values)))
        elif len(keys) == 1:
            configuration.update(k, keys[0])
        else:
            configuration.update(k, keys)

    configuration.initialize()
def eliminate_arrays(clusters, template):
    """
    Eliminate redundant expressions stored in Arrays.

    Within a dense Cluster, an Array written with a RHS that was already
    assigned to another Array is dropped, and its accesses are replaced with
    accesses to the Array written first. ``template`` is not used.
    """
    mapper = {}
    processed = []
    for c in clusters:
        if not c.is_dense:
            processed.append(c)
            continue

        # Search for any redundant RHSs
        first_writer = {}
        for e in c.exprs:
            f = e.lhs.function
            if not f.is_Array:
                continue
            v = first_writer.get(e.rhs)
            if v is None:
                first_writer[e.rhs] = f
            else:
                # Found a redundant RHS
                mapper[f] = v

        if not mapper:
            # Do not waste time
            processed.append(c)
            continue

        # Replace redundancies
        subs = {}
        for f, v in mapper.items():
            for indexed in filter_ordered(a.indexed for a in c.scope[f]):
                subs[indexed] = v[f.indices]

        # Writes to the redundant Arrays are dropped
        exprs = [e.xreplace(subs) for e in c.exprs
                 if e.lhs.function not in mapper]

        processed.append(c.rebuild(exprs))

    return processed
def scalarize(clusters, template):
    """
    Turn local "isolated" Arrays, that is Arrays appearing only in one
    Cluster, into Scalars.

    ``template`` is called, with no arguments, to obtain the name of each new
    Scalar.
    """
    processed = []
    for c in clusters:
        # Get any Arrays written in `c` and not read by any other Cluster
        others = set(clusters) - {c}
        arrays = {w for w in c.scope.writes if w.is_Array}
        arrays -= set().union(*[o.scope.reads for o in others])

        # Turn them into scalars
        #
        # r[x,y,z] = g(b[x,y,z])                 t0 = g(b[x,y,z])
        # ... = r[x,y,z] + r[x,y,z+1]`  ---->    t1 = g(b[x,y,z+1])
        #                                        ... = t0 + t1
        mapper = {}
        exprs = []
        for n, e in enumerate(c.exprs):
            f = e.lhs.function
            if f not in arrays:
                exprs.append(e.func(e.lhs, e.rhs.xreplace(mapper)))
                continue

            # One Scalar for each distinct later access to `f`
            later = [a.indexed for a in c.scope[f] if a.timestamp > n]
            for indexed in filter_ordered(later):
                mapper[indexed] = Scalar(name=template(), dtype=f.dtype)

                assert len(f.indices) == len(e.lhs.indices) == len(indexed.indices)
                shifting = {}
                for idx, o1, o2 in zip(f.indices, e.lhs.indices, indexed.indices):
                    shifting[idx] = idx + (o2 - o1)

                scalar_expr = e.func(mapper[indexed], e.rhs.xreplace(mapper))
                exprs.append(xreplace_indices(scalar_expr, shifting))

        processed.append(c.rebuild(exprs))

    return processed
def generate_nthreads(nthreads, args, level):
    """
    Return the list of thread-count attempts, without duplicates.

    Each attempt is a 1-tuple ``((name, value),)``. If ``nthreads`` is 1, the
    only attempt is ``((None, 1),)``. Otherwise the first attempt uses the
    value found in ``args``, and more are added when ``level`` is 'max'.
    """
    if nthreads == 1:
        return [((None, 1),)]

    name = nthreads.name
    attempts = [((name, args[name]),)]

    if level == 'max':
        # Be sure to try with:
        # 1) num_threads == num_physical_cores
        # 2) num_threads == num_hyperthreads
        if configuration['platform'] is KNL:
            divisors = (4, 2, 1)
        else:
            divisors = (2, 1)
        attempts.extend([((name, psutil.cpu_count() // d),) for d in divisors])

    return filter_ordered(attempts)
def generate_nthreads(nthreads, args, level):
    """
    Return the list of thread-count attempts, without duplicates.

    Each attempt is a 1-tuple ``((name, value),)``. If ``nthreads`` is 1, the
    only attempt is ``((None, 1),)``. Otherwise the first attempt uses the
    value found in ``args``, and more are added when ``level`` is 'max'.
    """
    if nthreads == 1:
        return [((None, 1),)]

    name = nthreads.name
    attempts = [((name, args[name]),)]
    if level != 'max':
        return filter_ordered(attempts)

    # Be sure to try with:
    # 1) num_threads == num_physical_cores
    # 2) num_threads == num_hyperthreads
    divisors = (4, 2, 1) if configuration['platform'] is KNL else (2, 1)
    for d in divisors:
        attempts.append(((name, psutil.cpu_count() // d),))

    return filter_ordered(attempts)
def expr_symbols(self):
    """
    Return a tuple, without duplicates, of the symbols collected from the
    arguments, from ``self.base`` and from ``self.retobj``.
    """
    symbols = []
    for arg in self.arguments:
        if isinstance(arg, AbstractFunction):
            continue
        if isinstance(arg, (Indexed, IndexedBase, LocalObject, Symbol)):
            symbols.append(arg)
            continue
        if isinstance(arg, Call):
            # Nested Call: inherit its symbols
            symbols.extend(arg.expr_symbols)
            continue
        try:
            symbols.extend(arg.free_symbols)
        except AttributeError:
            # Not a symbolic object; nothing to collect
            pass

    if self.base is not None:
        symbols.append(self.base)
    if self.retobj is not None:
        symbols.extend(self.retobj.free_symbols)

    return tuple(filter_ordered(symbols))
def functions(self):
    """
    Return a tuple, without duplicates, of the functions collected from the
    arguments, from ``self.base`` and from ``self.retobj``.
    """
    found = []
    for arg in self.arguments:
        if isinstance(arg, numbers.Number):
            continue
        if isinstance(arg, (AbstractFunction, Indexed, LocalObject)):
            found.append(arg.function)
            continue
        # Generic expression: inspect its free symbols
        for s in arg.free_symbols:
            try:
                f = s.function
            except AttributeError:
                continue
            if isinstance(f, AbstractFunction):
                found.append(f)

    if self.base is not None:
        found.append(self.base.function)
    if self.retobj is not None:
        found.append(self.retobj.function)

    return tuple(filter_ordered(found))
def _create_call_graph(self):
    """
    Build and return the Call graph of ``self.efuncs``, starting from 'root'.

    Edges are added as (callee, caller). Calls to names not in
    ``self.efuncs`` are ignored.
    """
    dag = DAG(nodes=['root'])
    pending = ['root']

    while pending:
        caller = pending.pop(0)
        found = FindNodes(Call).visit(self.efuncs[caller])
        for name in filter_ordered([call.name for call in found]):
            if name not in self.efuncs:
                # Foreign Call (e.g., an MPI call): not part of the graph
                continue
            try:
                dag.add_node(name)
                pending.append(name)
            except KeyError:
                # `name` is already a node of `dag`
                pass
            dag.add_edge(name, caller)

    # Sanity check
    assert dag.size == len(self.efuncs)

    return dag
def generate_block_shapes(blockable, args, level):
    """
    Return the list of block shapes to attempt, without duplicates.

    A block shape is a tuple of ``(step name, size)`` pairs, one for each
    dimension in ``blockable``.

    Raises
    ------
    ValueError
        If ``blockable`` is empty.
    """
    if not blockable:
        raise ValueError

    # Max attemptable block shape
    max_bs = tuple((d.step.name, d.max_step.subs(args)) for d in blockable)

    # Attempted block shapes:
    # 1) Defaults (basic mode)
    attempts = []
    for size in options['blocksize']:
        attempts.append(tuple((d.step.name, size) for d in blockable))
    # 2) Always try the entire iteration space (degenerate block)
    attempts.append(max_bs)
    # 3) More attempts if auto-tuning in aggressive mode
    if level in ['aggressive', 'max']:
        # Ramp up to larger block shapes
        last = tuple((name, options['blocksize'][-1]) for name, _ in attempts[0])
        for _ in range(3):
            larger = tuple((name, size*2) for name, size in last)
            attempts.insert(attempts.index(last) + 1, larger)
            last = larger

        shuffled = []
        # Extended shuffling for the smaller block shapes
        for bs in attempts[:4]:
            shuffled.extend(bs[:-1] + (other[-1],) for other in attempts)
        # Some more shuffling for all block shapes
        for bs in list(attempts):
            for nchosen in range(1, len(bs) + 1):
                for chosen in combinations(dict(bs), nchosen):
                    shuffled.append(tuple((name, size*2 if name in chosen else size)
                                          for name, size in bs))
        attempts.extend(shuffled)

    # Drop unnecessary attempts:
    # 1) Block shapes exceeding the iteration space extent
    within = []
    for bs in attempts:
        if all(dict(bs)[name] <= limit for name, limit in max_bs):
            within.append(bs)
    # 2) Redundant block shapes
    return filter_ordered(within)
def normalize_syncs(*args):
    """
    Merge any number of ``key -> SyncOps`` mappers into a single one.

    With no arguments, None is returned; with a single argument, that
    argument is returned as is. Otherwise the values are concatenated key by
    key, and duplicates are dropped.

    Raises
    ------
    ValueError
        If WaitLock and WithLock ops end up under the same key.
    """
    if not args:
        return
    if len(args) == 1:
        return args[0]

    merged = defaultdict(list)
    for mapping in args:
        for key, ops in mapping.items():
            merged[key].extend(ops)

    syncs = {}
    for key, ops in merged.items():
        syncs[key] = filter_ordered(ops)

    for ops in syncs.values():
        waitlocks = [op for op in ops if op.is_WaitLock]
        withlocks = [op for op in ops if op.is_WithLock]
        # We do not allow mixing up WaitLock and WithLock ops
        if waitlocks and withlocks:
            raise ValueError("Incompatible SyncOps")

    return syncs
def visit_Iteration(self, o, subs=None, offsets=None):
    """
    Visit the children of the Iteration ``o`` and rebuild it.

    For a buffered dimension, one modulo initializer is created for each
    offset collected in ``offsets[o.dim]`` (e.g. t+1 => t1), the corresponding
    substitution rules are recorded in ``subs``, and the initializers are
    placed in the header of a List wrapping the first visited child.

    Parameters
    ----------
    o
        The Iteration being visited.
    subs : dict, optional
        Updated in place with the substitution rules. A fresh dict is used
        when omitted.
    offsets : defaultdict(set), optional
        Mapper from dimensions to offsets, handed down to the children. A
        fresh mapper is used when omitted.
    """
    # Mutable defaults would be shared, and mutated, across unrelated calls
    if subs is None:
        subs = {}
    if offsets is None:
        offsets = defaultdict(set)

    nodes = self.visit(o.children, subs=subs, offsets=offsets)
    if o.dim.is_Buffered:
        # For buffered dimensions insert the explicit
        # definition of buffered variables, eg. t+1 => t1
        init = []
        for i, off in enumerate(filter_ordered(offsets[o.dim])):
            vname = "%s%d" % (o.dim.name, i)
            value = o.dim.parent + off
            modulo = o.dim.modulo
            init += [c.Initializer(c.Value('int', vname),
                                   "(%s) %% %d" % (value, modulo))]
            subs[o.dim + off] = LoweredDimension(vname, o.dim, off)
        # Always lower to symbol
        subs[o.dim.parent] = Symbol(o.dim.parent.name)
        # Insert block with modulo initialisations
        newnodes = (List(header=init, body=nodes[0]), )
        return o._rebuild(newnodes, index=o.dim.parent.name)
    else:
        return o._rebuild(*nodes)
def omapper(self):
    """
    Mapper describing the OWNED ('o'-mapper) region offset from the DOMAIN
    extremes, along each Dimension and DataSide.

    Examples
    --------
    Consider a HaloScheme comprising two one-dimensional Functions, ``u``
    and ``v``.  ``u``'s halo, on the LEFT and RIGHT DataSides respectively,
    is (2, 2), while ``v``'s is (4, 4). The situation is depicted below.

    .. code-block:: python

          xx**----------------**xx     u
        xxxx****------------****xxxx   v

    Where 'x' represents a HALO point, '*' a OWNED point, and '-' a CORE point.
    Together, '*' and '-' constitute the DOMAIN.

    In this example, the "cumulative" OWNED size is (4, 4), that is the max
    on each DataSide across all Functions, namely ``u`` and ``v``. Then, the
    ``omapper``, which provides *relative offsets*, not sizes, will be
    ``{d0: (4, -4)}``.

    Note that, for each Function, the 'x' and '*' are exactly the same on
    *all MPI ranks*, so the output of this method is guaranteed to be
    consistent across *all MPI ranks*.
    """
    # First pass: for each Dimension appearing in a halo entry, gather the
    # owned sizes of all Functions defined over it
    sizes = {}
    for func, entries in self.halos.items():
        halo_dims = filter_ordered(flatten(entry.dim for entry in entries))
        for dim, size in zip(func.dimensions, func._size_owned):
            if dim not in halo_dims:
                continue
            sizes.setdefault(dim, []).append(size)

    # Second pass: reduce to the largest size on each side; the right-hand
    # side is negated as it is an offset from the upper extreme
    mapper = {}
    for dim, collected in sizes.items():
        left, right = zip(*collected)
        mapper[dim] = (max(left), -max(right))

    return mapper
def functions(self):
    """
    The functions reachable from the arguments, the base and the return
    object, as a tuple filtered through ``filter_ordered``.
    """
    found = []

    for arg in self.arguments:
        if isinstance(arg, (AbstractFunction, Indexed, LocalObject)):
            found.append(arg.function)
            continue

        if isinstance(arg, Call):
            # Nested Call: inherit whatever it references
            found.extend(arg.functions)
            continue

        try:
            symbols = arg.free_symbols
        except AttributeError:
            # Nothing to inspect in this argument
            continue

        for sym in symbols:
            # `try-except` necessary for e.g. Macro
            try:
                if not isinstance(sym.function, AbstractFunction):
                    continue
                found.append(sym.function)
            except AttributeError:
                continue

    if self.base is not None:
        found.append(self.base.function)

    if self.retobj is not None:
        found.append(self.retobj.function)

    return tuple(filter_ordered(found))
def reorder(cls, items, relations):
    """
    Return ``items`` as a list sorted by the position of their ``dim`` in the
    ordering induced by ``relations``.
    """
    # The relations are between dimensions, not intervals. So we take
    # care of that here
    ranked = toposort(relations)
    ordering = filter_ordered(ranked + [item.dim for item in items])

    def position(item):
        return ordering.index(item.dim)

    result = list(items)
    result.sort(key=position)
    return result
def input(self):
    """
    The objects among ``self._input`` and ``self.parameters`` flagged as
    ``is_Input``, as a tuple filtered through ``filter_ordered``.
    """
    candidates = self._input + list(self.parameters)
    selected = []
    for obj in candidates:
        if obj.is_Input:
            selected.append(obj)
    return tuple(filter_ordered(selected))
def _specialize_iet(self, iet, **kwargs):
    """
    Transform the Iteration/Expression tree to offload the computation of
    one or more loop nests onto YASK. This involves calling the YASK compiler
    to generate YASK code. Such YASK code is then called from within the
    transformed Iteration/Expression tree.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be transformed.
    **kwargs
        Forwarded untouched to the parent class' ``_specialize_iet``.

    Returns
    -------
    The transformed Iteration/Expression tree.

    Notes
    -----
    As a side effect, ``self.yk_solns`` is reset and populated with the
    JIT-compiled YASK kernel solutions, and ``self._func_table`` gets an
    entry for the function running the solution.
    """
    # Maps the root of each offloaded tree to its replacement
    mapper = {}

    self.yk_solns = OrderedDict()
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))
        context = contexts.fetch(dimensions, self._dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK grids and populate `yc_soln` with equations
            local_grids = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YaskSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, self._dtype)
            # The first root is replaced by the function call; any other root
            # not yet in `mapper` is mapped to None
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

            # Mark `funcall` as an external function call
            self._func_table[namespace['code-soln-run']] = MetaCall(None, False)

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_grids)
            self.yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_grids(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            # Best-effort: the candidate is left untouched and we move on
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
    iet = Transformer(mapper).visit(iet)

    if not self.yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the differences
    # in storage layout employed by Devito and YASK
    yk_grid_objs = {i.name: YaskGridObject(i.name) for i in self._input
                    if i.from_YASK}
    yk_grid_objs.update({i: YaskGridObject(i) for i in self._local_grids})
    iet = make_grid_accesses(iet, yk_grid_objs)

    # Finally optimize all non-yaskized loops
    iet = super(OperatorYASK, self)._specialize_iet(iet, **kwargs)

    return iet
def add_library_dirs(self, dirs):
    """
    Extend ``self.library_dirs`` with ``dirs``, which may be anything that
    ``as_tuple`` accepts. The resulting list is passed through
    ``filter_ordered`` before being stored.
    """
    extended = self.library_dirs + list(as_tuple(dirs))
    self.library_dirs = filter_ordered(extended)
def add_ldflags(self, flags):
    """
    Extend ``self.ldflags`` with ``flags``, which may be anything that
    ``as_tuple`` accepts. The resulting list is passed through
    ``filter_ordered`` before being stored.
    """
    extended = self.ldflags + list(as_tuple(flags))
    self.ldflags = filter_ordered(extended)
def mark_iteration_parallel(analysis):
    """
    Update the ``analysis`` detecting the SEQUENTIAL and PARALLEL Iterations
    within ``analysis.iet``.

    Each Iteration in ``analysis.trees`` is classified, tree by tree, as
    PARALLEL, PARALLEL_IF_ATOMIC or SEQUENTIAL by inspecting the dependences
    in ``analysis.scopes``. An Iteration appearing in multiple trees gets the
    most restrictive property among those collected. The outcome is recorded
    through ``analysis.update``; nothing is returned.
    """
    # Iteration -> list of properties, one entry for each tree analyzed
    properties = OrderedDict()
    for tree in analysis.trees:
        for depth, i in enumerate(tree):
            # While analyzing, the values in `properties` are lists, hence a
            # membership test is required (an identity test against the list
            # would never succeed). Skipping is safe because SEQUENTIAL has the
            # highest priority in the reduction below
            if SEQUENTIAL in properties.get(i, ()):
                # Speed-up analysis
                continue

            if i.uindices:
                # Only ++/-- increments of iteration variables are supported
                properties.setdefault(i, []).append(SEQUENTIAL)
                continue

            # Get all dimensions up to and including Iteration /i/, grouped by Iteration
            dims = [filter_ordered(j.dimensions) for j in tree[:depth + 1]]
            # Get all dimensions up to and including Iteration /i-1/
            prev = flatten(dims[:-1])
            # Get all dimensions up to and including Iteration /i/
            dims = flatten(dims)

            # The i-th Iteration is PARALLEL if for all dependences (d_1, ..., d_n):
            # test0 := (d_1, ..., d_{i-1}) > 0, OR
            # test1 := (d_1, ..., d_i) = 0
            is_parallel = True

            # The i-th Iteration is PARALLEL_IF_ATOMIC if for all dependences:
            # test0 OR test1 OR the write is an associative and commutative increment
            is_atomic_parallel = True

            for dep in analysis.scopes[i].d_all:
                test1 = all(dep.is_indep(d) for d in dims)
                if test1:
                    continue

                test0 = len(prev) > 0 and any(dep.is_carried(d) for d in prev)
                if test0:
                    continue

                test2 = all(dep.is_reduce_atmost(d) for d in prev) and dep.is_indep(i.dim)
                if test2:
                    continue

                is_parallel = False
                if not dep.is_increment:
                    # Not even parallelizable with atomics; no need to inspect
                    # the remaining dependences
                    is_atomic_parallel = False
                    break

            if is_parallel:
                properties.setdefault(i, []).append(PARALLEL)
            elif is_atomic_parallel:
                properties.setdefault(i, []).append(PARALLEL_IF_ATOMIC)
            else:
                properties.setdefault(i, []).append(SEQUENTIAL)

    # Reduction (e.g, SEQUENTIAL takes priority over PARALLEL)
    priorities = {PARALLEL: 0, PARALLEL_IF_ATOMIC: 1, SEQUENTIAL: 2}
    properties = OrderedDict([(k, max(v, key=lambda i: priorities[i]))
                              for k, v in properties.items()])

    analysis.update(properties)
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.

    Returns
    -------
    args : dict_like
        The input ``args``, updated in place with the tuned values if at least
        one autotuning run was performed.
    summary : dict
        The number of runs (``'runs'``), the timesteps per run (``'tpr'``) and
        the tuned arguments (``'tuned'``). Empty if autotuning was skipped or
        no run could be performed.

    Raises
    ------
    ValueError
        If ``[level, mode]`` is not among the accepted combinations.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to numpy arrays, which is required
        # to avoid garbage collection to kick in during autotuning and prematurely
        # free the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    # `timings` layout: {nthreads: {tree index: {block shape: elapsed time}}}
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                # Report the run being skipped. No `i` is guaranteed to be
                # bound at this point, so referencing it could raise NameError
                warning("couldn't determine stack size; skipping run %s" % str(run))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that for a given number of threads had the minimum
    # turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        # `min` raises ValueError if `mapper` is empty, i.e. no run was performed
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
def dimensions(self):
    """
    The ``dim`` of each item in ``self``, in iteration order, as returned by
    ``filter_ordered``.
    """
    dims = []
    for item in self:
        dims.append(item.dim)
    return filter_ordered(dims)
def dimensions(self):
    """
    The dimensions of ``self.intervals``, as returned by ``filter_ordered``.
    """
    interval_dims = self.intervals.dimensions
    return filter_ordered(interval_dims)
def indices(self):
    """
    The ``indices`` of all objects in ``self._args_diff``, flattened and
    passed through ``filter_ordered``, as a tuple. Objects without an
    ``indices`` attribute contribute nothing.
    """
    per_arg = (getattr(arg, 'indices', ()) for arg in self._args_diff)
    collected = flatten(per_arg)
    return tuple(filter_ordered(collected))
def dimensions(self):
    """
    The dimensions of ``self.intervals`` followed by the ``parent`` of each
    sub-iterator in ``self.sub_iterators``, as returned by ``filter_ordered``.
    """
    parents = []
    for iterators in self.sub_iterators.values():
        for iterator in iterators:
            parents.append(iterator.parent)
    return filter_ordered(self.intervals.dimensions + parents)
def add_include_dirs(self, dirs):
    """
    Extend ``self.include_dirs`` with ``dirs``, which may be anything that
    ``as_tuple`` accepts. The resulting list is passed through
    ``filter_ordered`` before being stored.
    """
    extended = self.include_dirs + list(as_tuple(dirs))
    self.include_dirs = filter_ordered(extended)
def add_libraries(self, libs):
    """
    Extend ``self.libraries`` with ``libs``, which may be anything that
    ``as_tuple`` accepts. The resulting list is passed through
    ``filter_ordered`` before being stored.
    """
    extended = self.libraries + list(as_tuple(libs))
    self.libraries = filter_ordered(extended)