def _padding(self, nodes, state):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input
    arrays.

    :param nodes: The Iteration/Expression tree to be transformed.
    :param state: Not used by this pass; accepted for interface uniformity.
    :returns: A 2-tuple ``(nodes, {})``. ``nodes`` is returned untouched if
              no written symbol qualifies for padding; otherwise it is a
              ``List`` made of the initialization of the padded buffers,
              the transformed tree, and the copy-back of the padded buffers.
    """
    # Assess feasibility of the transformation
    handle = FindSymbols('symbolics-writes').visit(nodes)
    if not handle:
        return nodes, {}

    # The reference shape is the one with the highest number of dimensions
    shape = max([i.shape for i in handle], key=len)
    if not shape:
        return nodes, {}

    # Only the symbols whose innermost size matches that of the reference
    # shape are padded
    candidates = [i for i in handle if i.shape[-1] == shape[-1]]
    if not candidates:
        return nodes, {}

    # Retrieve the maximum number of items in a SIMD register when processing
    # the expressions in /node/
    exprs = FindNodes(Expression).visit(nodes)
    exprs = [e for e in exprs if e.write in candidates]
    assert len(exprs) > 0
    dtype = exprs[0].dtype
    assert all(e.dtype == dtype for e in exprs)
    try:
        simd_items = get_simd_items(dtype)
    except KeyError:
        # Fallback to the AVX512 register size (maximum expectable padding).
        # Floor division is required: a true division would yield a float,
        # which would then leak into the shape of the padded buffers.
        simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

    # Map each original array onto a buffer whose innermost dimension is
    # rounded up to a multiple of /simd_items/
    mapper = OrderedDict()
    for k in candidates:
        padded_shape = k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
        mapper[k.indexed] = Array(name='p%s' % k.name, shape=padded_shape,
                                  dimensions=k.indices,
                                  onstack=k._mem_stack).indexed

    # Substitute original arrays with padded buffers
    processed = SubstituteExpression(mapper).visit(nodes)

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicFunction])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = List(body=init + as_tuple(processed) + copyback)

    return processed, {}
def simple_function_with_paddable_arrays(a_dense, b_dense, exprs, iters):
    """
    Build the Callable ``void foo(a_dense, b_dense)`` whose body is the
    perfect nest::

        iters[0]
          iters[1]
            iters[2]
              exprs[6]

    Time stepping is resolved and the resulting substitutions are applied
    before the Callable is returned.
    """
    arguments = [arr.base.function for arr in [a_dense, b_dense]]

    # Innermost-to-outermost construction of the nest
    outer = iters[0]
    middle = iters[1]
    inner = iters[2]
    nest = outer(middle(inner(exprs[6])))

    kernel = Callable('foo', nest, 'void', arguments, ())
    kernel, replacements = ResolveTimeStepping().visit(kernel)
    return SubstituteExpression(subs=replacements).visit(kernel)
def simple_function_fissionable(a, b, exprs, iters):
    """
    Build the Callable ``void foo(a, b)`` whose body is the nest::

        iters[0]
          iters[1]
            iters[2]
              exprs[0]
              exprs[2]

    Time stepping is resolved and the resulting substitutions are applied
    before the Callable is returned.
    """
    arguments = [arr.base.function for arr in [a, b]]

    # Two expressions share the innermost level of the nest
    outer = iters[0]
    middle = iters[1]
    inner = iters[2]
    innermost_body = [exprs[0], exprs[2]]
    nest = outer(middle(inner(innermost_body)))

    kernel = Callable('foo', nest, 'void', arguments, ())
    kernel, replacements = ResolveTimeStepping().visit(kernel)
    return SubstituteExpression(subs=replacements).visit(kernel)
def complex_function(a, b, c, d, exprs, iters):
    """
    Build the Callable ``void foo(a, b, c, d)`` whose body is the imperfect
    nest::

        iters[0]
          iters[3]
            exprs[2]
          iters[1]
            iters[2]
              exprs[3]
              exprs[4]
          iters[4]
            exprs[5]

    Time stepping is resolved and the resulting substitutions are applied
    before the Callable is returned.
    """
    arguments = [arr.base.function for arr in [a, b, c, d]]

    # The outermost level wraps three sibling sub-trees, built in order
    outermost = iters[0]
    head = iters[3](exprs[2])
    core = iters[1](iters[2]([exprs[3], exprs[4]]))
    tail = iters[4](exprs[5])
    nest = outermost([head, core, tail])

    kernel = Callable('foo', nest, 'void', arguments, ())
    kernel, replacements = ResolveTimeStepping().visit(kernel)
    return SubstituteExpression(subs=replacements).visit(kernel)
def iet_build(clusters, dtype):
    """
    Lower an iterable of :class:`Cluster`s into an Iteration/Expression
    tree (IET).

    The tree is first constructed, then run through data dependence
    analysis (which decorates its nodes with properties), and finally has
    its derived dimensions replaced by :class:`LoweredDimension`s.
    """
    # Clusters -> Iteration/Expression tree, with properties attached
    # directly to the nodes by the data dependence analysis
    analyzed = iet_analyze(iet_make(clusters, dtype))

    # Collect the derived dimensions to be substituted (e.g., t -> t0,
    # t + 1 -> t1). Doing this only now keeps /iet_analyze/ simpler
    lowered = {}
    for tree in retrieve_iteration_tree(analyzed):
        for uindex in flatten(i.uindices for i in tree):
            lowered[uindex.expr] = LoweredDimension(name=uindex.index.name,
                                                    origin=uindex.expr)

    return SubstituteExpression(lowered).visit(analyzed)
def __init__(self, expressions, **kwargs):
    """
    Lower a sequence of SymPy equations into a schedulable tree of nodes and
    finish by delegating to the parent class constructor.

    :param expressions: One or more ``sympy.Eq`` objects (a single object
                        is accepted too, as it is passed through
                        ``as_tuple``).
    :param kwargs: Optional settings read by this constructor:

        * ``name``: Name given to the operator; defaults to ``"Kernel"``.
        * ``subs``: Substitutions applied to each expression via
          ``xreplace``; defaults to an empty dict.
        * ``time_axis``: Compared against ``Backward`` to set ``reverse``
          on the non-stepping time dimensions; defaults to ``Forward``.
        * ``dse``: Mode for the symbolic engine; defaults to
          ``configuration['dse']``.
        * ``dle``: Mode for the loop engine; defaults to
          ``configuration['dle']``.

    :raises InvalidOperator: If any item in ``expressions`` is not a
                             ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc. Copies are taken so that later extensions do not
    # modify the defaults in place
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering
    expressions = [indexify(s) for s in expressions]
    expressions = [s.xreplace(subs) for s in expressions]

    # Analysis
    self.dtype = retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = retrieve_symbols(
        expressions)
    stencils = make_stencils(expressions)
    self.offsets = {
        d.end_name: v
        for d, v in retrieve_offsets(stencils).items()
    }

    # Set the direction of time according to the given TimeAxis
    for time in [d for d in self.dimensions if d.is_Time]:
        if not time.is_Stepping:
            time.reverse = time_axis == Backward

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = self.input + self.dimensions

    # Group expressions based on their Stencil and data dependences
    clusters = clusterize(expressions, stencils)

    # Apply the Devito Symbolic Engine (DSE) for symbolic optimization
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Wrap expressions with Iterations according to dimensions
    nodes = self._schedule_expressions(clusters)

    # Data dependency analysis. Properties are attached directly to nodes
    nodes = analyze_iterations(nodes)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Resolve and substitute dimensions for loop index variables.
    # Note that /subs/ is rebound here; the user-provided substitutions
    # have already been applied during expression lowering
    nodes, subs = ResolveTimeStepping().visit(nodes)
    nodes = SubstituteExpression(subs=subs).visit(nodes)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(nodes, parameters)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE: the arguments it
    # produced are appended to both the parameters and, when they are
    # Dimensions, to the known dimensions
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(
        OrderedDict([(i.name, FunMeta(i, True))
                     for i in dle_state.elemental_functions]))
    parameters.extend([i.argument for i in self.dle_arguments])
    self.dimensions.extend([
        i.argument for i in self.dle_arguments
        if isinstance(i.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Introduce all required C declarations
    nodes = self._insert_declarations(dle_state.nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())