def _make_copy(self, f, hse, key='', swap=False):
    """
    Build a Callable copying an arbitrary convex region of `f` into a
    contiguous Array ("gather"), or, if `swap=True`, a contiguous Array
    into an arbitrary convex region of `f` ("scatter").
    """
    # The buffer only spans the non-fixed dimensions of `f`
    copied_dims = [d for d in f.dimensions if d not in hse.loc_indices]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in copied_dims]
    buf_indices = [d.root for d in copied_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    # Accesses into `f` are shifted by runtime-supplied offsets; the fixed
    # dimensions are pinned at their offset value
    f_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    f_indices = [o + (d.root if d not in hse.loc_indices else 0)
                 for o, d in zip(f_offsets, f.dimensions)]

    if swap is False:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather%s' % key
    else:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter%s' % key

    # Wrap the copy Expression within one parallel Iteration per buffer
    # dimension, innermost first
    iet = Expression(eq)
    for index, dim in reversed(list(zip(buf_indices, buf_dims))):
        # The -1 below is because an Iteration, by default, generates <=
        iet = Iteration(iet, index, dim.symbolic_size - 1)
        iet = iet._rebuild(properties=PARALLEL)

    parameters = [buf] + list(buf.shape) + [f] + f_offsets
    return Callable(name, iet, 'void', parameters, ('static',))
def exprs(a, b):
    """Build a list of Expression fixtures over the symbols `a` and `b`."""
    equations = [Eq(a, a + b + 5.),
                 Eq(a, b - a),
                 Eq(a, 4 * (b * a)),
                 Eq(a, (6. / b) + (8. * a))]
    return [Expression(eq) for eq in equations]
def exprs(dims):
    """Build Expression fixtures over two 3-point Arrays along dimension `i`."""
    i = dims["i"]
    a = Array(name='a', shape=(3,), dimensions=(i,)).indexify()
    b = Array(name='b', shape=(3,), dimensions=(i,)).indexify()
    equations = [DummyEq(a, a + b + 5.),
                 DummyEq(a, b - a),
                 DummyEq(a, 4 * (b * a)),
                 DummyEq(a, (6. / b) + (8. * a))]
    return [Expression(eq) for eq in equations]
def _make_poke(self, hs, key, msgs):
    """
    Build an efunc which, for every message in `msgs`, probes (`MPI_Test`)
    the pending send/recv requests of all peers and returns 1 iff all of
    them have completed.
    """
    local_flag = Symbol(name='lflag')
    global_flag = Symbol(name='gflag')

    # Init flags
    body = [Expression(DummyEq(local_flag, 0)),
            Expression(DummyEq(global_flag, 1))]

    # For each msg, build an Iteration calling MPI_Test on all peers
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        testrecv = Call('MPI_Test',
                        [rrecv, Byref(local_flag), Macro('MPI_STATUS_IGNORE')])

        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testsend = Call('MPI_Test',
                        [rsend, Byref(local_flag), Macro('MPI_STATUS_IGNORE')])

        # AND-accumulate each local test outcome into the global flag
        update = AugmentedExpression(DummyEq(global_flag, local_flag), '&')

        body.append(Iteration([testsend, update, testrecv, update],
                              dim, msg.npeers - 1))

    body.append(Return(global_flag))

    return make_efunc('pokempi%d' % key, List(body=body), retval='int')
def copy_arrays(mapper, reverse=False):
    """
    Build an Iteration/Expression tree performing the copy ``k = v``, or
    ``v = k`` if reverse=True, for each (k, v) in mapper. (k, v) are expected
    to be of type :class:`IndexedData`. The loop bounds are inferred from
    the dimensions used in ``k``.
    """
    if not mapper:
        return ()

    # Build the Iteration tree for the copy
    iterations = []
    for k, v in mapper.items():
        indices = k.function.indices
        # One Iteration per dimension of `k`, bounded by its shape
        nodes = [Iteration([], dimension=d, limits=s)
                 for s, d in zip(k.shape, indices)]
        lhs, rhs = (v, k) if reverse else (k, v)
        nodes.append(Expression(Eq(lhs[indices], rhs[indices]),
                                dtype=k.function.dtype))
        iterations.append(compose_nodes(nodes))

    # Maybe some Iterations are mergeable
    return MergeOuterIterations().visit(iterations)
def iet_build(stree):
    """
    Construct an Iteration/Expression tree (IET) from a ScheduleTree.
    """
    nsections = 0
    queues = OrderedDict()
    for node in stree.visit():
        if node == stree:
            # The root is visited last, at which point the full IET is ready
            return List(body=queues.pop(node))

        if node.is_Exprs:
            exprs = [Increment(e) if e.is_Increment else Expression(e)
                     for e in node.exprs]
            built = ExpressionBundle(node.ispace, node.ops, node.traffic,
                                     body=exprs)
        elif node.is_Conditional:
            built = Conditional(node.guard, queues.pop(node))
        elif node.is_Iteration:
            built = Iteration(queues.pop(node), node.dim, node.limits,
                              direction=node.direction,
                              properties=node.properties,
                              uindices=node.sub_iterators)
        elif node.is_Section:
            built = Section('section%d' % nsections, body=queues.pop(node))
            nsections += 1
        elif node.is_Halo:
            built = HaloSpot(node.halo_scheme, body=queues.pop(node))

        # Attach the built node to its parent's queue
        queues.setdefault(node.parent, []).append(built)

    assert False
def iet_make(stree):
    """Create an IET from a ScheduleTree."""
    nsections = 0
    queues = OrderedDict()
    for node in stree.visit():
        if node == stree:
            # The root is visited last, at which point the full IET is ready
            return List(body=queues.pop(node))

        if node.is_Exprs:
            exprs = [Increment(e) if e.is_Increment else Expression(e)
                     for e in node.exprs]
            built = ExpressionBundle(node.ispace, node.ops, node.traffic,
                                     body=exprs)
        elif node.is_Conditional:
            built = Conditional(node.guard, queues.pop(node))
        elif node.is_Iteration:
            # Order to ensure deterministic code generation
            uindices = sorted(node.sub_iterators, key=lambda d: d.name)
            # Generate Iteration
            built = Iteration(queues.pop(node), node.dim, node.limits,
                              offsets=node.offsets, direction=node.direction,
                              properties=node.properties, uindices=uindices)
        elif node.is_Section:
            built = Section('section%d' % nsections, body=queues.pop(node))
            nsections += 1
        elif node.is_Halo:
            built = HaloSpot(node.halo_scheme, body=queues.pop(node))

        # Attach the built node to its parent's queue
        queues.setdefault(node.parent, []).append(built)

    assert False
def _make_copy(self, f, hse, key, swap=False):
    """
    Build a Callable copying an arbitrary convex region of `f` into a
    contiguous Array ("gather"), or, if `swap=True`, a contiguous Array
    into an arbitrary convex region of `f` ("scatter").
    """
    # The buffer only spans the non-fixed dimensions of `f`
    copied_dims = [d for d in f.dimensions if d not in hse.loc_indices]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in copied_dims]
    buf_indices = [d.root for d in copied_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype, padding=0)

    # Accesses into `f` are shifted by runtime-supplied offsets; the fixed
    # dimensions are pinned at their offset value
    f_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    f_indices = [o + (d.root if d not in hse.loc_indices else 0)
                 for o, d in zip(f_offsets, f.dimensions)]

    if swap is False:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather_%s' % key
    else:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter_%s' % key

    # Wrap the copy Expression within one Iteration per buffer dimension,
    # innermost first
    iet = Expression(eq)
    for index, dim in reversed(list(zip(buf_indices, buf_dims))):
        # The -1 below is because an Iteration, by default, generates <=
        iet = Iteration(iet, index, dim.symbolic_size - 1,
                        properties=(PARALLEL, AFFINE))

    parameters = [buf] + list(buf.shape) + [f] + f_offsets
    return Callable(name, iet, 'void', parameters, ('static',))
def iet_make(stree):
    """
    Create an Iteration/Expression tree (IET) from a :class:`ScheduleTree`.
    """
    nsections = 0
    queues = OrderedDict()
    for node in stree.visit():
        if node == stree:
            # The root is visited last, at which point the full IET is ready
            return List(body=queues.pop(node))

        if node.is_Exprs:
            exprs = [Expression(e) for e in node.exprs]
            built = [ExpressionBundle(node.shape, node.ops, node.traffic,
                                      body=exprs)]
        elif node.is_Conditional:
            built = [Conditional(node.guard, queues.pop(node))]
        elif node.is_Iteration:
            # Order to ensure deterministic code generation
            uindices = sorted(node.sub_iterators, key=lambda d: d.name)
            # Generate Iteration
            built = [Iteration(queues.pop(node), node.dim, node.dim.limits,
                               offsets=node.limits, direction=node.direction,
                               uindices=uindices)]
        elif node.is_Section:
            built = [Section('section%d' % nsections, body=queues.pop(node))]
            nsections += 1
        elif node.is_Halo:
            built = [HaloSpot(node.halo_scheme, body=queues.pop(node))]

        # Attach the built nodes to the parent's queue
        queues.setdefault(node.parent, []).extend(built)

    assert False
def exprs(a, b, c, d, a_dense, b_dense):
    """Build a list of Expression fixtures over the given symbols."""
    equations = [DummyEq(a, a + b + 5.),
                 DummyEq(a, b*d - a*c),
                 DummyEq(b, a + b*b + 3),
                 DummyEq(a, a*b*d*c),
                 DummyEq(a, 4 * ((b + d) * (a + c))),
                 DummyEq(a, (6. / b) + (8. * a)),
                 DummyEq(a_dense, a_dense + b_dense + 5.)]
    return [Expression(eq) for eq in equations]
def _specialize_iet(self, iet, **kwargs):
    """
    Specialize the IET for the OPS backend: convert the symbols appearing
    within the affine loop nests to ops_dat's, generate one OPS kernel per
    affine tree, and wrap the IET with the OPS init/partition/exit calls.

    Returns a List enclosing the specialized IET.
    """
    warning("The OPS backend is still work-in-progress")

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])
    ops_block = OpsBlock('block')

    # Compute the affine trees once; the result is iterated twice below
    affine_trees = find_affine_trees(iet).items()

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in affine_trees:
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # Fixed: the original `assert (<genexpr>)` asserted a generator object,
    # which is always truthy, so the check never ran; `all(...)` performs
    # the intended dimension-consistency check. Checked here, before
    # `dims[0]` is relied upon.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    # Generate one OPS kernel per affine tree
    for n, (section, trees) in enumerate(affine_trees):
        pre_loop, ops_kernel = opsit(trees, n)
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            ops_exit]
    return List(body=body)
def _schedule_expressions(self, clusters): """Create an Iteartion/Expression tree given an iterable of :class:`Cluster` objects.""" # Build the Iteration/Expression tree processed = [] schedule = OrderedDict() for i in clusters: # Build the Expression objects to be inserted within an Iteration tree expressions = [ Expression(v, np.int32 if i.trace.is_index(k) else self.dtype) for k, v in i.trace.items() ] if not i.stencil.empty: root = None entries = i.stencil.entries # Can I reuse any of the previously scheduled Iterations ? index = 0 for j0, j1 in zip(entries, list(schedule)): if j0 != j1 or j0.dim in clusters.atomics[i]: break root = schedule[j1] index += 1 needed = entries[index:] # Build and insert the required Iterations iters = [ Iteration([], j.dim, j.dim.limits, offsets=j.ofs) for j in needed ] body, tree = compose_nodes(iters + [expressions], retrieve=True) scheduling = OrderedDict(zip(needed, tree)) if root is None: processed.append(body) schedule = scheduling else: nodes = list(root.nodes) + [body] mapper = {root: root._rebuild(nodes, **root.args_frozen)} transformer = Transformer(mapper) processed = list(transformer.visit(processed)) schedule = OrderedDict( list(schedule.items())[:index] + list(scheduling.items())) for k, v in list(schedule.items()): schedule[k] = transformer.rebuilt.get(v, v) else: # No Iterations are needed processed.extend(expressions) return List(body=processed)
def make_grid_accesses(node):
    """
    Construct a new Iteration/Expression based on ``node``, in which all
    :class:`types.Indexed` accesses have been converted into YASK grid
    accesses.
    """

    def make_grid_gets(expr):
        # Replace each Indexed backed by a YASK function with a call to the
        # shared-pointer grid "get" API, recursing into the index expressions
        mapper = {}
        indexeds = retrieve_indexed(expr)
        data_carriers = [i for i in indexeds if i.base.function.from_YASK]
        for i in data_carriers:
            name = namespace['code-grid-name'](i.base.function.name)
            args = [ListInitializer([INT(make_grid_gets(j)) for j in i.indices])]
            mapper[i] = make_sharedptr_funcall(namespace['code-grid-get'],
                                               args, name)
        return expr.xreplace(mapper)

    mapper = {}
    for i, e in enumerate(FindNodes(Expression).visit(node)):
        lhs, rhs = e.expr.args

        # RHS translation
        rhs = make_grid_gets(rhs)

        # LHS translation
        if e.write.from_YASK:
            # Writes to a YASK grid become "put" calls taking the translated
            # RHS plus the (translated) access indices
            name = namespace['code-grid-name'](e.write.name)
            args = [rhs]
            args += [ListInitializer([INT(make_grid_gets(i)) for i in lhs.indices])]
            handle = make_sharedptr_funcall(namespace['code-grid-put'], args, name)
            # NOTE(review): `e.is_increment` (lowercase) — elsewhere in this
            # file the flag is spelled `is_Increment`; confirm the attribute name
            processed = ForeignExpression(handle, e.dtype,
                                          is_Increment=e.is_increment)
        else:
            # Writing to a scalar temporary
            processed = Expression(e.expr.func(lhs, rhs))

        mapper.update({e: processed})

    return Transformer(mapper).visit(node)
def iet_make(stree):
    """
    Create an Iteration/Expression tree (IET) from a :class:`ScheduleTree`.
    """
    nsections = 0
    queues = OrderedDict()
    for node in stree.visit():
        if node == stree:
            # The root is visited last, at which point the full IET is ready
            return List(body=queues.pop(node))

        if node.is_Exprs:
            exprs = [Expression(e) for e in node.exprs]
            built = [ExpressionBundle(node.shape, node.ops, node.traffic,
                                      body=exprs)]
        elif node.is_Conditional:
            built = [Conditional(node.guard, queues.pop(node))]
        elif node.is_Iteration:
            # Generate `uindices`
            uindices = []
            for d, offs in node.sub_iterators:
                modulo = len(offs)
                for n, o in enumerate(filter_ordered(offs)):
                    value = (node.dim + o) % modulo
                    symbol = Scalar(name="%s%d" % (d.name, n), dtype=np.int32)
                    uindices.append(UnboundedIndex(symbol, value, value, d, d + o))
            # Generate Iteration
            built = [Iteration(queues.pop(node), node.dim, node.dim.limits,
                               offsets=node.limits, direction=node.direction,
                               uindices=uindices)]
        elif node.is_Section:
            built = [Section('section%d' % nsections, body=queues.pop(node))]
            nsections += 1

        # Attach the built nodes to the parent's queue
        queues.setdefault(node.parent, []).extend(built)

    assert False
def _make_copy(self, f, fixed, swap=False):
    """
    Construct a Callable performing a copy of:

        * an arbitrary convex region of ``f`` into a contiguous Array, OR
        * if ``swap=True``, a contiguous Array into an arbitrary convex
          region of ``f``.
    """
    # The buffer only spans the non-fixed dimensions of `f`
    copied_dims = [d for d in f.dimensions if d not in fixed]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in copied_dims]
    buf_indices = [d.root for d in copied_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    # Accesses into `f` are shifted by runtime-supplied offsets; fixed
    # dimensions are pinned at their offset value
    f_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    f_indices = [o + (d.root if d not in fixed else 0)
                 for o, d in zip(f_offsets, f.dimensions)]

    if swap is False:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather%dd' % f.ndim
    else:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter%dd' % f.ndim

    # Wrap the copy Expression within one parallel Iteration per buffer
    # dimension, innermost first
    iet = Expression(eq)
    for index, dim in reversed(list(zip(buf_indices, buf_dims))):
        # The -1 below is because an Iteration, by default, generates <=
        iet = Iteration(iet, index, dim.symbolic_size - 1, properties=PARALLEL)
    iet = List(body=[ArrayCast(f), ArrayCast(buf), iet])

    # Optimize the memory copy with the DLE
    from devito.dle import transform
    state = transform(iet, 'simd', {'openmp': self._threaded})

    parameters = [buf] + list(buf.shape) + [f] + f_offsets + state.input
    return Callable(name, state.nodes, 'void',
                    parameters, ('static', )), state.input
def copy(f, fixed, swap=False):
    """
    Construct a :class:`Callable` capable of copying: ::

        * an arbitrary convex region of ``f`` into a contiguous
          :class:`Array`, OR
        * if ``swap=True``, a contiguous :class:`Array` into an arbitrary
          convex region of ``f``.
    """
    # The buffer only spans the non-fixed dimensions of `f`
    copied_dims = [d for d in f.dimensions if d not in fixed]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in copied_dims]
    buf_indices = [d.root for d in copied_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    # `dat` mirrors `f`; its accesses are shifted by runtime-supplied offsets
    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    dat_indices = [o + (d.root if d not in fixed else 0)
                   for o, d in zip(dat_offsets, f.dimensions)]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype)

    if swap is False:
        eq = DummyEq(buf[buf_indices], dat[dat_indices])
        name = 'gather_%s' % f.name
    else:
        eq = DummyEq(dat[dat_indices], buf[buf_indices])
        name = 'scatter_%s' % f.name

    # Wrap the copy Expression within one Iteration per buffer dimension,
    # innermost first
    iet = Expression(eq)
    for index, dim in reversed(list(zip(buf_indices, buf_dims))):
        iet = Iteration(iet, index, dim.symbolic_size - 1)  # -1 as Iteration generates <=
    iet = List(body=[ArrayCast(dat), ArrayCast(buf), iet])

    parameters = [buf] + list(buf.shape) + [dat] + list(dat.shape) + dat_offsets
    return Callable(name, iet, 'void', parameters, ('static', ))
def test_loops_collapsed(fe, t0, t1, t2, t3, exprs, expected, iters):
    """Check which Iterations get an `omp for collapse` pragma after DLE."""
    scope = [fe, t0, t1, t2, t3]
    node_exprs = [Expression(DummyEq(EVAL(e, *scope))) for e in exprs]

    # Build a three-deep loop nest and run the analysis + OpenMP pass
    ast = iters[6](iters[7](iters[8](node_exprs)))
    ast = iet_analyze(ast)
    nodes = transform(ast, mode='openmp').nodes

    iterations = FindNodes(Iteration).visit(nodes)
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, collapsed in zip(iterations, expected):
        pragmas = iteration.pragmas
        if collapsed is True:
            assert len(pragmas) == 1
            assert 'omp for collapse' in pragmas[0].value
        else:
            assert all('omp for collapse' not in p.value for p in pragmas)
def test_iterations_ompized(self, fa, fb, fc, fd, t0, t1, t2, t3, exprs,
                            expected, iters):
    """Check which Iterations get an `omp for` pragma after DLE."""
    scope = [fa, fb, fc, fd, t0, t1, t2, t3]
    node_exprs = [Expression(DummyEq(EVAL(e, *scope))) for e in exprs]

    # Build a two-deep loop nest and run the analysis + OpenMP pass
    ast = iters[6](iters[7](node_exprs))
    ast = iet_analyze(ast)
    iet, _ = transform(ast, mode='openmp')

    iterations = FindNodes(Iteration).visit(iet)
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, parallelized in zip(iterations, expected):
        pragmas = iteration.pragmas
        if parallelized is True:
            assert len(pragmas) == 1
            assert 'omp for' in pragmas[0].value
        else:
            assert all('omp for' not in p.value for p in pragmas)
def test_conditional(self, fc): then_body = Expression(DummyEq(fc[x, y], fc[x, y] + 1)) else_body = Expression(DummyEq(fc[x, y], fc[x, y] + 2)) conditional = Conditional(x < 3, then_body, else_body) assert str(conditional) == """\
def iet_make(clusters, dtype):
    """
    Create an Iteration/Expression tree (IET) given an iterable of
    :class:`Cluster`s.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    # {Interval -> Iteration}, carried across clusters so that consecutive
    # clusters sharing a loop prefix reuse the same Iterations
    schedule = OrderedDict()
    for cluster in clusters:
        if not cluster.ispace.empty:
            root = None
            intervals = cluster.ispace.intervals
            # Can I reuse any of the previously scheduled Iterations ?
            # Walk the shared prefix; stop at the first mismatch or at an
            # atomic dimension (which must not be fused across clusters)
            index = 0
            for i0, i1 in zip(intervals, list(schedule)):
                if i0 != i1 or i0.dim in clusters.atomics[cluster]:
                    break
                root = schedule[i1]
                index += 1
            needed = intervals[index:]
            # Build Iterations, including any necessary unbounded index
            iters = []
            for i in needed:
                uindices = []
                for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                    for n, o in enumerate(filter_ordered(offs)):
                        name = "%s%d" % (j.name, n)
                        vname = Scalar(name=name, dtype=np.int32)
                        value = (i.dim + o) % j.modulo
                        uindices.append(UnboundedIndex(vname, value, value, j, j + o))
                iters.append(Iteration([], i.dim, i.dim.limits, offsets=i.limits,
                                       uindices=uindices))
            # Build Expressions
            exprs = [Expression(v, np.int32 if cluster.trace.is_index(k) else dtype)
                     for k, v in cluster.trace.items()]
            # Compose Iterations and Expressions
            body, tree = compose_nodes(iters + [exprs], retrieve=True)
            # Update the current scheduling
            scheduling = OrderedDict(zip(needed, tree))
            if root is None:
                # Nothing reusable: append a brand-new loop nest
                processed.append(body)
                schedule = scheduling
            else:
                # Graft the new sub-nest onto the deepest reusable Iteration,
                # then remap the schedule to the rebuilt nodes
                nodes = list(root.nodes) + [body]
                mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                transformer = Transformer(mapper)
                processed = list(transformer.visit(processed))
                schedule = OrderedDict(list(schedule.items())[:index] +
                                       list(scheduling.items()))
                for k, v in list(schedule.items()):
                    schedule[k] = transformer.rebuilt.get(v, v)
        else:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])
    return List(body=processed)
def _loop_fission(self, nodes, state):
    """
    Apply loop fission to innermost :class:`Iteration` objects. This pass
    is not applied if the number of statements in an Iteration's body is
    lower than ``self.thresholds['fission'].``
    """
    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        if len(tree) <= 1:
            # Heuristically avoided
            continue
        candidate = tree[-1]
        expressions = [e for e in candidate.nodes if e.is_Expression]
        if len(expressions) < self.thresholds['max_fission']:
            # Heuristically avoided
            continue
        if len(expressions) != len(candidate.nodes):
            # Dangerous for correctness
            continue
        # Collect the functions accessed across all candidate expressions
        functions = list(set.union(*[set(e.functions) for e in expressions]))
        wrapped = [e.expr for e in expressions]
        if not functions or not wrapped:
            # Heuristically avoided
            continue
        # Promote temporaries from scalar to tensors
        handle = functions[0]
        dim = handle.indices[-1]
        size = handle.shape[-1]
        if any(dim != i.indices[-1] for i in functions):
            # Dangerous for correctness: all functions must share the same
            # innermost dimension for the promotion to be sound
            continue
        wrapped = promote_scalar_expressions(wrapped, (size, ), (dim, ), True)
        assert len(wrapped) == len(expressions)
        rebuilt = [Expression(s, e.dtype) for s, e in zip(wrapped, expressions)]
        # Group statements
        # TODO: Need a heuristic here to maximize reuse
        args_frozen = candidate.args_frozen
        properties = as_tuple(args_frozen['properties']) + (ELEMENTAL, )
        args_frozen['properties'] = properties
        n = self.thresholds['min_fission']
        # Each group of `n` expressions becomes its own (fissioned) Iteration
        fissioned = [Iteration(g, **args_frozen) for g in grouper(rebuilt, n)]
        mapper[candidate] = List(body=fissioned)
    processed = Transformer(mapper).visit(nodes)
    return processed, {}
def iet_make(clusters, dtype):
    """
    Create an Iteration/Expression tree (IET) given an iterable of
    :class:`Cluster`s.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    # {Interval -> Iteration}, carried across clusters so that consecutive
    # clusters sharing a loop prefix reuse the same Iterations
    schedule = OrderedDict()
    for cluster in clusters:
        if not cluster.ispace.empty:
            root = None
            intervals = cluster.ispace.intervals
            # Can I reuse any of the previously scheduled Iterations ?
            # Walk the shared prefix; stop at the first mismatch or at an
            # atomic dimension (which must not be fused across clusters)
            index = 0
            for i0, i1 in zip(intervals, list(schedule)):
                if i0 != i1 or i0.dim in cluster.atomics:
                    break
                root = schedule[i1]
                index += 1
            needed = intervals[index:]
            # Build Expressions
            body = [Expression(e, np.int32 if cluster.trace.is_index(e.lhs) else dtype)
                    for e in cluster.exprs]
            if not needed:
                body = List(body=body)
            # Build Iterations
            scheduling = []
            for i in reversed(needed):
                # Prepare any necessary unbounded index
                uindices = []
                for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                    modulo = len(offs)
                    for n, o in enumerate(filter_ordered(offs)):
                        name = "%s%d" % (j.name, n)
                        vname = Scalar(name=name, dtype=np.int32)
                        value = (i.dim + o) % modulo
                        uindices.append(UnboundedIndex(vname, value, value, j, j + o))
                # Retrieve the iteration direction
                direction = cluster.ispace.directions[i.dim]
                # Update IET and scheduling
                if i.dim in cluster.guards:
                    # Must wrap within an if-then scope
                    body = Conditional(cluster.guards[i.dim], body)
                    iteration = Iteration(body, i.dim, i.dim.limits, offsets=i.limits,
                                          direction=direction, uindices=uindices)
                    # Adding (None, None) ensures that nested iterations won't
                    # be reused by the next cluster
                    scheduling.extend([(None, None), (i, iteration)])
                else:
                    iteration = Iteration(body, i.dim, i.dim.limits, offsets=i.limits,
                                          direction=direction, uindices=uindices)
                    scheduling.append((i, iteration))
                # Prepare for next dimension
                body = iteration
            # If /needed/ is != [], root.dim might be a guarded dimension for /cluster/
            if root is not None and root.dim in cluster.guards:
                body = Conditional(cluster.guards[root.dim], body)
            # Update the current schedule
            # `scheduling` was built innermost-first; restore outer-to-inner order
            scheduling = OrderedDict(reversed(scheduling))
            if root is None:
                # Nothing reusable: append a brand-new loop nest
                processed.append(body)
                schedule = scheduling
            else:
                # Graft the new sub-nest onto the deepest reusable Iteration,
                # then remap the schedule to the rebuilt nodes
                nodes = list(root.nodes) + [body]
                mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                transformer = Transformer(mapper)
                processed = list(transformer.visit(processed))
                schedule = OrderedDict(list(schedule.items())[:index] +
                                       list(scheduling.items()))
                for k, v in list(schedule.items()):
                    schedule[k] = transformer.rebuilt.get(v, v)
        else:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])
    return List(body=processed)
def iet_make(clusters): """ Create an Iteration/Expression tree (IET) given an iterable of :class:`Cluster`s. :param clusters: The iterable :class:`Cluster`s for which the IET is built. """ # {Iteration -> [c0, c1, ...]}, shared clusters shared = {} # The constructed IET processed = [] # {Interval -> Iteration}, carried from preceding cluster schedule = OrderedDict() # Build IET for cluster in clusters: body = [Expression(e) for e in cluster.exprs] if cluster.ispace.empty: # No Iterations are needed processed.extend(body) continue root = None itintervals = cluster.ispace.iteration_intervals # Can I reuse any of the previously scheduled Iterations ? index = 0 for i0, i1 in zip(itintervals, list(schedule)): if i0 != i1 or i0.dim in cluster.atomics: break root = schedule[i1] index += 1 needed = itintervals[index:] # Build Expressions if not needed: body = List(body=body) # Build Iterations scheduling = [] for i in reversed(needed): # Update IET and scheduling if i.dim in cluster.guards: # Must wrap within an if-then scope body = Conditional(cluster.guards[i.dim], body) # Adding (None, None) ensures that nested iterations won't # be reused by the next cluster scheduling.insert(0, (None, None)) iteration = Iteration(body, i.dim, i.dim.limits, offsets=i.limits, direction=i.direction) scheduling.insert(0, (i, iteration)) # Prepare for next dimension body = iteration # If /needed/ is != [], root.dim might be a guarded dimension for /cluster/ if root is not None and root.dim in cluster.guards: body = Conditional(cluster.guards[root.dim], body) # Update the current schedule if root is None: processed.append(body) else: nodes = list(root.nodes) + [body] transf = Transformer( {root: root._rebuild(nodes, **root.args_frozen)}) processed = list(transf.visit(processed)) scheduling = list(schedule.items())[:index] + list(scheduling) scheduling = [(k, transf.rebuilt.get(v, v)) for k, v in scheduling] shared = {transf.rebuilt.get(k, k): v for k, v in shared.items()} schedule = 
OrderedDict(scheduling) # Record that /cluster/ was used to build the iterations in /schedule/ shared.update( {i: shared.get(i, []) + [cluster] for i in schedule.values() if i}) iet = List(body=processed) # Add in unbounded indices, if needed mapper = {} for k, v in shared.items(): uindices = [] ispace = IterationSpace.merge(*[i.ispace.project([k.dim]) for i in v]) for j, offs in ispace.sub_iterators.get(k.dim, []): modulo = len(offs) for n, o in enumerate(filter_ordered(offs)): name = "%s%d" % (j.name, n) vname = Scalar(name=name, dtype=np.int32) value = (k.dim + o) % modulo uindices.append(UnboundedIndex(vname, value, value, j, j + o)) mapper[k] = k._rebuild(uindices=uindices) iet = NestedTransformer(mapper).visit(iet) return iet
def _specialize_iet(self, iet, **kwargs):
    """
    Specialize the IET for the OPS backend: convert the symbols appearing
    within the affine loop nests to ops_dat's, generate one OPS kernel per
    affine tree (replacing the tree with an ops_par_loop call), and wrap
    the IET with the OPS init/partition/exit calls plus the device-to-host
    fetches needed to return results to Devito.

    Returns a List enclosing the specialized IET, or the IET unchanged if
    no affine trees were found.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Fixed: the original `assert (<genexpr>)` asserted a generator object,
    # which is always truthy, so the check never ran; `all(...)` performs
    # the intended dimension-consistency check. Checked here, before
    # `dims[0]` is relied upon.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory.
        after_time_loop.extend(create_ops_fetch(
            f, name_to_ops_dat, self.time_dimension.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])

        pre_time_loop.extend(pre_loop)
        self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
            MetaCall(ops_kernel, False)
        # Replace the tree's root with the ops_par_loop call and drop the
        # remaining trees of the same section
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop,
            ops_partition, iet, *after_time_loop, ops_exit]
    return List(body=body)
def opsit(trees, count):
    """
    Given the iteration trees of one offloadable section, build the OPS
    kernel Callable plus the declarations and the ops_par_loop call needed
    to invoke it.

    Returns a 4-tuple: (kernel Callable, setup nodes to run before the time
    loop, a List wrapping the ops_par_loop Call, the number of iteration
    dimensions).
    """
    node_factory = OPSNodeFactory()
    expressions = []
    for tree in trees:
        expressions.extend(FindNodes(Expression).visit(tree.inner))

    # Derive the iteration range/dimensionality from the (last) IterationTree
    it_range = []
    it_dims = 0
    for tree in trees:
        if isinstance(tree, IterationTree):
            it_range = [it.bounds() for it in tree]
            it_dims = len(tree)

    block = OPSBlock(namespace['ops_block'](count))
    block_init = Element(cgen.Initializer(
        block,
        Call("ops_decl_block", [it_dims, String(block.name)], False)))

    # Translate each expression into its OPS AST, collecting the grid
    # accesses along the way (reverse order so inserts keep original order)
    ops_expressions = []
    accesses = defaultdict(set)

    for i in reversed(expressions):
        extend_accesses(accesses, get_accesses(i.expr))
        ops_expressions.insert(0, Expression(make_ops_ast(i.expr, node_factory)))

    ops_stencils_initializers, ops_stencils = generate_ops_stencils(accesses)

    # Symbols defined within the section must not appear as kernel parameters
    to_remove = [f.name for f in
                 FindSymbols('defines').visit(List(body=expressions))]

    parameters = FindSymbols('symbolics').visit(List(body=ops_expressions))
    parameters = [p for p in parameters
                  if p.name != 'OPS_ACC_size' and p.name not in to_remove]
    parameters = sorted(parameters, key=lambda i: (i.is_Constant, i.name))

    arguments = FindSymbols('symbolics').visit(List(body=expressions))
    arguments = [a for a in arguments if a.name not in to_remove]
    arguments = sorted(arguments, key=lambda i: (i.is_Constant, i.name))

    # Rewrite the accesses using the OPS_ACC positional macros
    ops_expressions = [
        Expression(fix_ops_acc(e.expr, [p.name for p in parameters]))
        for e in ops_expressions]

    callable_kernel = Callable(namespace['ops_kernel'](count),
                               ops_expressions, "void", parameters)

    # Declare one ops_dat per non-constant kernel argument
    dat_declarations = []
    argname_to_dat = {}

    for a in arguments:
        if a.is_Constant:
            continue
        dat_dec, dat_sym = to_ops_dat(a, block)
        dat_declarations.extend(dat_dec)
        argname_to_dat.update(dat_sym)

    # Flatten the (min, max) bounds into the ops_par_loop range array
    par_loop_range_arr = SymbolicArray(name=namespace['ops_range'](count),
                                       dimensions=(len(it_range) * 2, ),
                                       dtype=np.int32)
    range_vals = []
    for mn, mx in it_range:
        range_vals.append(mn)
        range_vals.append(mx)
    par_loop_range_init = Expression(ClusterizedEq(
        Eq(par_loop_range_arr, ListInitializer(range_vals))))

    ops_args = get_ops_args([p for p in parameters], ops_stencils,
                            argname_to_dat)

    par_loop = Call("ops_par_loop", [
        FunctionPointer(callable_kernel.name),
        String(callable_kernel.name), block, it_dims, par_loop_range_arr,
        *ops_args])

    return (callable_kernel,
            [par_loop_range_init, block_init] + ops_stencils_initializers +
            dat_declarations + [Call("ops_partition", [String("")])],
            List(body=[par_loop]),
            it_dims)
def to_ops_dat(function, block):
    """
    Build the ops_decl_dat declaration(s) for `function` within `block`.

    Returns a 2-tuple: (list of IET nodes declaring/initializing the
    dat(s) and their dim/base/d_p/d_m descriptor arrays, {name -> ops_dat
    access} mapper). A TimeFunction yields one ops_dat per time slot;
    any other function yields a single ops_dat.
    """
    # The time dimension, if any, is handled separately from the dat shape
    ndim = function.ndim - (1 if function.is_TimeFunction else 0)
    # Descriptor arrays passed to ops_decl_dat: shape, base offsets and
    # positive/negative paddings per dimension
    dim = SymbolicArray(name="%s_dim" % function.name,
                        dimensions=(ndim, ),
                        dtype=np.int32)
    base = SymbolicArray(name="%s_base" % function.name,
                         dimensions=(ndim, ),
                         dtype=np.int32)
    d_p = SymbolicArray(name="%s_d_p" % function.name,
                        dimensions=(ndim, ),
                        dtype=np.int32)
    d_m = SymbolicArray(name="%s_d_m" % function.name,
                        dimensions=(ndim, ),
                        dtype=np.int32)

    res = []
    dats = {}
    ops_decl_dat_call = []

    if function.is_TimeFunction:
        # One ops_dat per time slot, stored in a C array of ops_dat
        time_pos = function._time_position
        time_index = function.indices[time_pos]
        time_dims = function.shape[time_pos]

        # Strip the time dimension out of shape/padding/halo
        dim_shape = function.shape[:time_pos] + function.shape[time_pos + 1:]
        padding = function.padding[:time_pos] + function.padding[time_pos + 1:]
        halo = function.halo[:time_pos] + function.halo[time_pos + 1:]
        base_val = [0 for i in range(ndim)]
        # Positive/negative boundary depths: padding plus halo on each side
        d_p_val = tuple([p[0] + h[0] for p, h in zip(padding, halo)])
        d_m_val = tuple([-(p[1] + h[1]) for p, h in zip(padding, halo)])

        ops_dat_array = SymbolicArray(
            name="%s_dat" % function.name,
            dimensions=[time_dims],
            dtype="ops_dat",
        )

        ops_decl_dat_call.append(
            Element(
                cgen.Statement("%s %s[%s]" % (ops_dat_array.dtype,
                                              ops_dat_array.name, time_dims))))

        for i in range(time_dims):
            access = FunctionTimeAccess(function, i)
            ops_dat_access = ArrayAccess(ops_dat_array, i)
            call = Call("ops_decl_dat", [
                block, 1, dim, base, d_m, d_p, access,
                String(function._C_typedata),
                String("%s%s%s" % (function.name, time_index, i))
            ], False)
            dats["%s%s%s" % (function.name, time_index, i)] = ArrayAccess(
                ops_dat_array, Symbol("%s%s" % (time_index, i)))
            ops_decl_dat_call.append(Element(cgen.Assign(ops_dat_access, call)))
    else:
        # A single ops_dat covering the whole function
        ops_dat = OPSDat("%s_dat" % function.name)
        dats[function.name] = ops_dat

        d_p_val = tuple(
            [p[0] + h[0] for p, h in zip(function.padding, function.halo)])
        d_m_val = tuple(
            [-(p[1] + h[1]) for p, h in zip(function.padding, function.halo)])
        dim_shape = function.shape
        base_val = [0 for i in function.shape]

        ops_decl_dat_call.append(
            Element(
                cgen.Initializer(
                    ops_dat,
                    Call("ops_decl_dat", [
                        block, 1, dim, base, d_m, d_p,
                        FunctionTimeAccess(function, 0),
                        String(function._C_typedata),
                        String(function.name)
                    ], False))))

    # Initialize the descriptor arrays, then emit the dat declarations
    res.append(Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_shape)))))
    res.append(Expression(ClusterizedEq(Eq(base, ListInitializer(base_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val)))))
    res.extend(ops_decl_dat_call)

    return res, dats
def test_conditional(self, fc, grid): x, y, _ = grid.dimensions then_body = Expression(DummyEq(fc[x, y], fc[x, y] + 1)) else_body = Expression(DummyEq(fc[x, y], fc[x, y] + 2)) conditional = Conditional(x < 3, then_body, else_body) assert str(conditional) == """\
def make_ops_kernels(iet):
    """
    Offload the affine loop nests of `iet` to OPS: convert the symbols
    appearing within them to ops_dat's, generate one OPS kernel per affine
    tree (replacing the tree with an ops_par_loop call), and wrap the IET
    body with the OPS init/partition/exit calls plus the device-to-host
    fetches needed to return results to Devito.

    Returns a 2-tuple: (transformed IET, metadata dict with the `includes`,
    `ffuncs` and `headers` required by the generated code). The IET is
    returned unchanged (with empty metadata) if no affine trees were found.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet, {}

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Fixed: the original `assert (<genexpr>)` asserted a generator object,
    # which is always truthy, so the check never ran; `all(...)` performs
    # the intended dimension-consistency check. Checked here, before
    # `dims[0]` is relied upon.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # Copy data from device to host
        after_time_loop.extend(create_ops_fetch(
            f, name_to_ops_dat, f.grid.time_dim.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    ffuncs = []
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])

        pre_time_loop.extend(pre_loop)
        ffuncs.append(ops_kernel)
        # Replace the tree's root with the ops_par_loop call and drop the
        # remaining trees of the same section
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    iet = iet._rebuild(body=flatten([
        ops_init, ops_block_init, pre_time_loop, ops_partition, iet.body,
        after_time_loop, ops_exit
    ]))

    return iet, {
        'includes': ['stdio.h', 'ops_seq.h'],
        'ffuncs': ffuncs,
        'headers': [namespace['ops_define_dimension'](dims[0])]
    }