def _nontemporal_stores(self, nodes):
    """
    Add compiler-specific pragmas and instructions to generate nontemporal
    stores (i.e., non-cached stores).
    """
    pragma = self._backend_compiler_pragma('ntstores')
    fence = self._backend_compiler_pragma('storefence')
    if not pragma or not fence:
        # Nothing to do with this backend compiler
        return nodes, {}

    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        for i in tree:
            if i.is_Parallel:
                mapper[i] = List(body=i, footer=fence)
                break
    processed = Transformer(mapper).visit(nodes)

    mapper = {}
    for tree in retrieve_iteration_tree(processed):
        for i in tree:
            if i.is_Vectorizable:
                mapper[i] = List(header=pragma, body=i)
    processed = Transformer(mapper).visit(processed)

    return processed, {}

def test_cache_blocking_structure(blockinner, exp_calls, exp_iters):
    # Check code structure
    _, op = _new_operator1((10, 31, 45), dle=('blocking', {'blockalways': True,
                                                           'blockinner': blockinner}))
    calls = FindNodes(Call).visit(op)
    assert len(calls) == exp_calls
    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 1
    assert len(trees[0]) == exp_iters

    # Check presence of openmp pragmas at the right place
    _, op = _new_operator1((10, 31, 45), dle=('blocking', {'openmp': True,
                                                           'blockalways': True,
                                                           'blockinner': blockinner}))
    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 1
    tree = trees[0]
    assert len(tree.root.pragmas) == 1
    assert 'omp for' in tree.root.pragmas[0].value
    # Also, with omp parallelism enabled, the step increment must be != 0
    # to avoid omp segfaults at scheduling time (this affects only certain
    # OpenMP implementations, including Intel's)
    conditionals = FindNodes(Conditional).visit(op._func_table['bf0'].root)
    assert len(conditionals) == 1
    conds = conditionals[0].condition.args
    expected_guarded = tree[:2+blockinner]
    assert len(conds) == len(expected_guarded)
    assert all(i.lhs == j.step for i, j in zip(conds, expected_guarded))

def test_multiple_loop_nests(self):
    """
    Compute a simple stencil S, preceded by an "initialization loop" I and
    followed by a "random loop" R.

        * S is the trivial equation ``u[t+1,x,y,z] = u[t,x,y,z] + 1``;
        * I initializes ``u`` to 0;
        * R adds 2 to another field ``v`` along the ``z`` dimension, but only
          over the planes ``[x=0, y=2]`` and ``[x=0, y=5]``.

    Out of these three loop nests, only S should be "offloaded" to YASK;
    indeed, I is outside the time loop, while R does not loop over space
    dimensions. This test checks that S is the only loop nest "offloaded"
    to YASK, and that the numerical output is correct.
    """
    grid = Grid(shape=(12, 12, 12))
    x, y, z = grid.dimensions
    t = grid.stepping_dim
    u = TimeFunction(name='yu4D', grid=grid, space_order=0)
    v = TimeFunction(name='yv4D', grid=grid, space_order=0)
    v.data[:] = 0.
    eqs = [Eq(u[0, x, y, z], 0),
           Eq(u[1, x, y, z], 0),
           Eq(u.forward, u + 1.),
           Eq(v[t + 1, 0, 2, z], v[t + 1, 0, 2, z] + 2.),
           Eq(v[t + 1, 0, 5, z], v[t + 1, 0, 5, z] + 2.)]
    op = Operator(eqs)
    op(yu4D=u, yv4D=v, time=0)
    assert 'run_solution' in str(op)
    assert len(retrieve_iteration_tree(op)) == 3
    assert np.all(u.data[0] == 0.)
    assert np.all(u.data[1] == 1.)
    assert np.all(v.data[0] == 0.)
    assert np.all(v.data[1, 0, 2] == 2.)
    assert np.all(v.data[1, 0, 5] == 2.)

def __init__(self, iet):
    self.iet = iet
    self.properties = OrderedDict()

    self.trees = retrieve_iteration_tree(iet, mode='superset')
    self.scopes = OrderedDict([(k, Scope([i.expr for i in v]))
                               for k, v in MapNodes().visit(iet).items()])

def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
    """Test construction of ElementalFunctions."""
    exprs = list(as_tuple(exprs))

    grid = Grid(shape=(10, 10))
    t = grid.stepping_dim  # noqa
    x, y = grid.dimensions  # noqa

    u = Function(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid)  # noqa

    # List comprehension would need explicit locals/globals mappings to eval
    for i, e in enumerate(list(exprs)):
        exprs[i] = eval(e)

    op = Operator(exprs)

    # We create one ElementalFunction for each Iteration nest over space dimensions
    efuncs = []
    for n, tree in enumerate(retrieve_iteration_tree(op)):
        root = filter_iterations(tree, key=lambda i: i.dim.is_Space)[0]
        efuncs.append(make_efunc('f%d' % n, root))

    assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

    for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
        # Check the `efunc` parameters
        assert all(i in efunc.parameters for i in (x.symbolic_min, x.symbolic_max))
        assert all(i in efunc.parameters for i in (y.symbolic_min, y.symbolic_max))
        functions = FindSymbols().visit(efunc)
        assert len(functions) == nf
        assert all(i in efunc.parameters for i in functions)
        timeiters = [i for i in FindSymbols('free-symbols').visit(efunc)
                     if isinstance(i, Dimension) and i.is_Time]
        assert len(timeiters) == nt
        assert all(i in efunc.parameters for i in timeiters)
        assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)

        # Check the loop nest structure
        trees = retrieve_iteration_tree(efunc)
        assert len(trees) == 1
        tree = trees[0]
        assert all(i.dim.name == j for i, j in zip(tree, nest))

        assert efunc.make_call()

def fold_blockable_tree(node, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.
    """
    mapper = {}
    for k, v in FindAdjacent(Iteration).visit(node).items():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                folds = [(tuple(y-x for x, y in zip(i.offsets, root.offsets)), i.nodes)
                         for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = Transformer(mapper, nested=True).visit(node)

    return processed

def unfold_blocked_tree(node):
    """
    Unfold nested IterationFolds.

    Examples
    --------
    Given a section of Iteration/Expression tree as below: ::

        for i = 1 to N-1  // folded
          for j = 1 to N-1  // folded
            foo1()

    Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``,
    create: ::

        for i = 1 to N-1
          for j = 1 to N-1
            foo1()
        for i = 2 to N-2
          for j = 2 to N-2
            foo2()
    """
    # Search the unfolding candidates
    candidates = []
    for tree in retrieve_iteration_tree(node):
        handle = tuple(i for i in tree if i.is_IterationFold)
        if handle:
            # Sanity check
            assert IsPerfectIteration().visit(handle[0])
            candidates.append(handle)

    # Perform unfolding
    mapper = {}
    for tree in candidates:
        trees = list(zip(*[i.unfold() for i in tree]))
        trees = optimize_unfolded_tree(trees[:-1], trees[-1])
        mapper[tree[0]] = List(body=trees)

    # Insert the unfolded Iterations in the Iteration/Expression tree
    processed = Transformer(mapper).visit(node)

    return processed

def _hoist_prodders(self, iet):
    """
    Move Prodders within the outer levels of an Iteration tree.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for prodder in FindNodes(Prodder).visit(tree.root):
            if prodder._periodic:
                try:
                    key = lambda i: isinstance(i.dim, BlockDimension)
                    candidate = filter_iterations(tree, key)[-1]
                except IndexError:
                    # Fallback: use the outermost Iteration
                    candidate = tree.root
                mapper[candidate] = candidate._rebuild(
                    nodes=((prodder._rebuild(),) + candidate.nodes))
                mapper[prodder] = None

    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {}

def test_cache_blocking_imperfect_nest(blockinner):
    """
    Test that a non-perfect Iteration nest is blocked correctly.
    """
    grid = Grid(shape=(4, 4, 4), dtype=np.float64)

    u = TimeFunction(name='u', grid=grid, space_order=2)
    v = TimeFunction(name='v', grid=grid, space_order=2)

    eqns = [Eq(u.forward, v.laplace),
            Eq(v.forward, u.forward.dz)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt=('advanced', {'blockinner': blockinner}))

    # First, check the generated code
    bns, _ = assert_blocking(op1, {'x0_blk0'})
    trees = retrieve_iteration_tree(bns['x0_blk0'])
    assert len(trees) == 2
    assert len(trees[0]) == len(trees[1])
    assert all(i is j for i, j in zip(trees[0][:4], trees[1][:4]))
    assert trees[0][4] is not trees[1][4]
    assert trees[0].root.dim.is_Incr
    assert trees[1].root.dim.is_Incr
    assert op1.parameters[7] is trees[0][0].step
    assert op1.parameters[10] is trees[0][1].step

    u.data[:] = 0.2
    v.data[:] = 1.5
    op0(time_M=0)

    u1 = TimeFunction(name='u1', grid=grid, space_order=2)
    v1 = TimeFunction(name='v1', grid=grid, space_order=2)
    u1.data[:] = 0.2
    v1.data[:] = 1.5
    op1(u=u1, v=v1, time_M=0)

    assert np.all(u.data == u1.data)
    assert np.all(v.data == v1.data)

def test_timeparallel_reduction(self):
    grid = Grid(shape=(3, 3, 3))
    i = Dimension(name='i')

    f = Function(name='f', shape=(1,), dimensions=(i,), grid=grid)
    u = TimeFunction(name='u', grid=grid)

    op = Operator(Inc(f[0], u + 1), opt='noop')

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 1
    tree = trees[0]
    assert tree.root.is_Sequential
    assert all(i.is_ParallelRelaxed and not i.is_Parallel for i in tree[1:])

    # The time loop is not in OpenMP canonical form, so it won't be parallelized
    assert not tree.root.pragmas
    assert len(tree[1].pragmas) == 1
    assert tree[1].pragmas[0].value ==\
        ('omp target teams distribute parallel for collapse(3)'
         ' reduction(+:f[0])')

def test_basic(self):
    grid = Grid(shape=(3, 3, 3))

    u = TimeFunction(name='u', grid=grid)

    op = Operator(Eq(u.forward, u + 1))

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 1

    assert trees[0][1].pragmas[0].value ==\
        'omp target teams distribute parallel for collapse(3)'
    assert op.body[2].header[0].value ==\
        ('omp target enter data map(to: u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
    assert str(op.body[2].footer[0]) == ''
    assert op.body[2].footer[1].contents[0].value ==\
        ('omp target update from(u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
    assert op.body[2].footer[1].contents[1].value ==\
        ('omp target exit data map(release: u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]]) if(devicerm)')

def _parallelize_dist(self, iet):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.
    """
    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}
    sync_heb = HaloExchangeBuilder('basic', **generators)
    user_heb = HaloExchangeBuilder(self.params['mpi'], **generators)
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        heb = user_heb if hs.is_Overlappable else sync_heb
        mapper[hs] = heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = sync_heb.objs + user_heb.objs
    iet = Transformer(mapper, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for i in reversed(tree):
            if i in mapper:
                # Already seen this subtree, skip
                break
            if FindNodes(Call).visit(i):
                mapper.update({n: n._rebuild(properties=set(n.properties) - {PARALLEL})
                               for n in tree[:tree.index(i) + 1]})
                break
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}

def iet_build(clusters, dtype):
    """
    Create an Iteration/Expression tree (IET) given an iterable of
    :class:`Cluster`s.

    The nodes in the returned IET are decorated with properties deriving from
    data dependence analysis.
    """
    # Clusters -> Iteration/Expression tree
    iet = iet_make(clusters, dtype)

    # Data dependency analysis. Properties are attached directly to nodes
    iet = iet_analyze(iet)

    # Substitute derived dimensions (e.g., t -> t0, t + 1 -> t1)
    # This is postponed up to this point to ease /iet_analyze/'s life
    subs = {}
    for tree in retrieve_iteration_tree(iet):
        uindices = flatten(i.uindices for i in tree)
        subs.update({i.expr: LoweredDimension(name=i.index.name, origin=i.expr)
                     for i in uindices})
    iet = SubstituteExpression(subs).visit(iet)

    return iet

def make_parallel(self, iet):
    """Transform ``iet`` by introducing shared-memory parallelism."""
    mapper = OrderedDict()
    for tree in retrieve_iteration_tree(iet):
        # Get the first omp-parallelizable Iteration in `tree`
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if not candidates:
            continue
        root = candidates[0]

        # Build the `omp-for` tree
        partree = self._make_parallel_tree(root, candidates)

        # Find out the thread-private and thread-shared variables
        private = [i for i in FindSymbols().visit(partree)
                   if i.is_Array and i._mem_stack]

        # Build the `omp-parallel` region
        private = sorted(set([i.name for i in private]))
        private = ('private(%s)' % ','.join(private)) if private else ''
        partree = Block(header=self.lang['par-region'](self.nthreads.name, private),
                        body=partree)

        # Do not enter the parallel region if the step increment might be 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        if isinstance(root.step, Symbol):
            cond = Conditional(CondEq(root.step, 0), Element(c.Statement('return')))
            partree = List(body=[cond, partree])

        mapper[root] = partree

    iet = Transformer(mapper).visit(iet)

    return iet, {'input': [self.nthreads] if mapper else []}

def test_basic(self):
    grid = Grid(shape=(3, 3, 3))

    u = TimeFunction(name='u', grid=grid)

    op = Operator(Eq(u.forward, u + 1), platform='nvidiaX', language='openacc')

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 1

    assert trees[0][1].pragmas[0].value ==\
        'acc parallel loop collapse(3)'
    assert op.body[1].header[0].value ==\
        ('acc enter data copyin(u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
    assert op.body[1].footer[0].contents[0].value ==\
        ('acc exit data copyout(u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
    assert op.body[1].footer[0].contents[1].value ==\
        ('acc exit data delete(u[0:u_vec->size[0]]'
         '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')

def test_multiple_eqns(self):
    grid = Grid(shape=(3, 3, 3))

    u = TimeFunction(name='u', grid=grid)
    v = TimeFunction(name='v', grid=grid)

    op = Operator([Eq(u.forward, u + v + 1), Eq(v.forward, u + v + 4)],
                  dle=('advanced', {'openmp': True}))

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 1

    assert trees[0][1].pragmas[0].value ==\
        'omp target teams distribute parallel for collapse(3)'
    for i, f in enumerate([u, v]):
        assert op.body[2].header[2 + i].value ==\
            ('omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]]'
             '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
             {'n': f.name})
        assert op.body[2].footer[i].value ==\
            ('omp target exit data map(from: %(n)s[0:%(n)s_vec->size[0]]'
             '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
             {'n': f.name})

def test_multiple_loop_nests(self):
    """
    Compute a simple stencil S, preceded by an "initialization loop" I and
    followed by a "random loop" R.

        * S is the trivial equation ``u[t+1,x,y,z] = u[t,x,y,z] + 1``;
        * I initializes ``u`` to 0;
        * R adds 2 to another field ``v`` along the ``z`` dimension, but only
          over the planes ``[x=0, y=2]`` and ``[x=0, y=5]``.

    Out of these three loop nests, only S should be "offloaded" to YASK;
    indeed, I is outside the time loop, while R does not loop over space
    dimensions. This test checks that S is the only loop nest "offloaded"
    to YASK, and that the numerical output is correct.
    """
    grid = Grid(shape=(12, 12, 12))
    x, y, z = grid.dimensions
    t = grid.stepping_dim
    u = TimeFunction(name='yu4D', grid=grid, space_order=0)
    v = TimeFunction(name='yv4D', grid=grid, space_order=0)
    v.data[:] = 0.
    eqs = [Eq(u.indexed[0, x, y, z], 0),
           Eq(u.indexed[1, x, y, z], 0),
           Eq(u.forward, u + 1.),
           Eq(v.indexed[t + 1, 0, 2, z], v.indexed[t + 1, 0, 2, z] + 2.),
           Eq(v.indexed[t + 1, 0, 5, z], v.indexed[t + 1, 0, 5, z] + 2.)]
    op = Operator(eqs)
    op(yu4D=u, yv4D=v, time=0)
    assert 'run_solution' in str(op)
    assert len(retrieve_iteration_tree(op)) == 3
    assert np.all(u.data[0] == 0.)
    assert np.all(u.data[1] == 1.)
    assert np.all(v.data[0] == 0.)
    assert np.all(v.data[1, 0, 2] == 2.)
    assert np.all(v.data[1, 0, 5] == 2.)

def test_multiple_subnests_v0(self):
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    u = TimeFunction(name='u', grid=grid, space_order=3)

    eqn = Eq(u.forward,
             _R(_R(u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]) * 3. * f +
                _R(u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]) * 3. * f)
             + 1.)
    op = Operator(eqn, opt=('advanced', {'openmp': True,
                                         'cire-mingain': 0,
                                         'par-nested': 0,
                                         'par-collapse-ncores': 1,
                                         'par-dynamic-work': 0}))

    bns, _ = assert_blocking(op, {'x0_blk0'})

    trees = retrieve_iteration_tree(bns['x0_blk0'])
    assert len(trees) == 2
    assert trees[0][0] is trees[1][0]
    assert trees[0][0].pragmas[0].value ==\
        'omp for collapse(2) schedule(dynamic,1)'
    assert trees[0][2].pragmas[0].value == ('omp parallel for collapse(2) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')
    assert trees[1][2].pragmas[0].value == ('omp parallel for collapse(2) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')

def test_cache_blocking_structure_subdims():
    """
    Test that:

        * With local SubDimensions, no blocking is expected.
        * With non-local SubDimensions, blocking is expected.
    """
    grid = Grid(shape=(4, 4, 4))
    x, y, z = grid.dimensions
    xi, yi, zi = grid.interior.dimensions
    t = grid.stepping_dim
    xl = SubDimension.left(name='xl', parent=x, thickness=4)

    f = TimeFunction(name='f', grid=grid)

    assert xl.local

    # Local SubDimension -> no blocking expected
    op = Operator(Eq(f[t + 1, xl, y, z], f[t, xl, y, z] + 1))
    assert len(op._func_table) == 0

    # Non-local SubDimension -> blocking expected
    op = Operator(Eq(f.forward, f + 1, subdomain=grid.interior))
    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 1
    tree = trees[0]
    assert len(tree) == 5
    assert tree[0].dim.is_Incr and tree[0].dim.parent is xi and tree[0].dim.root is x
    assert tree[1].dim.is_Incr and tree[1].dim.parent is yi and tree[1].dim.root is y
    assert tree[2].dim.is_Incr and tree[2].dim.parent is tree[0].dim and\
        tree[2].dim.root is x
    assert tree[3].dim.is_Incr and tree[3].dim.parent is tree[1].dim and\
        tree[3].dim.root is y
    assert not tree[4].dim.is_Incr and tree[4].dim is zi and tree[4].dim.parent is z

def test_multiple_subnests_v1(self):
    """
    Unlike ``test_multiple_subnests_v0``, now we use the ``cire-rotate=True``
    option, which trades some of the inner parallelism for a smaller working set.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    u = TimeFunction(name='u', grid=grid, space_order=3)

    eqn = Eq(u.forward, ((u[t, x, y, z] + u[t, x+1, y+1, z+1])*3*f +
                         (u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3*f + 1))
    op = Operator(eqn, opt=('advanced', {'openmp': True,
                                         'cire-mincost-sops': 1,
                                         'cire-rotate': True,
                                         'par-nested': 0,
                                         'par-collapse-ncores': 1,
                                         'par-dynamic-work': 0}))

    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 2
    assert trees[0][0] is trees[1][0]
    assert trees[0][0].pragmas[0].value ==\
        'omp for collapse(2) schedule(dynamic,1)'
    assert not trees[0][2].pragmas
    assert not trees[0][3].pragmas
    assert trees[0][4].pragmas[0].value == ('omp parallel for collapse(1) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')
    assert not trees[1][2].pragmas
    assert trees[1][3].pragmas[0].value == ('omp parallel for collapse(1) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')

def test_consistency_anti_dependences(self, exprs, directions, expected, visit,
                                      ti0, ti1, ti3, tu, tv, tw):
    """
    Test that anti dependences end up generating multiple loop nests, rather
    than a single loop nest enclosing all of the equations.
    """
    eq1, eq2, eq3 = EVAL(exprs, ti0.base, ti1.base, ti3.base, tu.base, tv.base,
                         tw.base)
    op = Operator([eq1, eq2, eq3], dse='noop', dle='noop')
    trees = retrieve_iteration_tree(op)
    iters = FindNodes(Iteration).visit(op)
    assert len(trees) == len(expected)
    assert len(iters) == len(directions)
    # mapper just makes it quicker to write out the test parametrization
    mapper = {'time': 't'}
    assert ["".join(mapper.get(i.dim.name, i.dim.name) for i in j)
            for j in trees] == expected
    assert "".join(mapper.get(i.dim.name, i.dim.name) for i in iters) == visit
    # mapper just makes it quicker to write out the test parametrization
    mapper = {'+': Forward, '-': Backward, '*': Any}
    assert all(i.direction == mapper[j] for i, j in zip(iters, directions))

def _simdize(self, iet):
    """
    Add pragmas to the Iteration/Expression tree to enforce SIMD
    auto-vectorization by the backend compiler.
    """
    ignore_deps = as_tuple(self._backend_compiler_pragma('ignore-deps'))

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        for i in vector_iterations:
            aligned = [j for j in FindSymbols('symbolics').visit(i)
                       if j.is_DiscreteFunction]
            if aligned:
                simd = Ompizer.lang['simd-for-aligned']
                simd = as_tuple(simd(','.join([j.name for j in aligned]),
                                self.platform.simd_reg_size))
            else:
                simd = as_tuple(Ompizer.lang['simd-for'])
            mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd)

    processed = Transformer(mapper).visit(iet)

    return processed, {}

def test_multiple_subnests(self):
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    u = TimeFunction(name='u', grid=grid)

    eqn = Eq(u.forward, ((u[t, x, y, z] + u[t, x+1, y+1, z+1])*3*f +
                         (u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3*f + 1))
    op = Operator(eqn, dse='aggressive', dle=('advanced', {'openmp': True}))

    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 2
    assert trees[0][0] is trees[1][0]
    assert trees[0][0].pragmas[0].value ==\
        'omp for collapse(1) schedule(dynamic,1)'
    assert trees[0][2].pragmas[0].value == ('omp parallel for collapse(1) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')
    assert trees[1][2].pragmas[0].value == ('omp parallel for collapse(1) '
                                            'schedule(dynamic,1) '
                                            'num_threads(nthreads_nested)')

def test_multiple_subnests(self):
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    u = TimeFunction(name='u', grid=grid)

    eqn = Eq(u.forward, (u[t, x, y, z]*u[t, x+1, y+1, z+1]*3*f +
                         u[t, x+2, y+2, z+2]*u[t, x+3, y+3, z+3]*3*f + 1))
    op = Operator(eqn, dse='aggressive', dle=('advanced', {'openmp': True}))

    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 2
    assert trees[0][0] is trees[1][0]
    assert trees[0][0].pragmas[0].value ==\
        'omp for collapse(1) schedule(static,1)'
    assert trees[0][2].pragmas[0].value ==\
        ('omp parallel for collapse(1) schedule(static,1) num_threads(%d)'
         % nhyperthreads())
    assert trees[1][2].pragmas[0].value ==\
        ('omp parallel for collapse(1) schedule(static,1) num_threads(%d)'
         % nhyperthreads())

def _loop_fission(self, nodes, state):
    """
    Apply loop fission to innermost :class:`Iteration` objects. This pass is
    not applied if the number of statements in an Iteration's body is lower
    than ``self.thresholds['fission']``.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        if len(tree) <= 1:
            # Heuristically avoided
            continue

        candidate = tree[-1]
        expressions = [e for e in candidate.nodes if e.is_Expression]

        if len(expressions) < self.thresholds['max_fission']:
            # Heuristically avoided
            continue
        if len(expressions) != len(candidate.nodes):
            # Dangerous for correctness
            continue

        functions = list(set.union(*[set(e.functions) for e in expressions]))
        wrapped = [e.expr for e in expressions]

        if not functions or not wrapped:
            # Heuristically avoided
            continue

        # Promote temporaries from scalar to tensors
        handle = functions[0]
        dim = handle.indices[-1]
        size = handle.shape[-1]
        if any(dim != i.indices[-1] for i in functions):
            # Dangerous for correctness
            continue

        wrapped = promote_scalar_expressions(wrapped, (size,), (dim,), True)

        assert len(wrapped) == len(expressions)
        rebuilt = [Expression(s, e.dtype) for s, e in zip(wrapped, expressions)]

        # Group statements
        # TODO: Need a heuristic here to maximize reuse
        args_frozen = candidate.args_frozen
        properties = as_tuple(args_frozen['properties']) + (ELEMENTAL,)
        args_frozen['properties'] = properties
        n = self.thresholds['min_fission']
        fissioned = [Iteration(g, **args_frozen) for g in grouper(rebuilt, n)]

        mapper[candidate] = List(body=fissioned)

    processed = Transformer(mapper).visit(nodes)

    return processed, {}

def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not (tree.root.is_Sequential or iet.is_Callable) and not blockalways:
            # Heuristic: avoid polluting the generated code with blocked
            # nests (thus increasing JIT compilation time and affecting
            # readability) if the blockable tree isn't embedded in a
            # sequential loop (e.g., a timestepping loop)
            continue

        # Apply loop blocking to `tree`
        interb = []
        intrab = []
        for i in iterations:
            d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
            block_dims.append(d)
            # Build Iteration over blocks
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())
            interb.append(Iteration([], d, d.symbolic_max, properties=properties))
            # Build Iteration within a block
            intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))

        # Construct the blocked tree
        blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges
        ranges = []
        for i, bi in zip(iterations, interb):
            maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
            ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for bi, (m, M, b) in zip(interb, p):
                dynamic_args_mapper[bi.dim] = (m, M)
                dynamic_args_mapper[bi.dim.step] = (b,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

    iet = Transformer(mapper).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}

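# NOTE (illustration only, not part of the codebase): the "Compute the iteration
# ranges" step above splits each blocked dimension into a "main" region covering
# whole blocks and a "remainder" region for the leftover iterations; one efunc
# call is then emitted per combination of regions. A minimal standalone sketch
# of that arithmetic, with hypothetical numeric bounds:
from itertools import product


def block_ranges(i_min, i_max, step):
    """Split [i_min, i_max] into a whole-blocks range and a remainder range,
    mirroring `maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)`."""
    size = i_max - i_min + 1
    maxb = i_max - (size % step)
    main = (i_min, maxb, step)                   # iterated in blocks of `step`
    remainder = (maxb + 1, i_max, i_max - maxb)  # leftover, one smaller block
    return main, remainder


# E.g., 0..10 with block size 4: main covers 0..7, remainder covers 8..10
ranges_x = block_ranges(0, 10, 4)
ranges_y = block_ranges(0, 10, 4)
# One call per combination of main/remainder regions, as in `for p in product(*ranges)`
for p in product(ranges_x, ranges_y):
    print(p)
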
def _minimize_remainders(self, iet):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.
    """
    # The innermost dimension is the one that might get padded
    p_dim = -1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        if not vector_iterations or len(vector_iterations) > 1:
            continue
        root = vector_iterations[0]

        # Padding
        writes = [i.write for i in FindNodes(Expression).visit(root)
                  if i.write.is_Array]
        padding = []
        for i in writes:
            try:
                simd_items = self.platform.simd_items_per_reg(i.dtype)
            except KeyError:
                return iet, {}
            padding.append(simd_items - i.shape[-1] % simd_items)
        if len(set(padding)) == 1:
            padding = padding[0]
            for i in writes:
                padded = (i._padding[p_dim][0], i._padding[p_dim][1] + padding)
                i.update(padding=i._padding[:p_dim] + (padded,))
        else:
            # Padding must be uniform -- not the case, so giving up
            continue

        # Dynamic trip count adjustment
        endpoint = root.symbolic_max
        if not endpoint.is_Symbol:
            continue
        condition = []
        externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root)
                        if i.is_Tensor)
        for i in root.uindices:
            for j in externals:
                condition.append(root.symbolic_max + padding < j)
        condition = ' && '.join(ccode(i) for i in condition)
        endpoint_padded = endpoint.func('_%s' % endpoint.name)
        init = cgen.Initializer(
            cgen.Value("const int", endpoint_padded),
            cgen.Line('(%s) ? %s : %s' % (condition, ccode(endpoint + padding),
                                          endpoint))
        )

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    processed = Transformer(mapper).visit(iet)

    return processed, {}

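# NOTE (illustration only, not part of the codebase): the padding step above
# enlarges the innermost extent of each written Array so that it becomes a
# multiple of the number of SIMD items per register, which is what lets the
# backend compiler avoid scalar remainder loops. A plain-Python sketch, with a
# hypothetical `simd_items` value (e.g. 8 floats in a 256-bit register):
def innermost_padding(innermost_extent, simd_items=8):
    # Mirrors: padding = simd_items - shape[-1] % simd_items
    return simd_items - innermost_extent % simd_items


assert innermost_padding(13) == 3   # 13 -> padded up to 16
assert innermost_padding(16) == 8   # already a multiple: a full extra register
                                    # is still appended, as in the original code
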
def relax_incr_dimensions(iet, **kwargs):
    """
    Recast Iterations over IncrDimensions as ElementalFunctions; insert
    ElementalCalls to iterate over the "main" and "remainder" regions induced
    by the IncrDimensions.
    """
    sregistry = kwargs['sregistry']

    efuncs = []
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        iterations = [i for i in tree if i.dim.is_Incr]
        if not iterations:
            continue

        root = iterations[0]
        if root in mapper:
            continue

        outer, inner = split(iterations, lambda i: not i.dim.parent.is_Incr)

        # Compute the iteration ranges
        ranges = []
        for i in outer:
            maxb = i.symbolic_max - (i.symbolic_size % i.dim.step)
            ranges.append(((i.symbolic_min, maxb, i.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Remove any offsets
        # E.g., `x = x_m + 2 to x_M - 2` --> `x = x_m to x_M`
        outer = [i._rebuild(limits=(i.dim.root.symbolic_min,
                                    i.dim.root.symbolic_max, i.step))
                 for i in outer]

        # Create the ElementalFunction
        name = sregistry.make_name(prefix="bf")
        body = compose_nodes(outer)
        dynamic_parameters = flatten((i.symbolic_bounds, i.step) for i in outer)
        dynamic_parameters.extend([i.step for i in inner if not is_integer(i.step)])
        efunc = make_efunc(name, body, dynamic_parameters)
        efuncs.append(efunc)

        # Create the ElementalCalls
        calls = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for i, (m, M, b) in zip(outer, p):
                dynamic_args_mapper[i.symbolic_min] = m
                dynamic_args_mapper[i.symbolic_max] = M
                dynamic_args_mapper[i.step] = b
                for j in inner:
                    if j.dim.root is i.dim.root and not is_integer(j.step):
                        value = j.step if b is i.step else b
                        dynamic_args_mapper[j.step] = (value,)
            calls.append(efunc.make_call(dynamic_args_mapper))

        mapper[root] = List(body=calls)

    iet = Transformer(mapper).visit(iet)

    return iet, {'efuncs': efuncs}

def test_create_efuncs_complex(complex_function):
    roots = [i[-1] for i in retrieve_iteration_tree(complex_function)]
    retagged = [j._rebuild(properties=tagger(i)) for i, j in enumerate(roots)]
    mapper = {i: j._rebuild(properties=(j.properties + (ELEMENTAL,)))
              for i, j in zip(roots, retagged)}
    function = Transformer(mapper).visit(complex_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.efuncs)
    output = str(block.ccode)
    # Make output compiler independent
    output = [i for i in output.split('\n')
              if all([j not in i for j in ('#pragma', '/*')])]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  for (int i = 0; i <= 3; i += 1)
  {
    f_0((float *)a,(float *)b,i_size,i,4,0);
    for (int j = 0; j <= 5; j += 1)
    {
      f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0);
    }
    f_2((float *)a,(float *)b,i_size,i,4,0);
  }
}
void f_0(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int sf_M, const int sf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int s = sf_m; s <= sf_M; s += 1)
  {
    b[i] = a[i] + pow(b[i], 2) + 3;
  }
}
void f_1(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i_size, const int j_size, const int k_size,"""
         """ const int i, const int j, const int kf_M, const int kf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = kf_m; k <= kf_M; k += 1)
  {
    a[i] = a[i]*b[i]*c[i][j]*d[i][j][k];
    a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]);
  }
}
void f_2(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int qf_M, const int qf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int q = qf_m; q <= qf_M; q += 1)
  {
    a[i] = 8.0F*a[i] + 6.0F/b[i];
  }
}""")

def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Elemental function arguments
        args = []  # Found so far (scalars, tensors)
        defined_args = {}  # Map of argument values defined by loop bounds

        # Build a new Iteration/Expression tree with free bounds
        free = []
        for i in target:
            name, bounds = i.dim.name, i.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % name, dtype=np.int32)
            defined_args[start.name] = bounds[0]
            defined_args[finish.name] = bounds[1]

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                     for j in range(len(i.uindices))]
            defined_args.update({uf.name: j.start
                                 for uf, j in zip(ufunc, i.uindices)})
            limits = [Scalar(name=start.name, dtype=np.int32),
                      Scalar(name=finish.name, dtype=np.int32), 1]
            uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)]
            free.append(i._rebuild(limits=limits, offsets=None, uindices=uindices))

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict((zip(target, free)))).visit(root)

        # Insert array casts for all non-defined
        f_symbols = FindSymbols('symbolics').visit(free)
        defines = [s.name for s in FindSymbols('defines').visit(free)]
        casts = [ArrayCast(f) for f in f_symbols
                 if f.is_Tensor and f.name not in defines]
        free = (List(body=casts), free)

        for i in derive_parameters(free):
            if i.name in defined_args:
                args.append((defined_args[i.name], i))
            elif i.is_Dimension:
                d = Scalar(name=i.name, dtype=i.dtype)
                args.append((d, d))
            else:
                args.append((i, i))

        call, params = zip(*args)
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(name, Callable(name, free, 'void', flatten(params),
                                            ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}

def _loop_blocking(self, nodes, state):
    """Apply loop blocking to PARALLEL Iteration trees."""
    exclude_innermost = not self.params.get('blockinner', False)
    ignore_heuristic = self.params.get('blockalways', False)

    # Make sure loop blocking will span as many Iterations as possible
    fold = fold_blockable_tree(nodes, exclude_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Is the Iteration tree blockable ?
        iterations = [i for i in tree if i.is_Parallel]
        if exclude_innermost:
            iterations = [i for i in iterations if not i.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not tree.root.is_Sequential and not ignore_heuristic:
            # Heuristic: avoid polluting the generated code with blocked
            # nests (thus increasing JIT compilation time and affecting
            # readability) if the blockable tree isn't embedded in a
            # sequential loop (e.g., a timestepping loop)
            continue

        # Decorate intra-block iterations with an IterationProperty
        TAG = tagger(len(mapper))

        # Build all necessary Iteration objects, individually. These will
        # subsequently be composed to implement loop blocking.
        inter_blocks = []
        intra_blocks = []
        remainders = []
        for i in iterations:
            # Build Iteration over blocks
            name = "%s%d_block" % (i.dim.name, len(mapper))
            dim = blocked.setdefault(i, BlockDimension(i.dim, name=name))
            binnersize = i.symbolic_size + (i.offsets[1] - i.offsets[0])
            bmax = i.dim.symbolic_max - (binnersize % dim.step)
            inter_block = Iteration([], dim, bmax, offsets=i.offsets,
                                    properties=PARALLEL)
            inter_blocks.append(inter_block)

            # Build Iteration within a block
            limits = (dim, dim + dim.step - 1, 1)
            intra_block = i._rebuild([], limits=limits, offsets=(0, 0),
                                     properties=i.properties + (TAG, ELEMENTAL))
            intra_blocks.append(intra_block)

            # Build unitary-increment Iteration over the 'leftover' region.
            # This will be used for remainder loops, executed when any
            # dimension size is not a multiple of the block size.
            remainder = i._rebuild([], limits=[bmax + 1, i.dim.symbolic_max, 1],
                                   offsets=(i.offsets[1], i.offsets[1]))
            remainders.append(remainder)

        # Build blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                     [iterations[-1].nodes])

        # Build remainder Iterations
        remainder_trees = []
        for n in range(len(iterations)):
            for c in combinations([i.dim for i in iterations], n + 1):
                # First all inter-block Iterations
                nodes = [b._rebuild(properties=b.properties + (REMAINDER,))
                         for b, r in zip(inter_blocks, remainders)
                         if r.dim not in c]
                # Then intra-block or remainder, for each dim (in order)
                properties = (REMAINDER, TAG, ELEMENTAL)
                for b, r in zip(intra_blocks, remainders):
                    handle = r if b.dim in c else b
                    nodes.append(handle._rebuild(properties=properties))
                nodes.extend([iterations[-1].nodes])
                remainder_trees.append(compose_nodes(nodes))

        # Will replace with blocked loop tree
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    return processed, {'dimensions': list(blocked.values())}

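# NOTE (illustration only, not part of the codebase): the remainder-tree
# construction above enumerates, for every non-empty subset of blocked
# dimensions, a loop nest that runs the leftover iterations along exactly
# those dimensions. A standalone sketch of that enumeration, with hypothetical
# dimension names:
from itertools import combinations

# With two blocked dimensions, three remainder nests are generated:
# leftover-in-x, leftover-in-y, leftover-in-both.
dims = ['x', 'y']

remainder_nests = []
for n in range(len(dims)):
    for c in combinations(dims, n + 1):
        # For each dimension (in order): remainder region if it is in `c`,
        # otherwise the regular intra-block region -- as in _loop_blocking
        nest = ['remainder' if d in c else 'block' for d in dims]
        remainder_nests.append((c, nest))

print(remainder_nests)
# [(('x',), ['remainder', 'block']),
#  (('y',), ['block', 'remainder']),
#  (('x', 'y'), ['remainder', 'remainder'])]
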
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not blockalways:
            # Heuristically bypass loop blocking if we think `tree`
            # won't be computationally expensive. This will help with code
            # size/readability, JIT time, and auto-tuning time
            if not (tree.root.is_Sequential or iet.is_Callable):
                # E.g., not inside a time-stepping Iteration
                continue
            if any(i.dim.is_Sub and i.dim.local for i in tree):
                # At least an outer Iteration is over a local SubDimension,
                # which suggests the computational cost of this Iteration
                # nest will be negligible w.r.t. the "core" Iteration nest
                # (making use of non-local (Sub)Dimensions only)
                continue
        if not IsPerfectIteration().visit(root):
            # Don't know how to block non-perfect nests
            continue

        # Apply loop blocking to `tree`
        interb = []
        intrab = []
        for i in iterations:
            d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
            block_dims.append(d)
            # Build Iteration over blocks
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())
            interb.append(Iteration([], d, d.symbolic_max, properties=properties))
            # Build Iteration within a block
            intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))

        # Construct the blocked tree
        blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges
        ranges = []
        for i, bi in zip(iterations, interb):
            maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
            ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for bi, (m, M, b) in zip(interb, p):
                dynamic_args_mapper[bi.dim] = (m, M)
                dynamic_args_mapper[bi.dim.step] = (b,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

    iet = Transformer(mapper).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}

def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not (tree.root.is_Sequential or iet.is_Callable) and not blockalways:
            # Heuristic: avoid polluting the generated code with blocked
            # nests (thus increasing JIT compilation time and affecting
            # readability) if the blockable tree isn't embedded in a
            # sequential loop (e.g., a timestepping loop)
            continue

        # Apply loop blocking to `tree`
        interb = []
        intrab = []
        for i in iterations:
            d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
            block_dims.append(d)
            # Build Iteration over blocks
            interb.append(Iteration([], d, d.symbolic_max, properties=PARALLEL))
            # Build Iteration within a block
            intrab.append(i._rebuild([], limits=(d, d + d.step - 1, 1),
                                     offsets=(0, 0)))

        # Construct the blocked tree
        blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges
        ranges = []
        for i, bi in zip(iterations, interb):
            maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
            ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for bi, (m, M, b) in zip(interb, p):
                dynamic_args_mapper[bi.dim] = (m, M)
                dynamic_args_mapper[bi.dim.step] = (b,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

    iet = Transformer(mapper).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}

def make_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.
    """
    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel and i.is_Affine)
        if not self.blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not self.blockalways:
            # Heuristically bypass loop blocking if we think `tree`
            # won't be computationally expensive. This will help with code
            # size/readability, JIT time, and auto-tuning time
            if not (tree.root.is_Sequential or iet.is_Callable):
                # E.g., not inside a time-stepping Iteration
                continue
            if any(i.dim.is_Sub and i.dim.local for i in tree):
                # At least an outer Iteration is over a local SubDimension,
                # which suggests the computational cost of this Iteration
                # nest will be negligible w.r.t. the "core" Iteration nest
                # (making use of non-local (Sub)Dimensions only)
                continue
        if not IsPerfectIteration().visit(root):
            # Don't know how to block non-perfect nests
            continue

        # Apply hierarchical loop blocking to `tree`
        level_0 = []  # Outermost level of blocking
        level_i = [[] for i in range(1, self.nlevels)]  # Inner levels of blocking
        intra = []  # Within the smallest block
        for i in iterations:
            template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())

            # Build Iteration across `level_0` blocks
            d = BlockDimension(i.dim, name=template % 0)
            level_0.append(Iteration([], d, d.symbolic_max, properties=properties))

            # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
            for n, li in enumerate(level_i, 1):
                di = BlockDimension(d, name=template % n)
                li.append(Iteration([], di, limits=(d, d+d.step-1, di.step),
                                    properties=properties))
                d = di

            # Build Iteration within the smallest block
            intra.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
        level_i = flatten(level_i)

        # Track all constructed BlockDimensions
        block_dims.extend(i.dim for i in level_0 + level_i)

        # Construct the blocked tree
        blocked = compose_nodes(level_0 + level_i + intra + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
        dynamic_parameters.extend([li.step for li in level_i])
        efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges
        ranges = []
        for i, l0 in zip(iterations, level_0):
            maxb = i.symbolic_max - (i.symbolic_size % l0.step)
            ranges.append(((i.symbolic_min, maxb, l0.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for l0, (m, M, b) in zip(level_0, p):
                dynamic_args_mapper[l0.dim] = (m, M)
                dynamic_args_mapper[l0.step] = (b,)
                for li in level_i:
                    if li.dim.root is l0.dim.root:
                        value = li.step if b is l0.step else b
                        dynamic_args_mapper[li.step] = (value,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

        # Next blockable nest, use different (unique) variable/function names
        self.nblocked += 1

    iet = Transformer(mapper).visit(iet)

    # Force-unfold if some folded Iterations haven't been blocked in the end
    iet = unfold_blocked_tree(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}

def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Elemental function arguments
        args = []  # Found so far (scalars, tensors)
        maybe_required = set()  # Scalars that *may* have to be passed in
        not_required = set()  # Elemental function locally declared scalars

        # Build a new Iteration/Expression tree with free bounds
        free = []
        for i in target:
            name, bounds = i.dim.name, i.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % name, dtype=np.int32)
            args.extend(zip([ccode(j) for j in bounds], (start, finish)))

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                     for j in range(len(i.uindices))]
            args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)]
            free.append(i._rebuild(limits=limits, offsets=None, uindices=uindices))
            not_required.update({i.dim}, set(j.index for j in i.uindices))

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict((zip(target, free)))).visit(root)
        expressions = FindNodes(Expression).visit(free)
        fsymbols = FindSymbols('symbolics').visit(free)

        # Add all definitely-required arguments
        not_required.update({i.output for i in expressions if i.is_scalar})
        for i in fsymbols:
            if i in not_required:
                continue
            elif i.is_Array:
                args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
            elif i.is_TensorFunction:
                args.append(("%s_vec" % i.name, i))
            elif i.is_Scalar:
                args.append((i.name, i))

        # Add all maybe-required arguments that turn out to be required
        maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
        for i in fsymbols:
            not_required.update({as_symbol(i), i.indexify()})
            for j in i.symbolic_shape:
                maybe_required.update(j.free_symbols)
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                     for i in required])

        call, params = zip(*args)
        handle = flatten([p.rtargs for p in params])
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(name, Callable(name, free, 'void', handle, ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}

def _specialize_iet(self, iet, **kwargs):
    warning("The OPS backend is still work-in-progress")

    # If there is no iteration tree, then there is no loop to be optimized using OPS
    iteration_tree = retrieve_iteration_tree(iet, mode='normal')
    if not len(iteration_tree):
        return iet

    time_upper_bound = iteration_tree[0].dimensions[TimeFunction._time_position]\
        .extreme_max

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in find_affine_trees(iet).items():
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"'))
    )))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue

        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory.
        after_time_loop.extend(create_ops_fetch(f, name_to_ops_dat,
                                                time_upper_bound))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            trees, n, name_to_ops_dat, ops_block, dims[0])

        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

        mapper[trees[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            *after_time_loop, ops_exit]

    return List(body=body)

def _optimize_halospots(self, iet):
    """
    Optimize the HaloSpots in ``iet``.

    * Remove all ``useless`` HaloSpots;
    * Merge all ``hoistable`` HaloSpots with their root HaloSpot, thus removing
      redundant communications and anticipating communications that will be
      required by later Iterations.
    """
    # Drop `useless` HaloSpots
    mapper = {hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
              for hs in FindNodes(HaloSpot).visit(iet)}
    iet = Transformer(mapper, nested=True).visit(iet)

    # Handle `hoistable` HaloSpots
    # First, we merge `hoistable` HaloSpots together, to anticipate communications
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        halo_spots = FindNodes(HaloSpot).visit(tree.root)
        if not halo_spots:
            continue
        root = halo_spots[0]
        if root in mapper:
            continue
        hss = [root.halo_scheme]
        hss.extend([hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]])
        try:
            mapper[root] = root._rebuild(halo_scheme=HaloScheme.union(hss))
        except ValueError:
            # HaloSpots have non-matching `loc_indices` and therefore can't be merged
            warning("Found hoistable HaloSpots with disjoint loc_indices, "
                    "skipping optimization")
            continue
        for hs in halo_spots[1:]:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Then, we make sure the halo exchanges get performed *before*
    # the first distributed Dimension. Again, we do this to anticipate
    # communications, which hopefully has a pay off in performance
    #
    # <Iteration x>                    <HaloSpot(u)>, in y
    #   <HaloSpot(u)>, in y    ---->   <Iteration x>
    #   <Iteration y>                    <Iteration y>
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
        hoistable = [hs for hs in halo_spots if hs.hoistable]
        if not hoistable:
            continue
        elif len(hoistable) > 1:
            # We should never end up here, but for now we can't prove it formally
            warning("Found multiple hoistable HaloSpots, skipping optimization")
            continue
        hs = hoistable.pop()
        if hs in mapper:
            continue
        if i.dim.root in hs.dimensions:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)

            halo_scheme = hs.halo_scheme.project(hs.hoistable)
            mapper[i] = hs._rebuild(halo_scheme=halo_scheme, body=i._rebuild())
    iet = Transformer(mapper, nested=True).visit(iet)

    # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
    # subtrees, to overlap as much computation as possible. The HaloSpot-free
    # Iteration nests must be fully affine, otherwise we wouldn't be able to
    # honour the data dependences along the halo
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # Here, <B> doesn't require any halo exchange, but it might still need the
    # output of <A>; thus, if we do computation/communication overlap over <A>
    # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
    # will have to be split as well. For this, <B> must be affine.
    mapper = {}
    for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for g in v:
            root = None
            for i in g:
                if i.is_HaloSpot:
                    root = i
                    mapper[root] = [root.body]
                elif root and all(j.is_Affine for j in FindNodes(Iteration).visit(i)):
                    mapper[root].append(i)
                    mapper[i] = None
                else:
                    root = None
    mapper = {k: k._rebuild(body=List(body=v)) if v else v
              for k, v in mapper.items()}
    iet = Transformer(mapper).visit(iet)

    return iet, {}

def _loop_blocking(self, nodes, state):
    """
    Apply loop blocking to :class:`Iteration` trees.

    Blocking is applied to parallel iteration trees. Heuristically, innermost
    dimensions are not blocked to maximize the trip count of the SIMD loops.

    Different heuristics may be specified by passing the keywords ``blockshape``
    and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate
    a specific block size for each blocked dimension. For example, for the
    :class:`Iteration` tree: ::

        for i
          for j
            for k
              ...

    one may provide ``blockshape = {i: 4, j: 7}``, in which case the two outer
    loops will be blocked, and the resulting 2-dimensional block will have size
    4x7. The latter may be set to True to also block innermost parallel
    :class:`Iteration` objects.
    """
    exclude_innermost = not self.params.get('blockinner', False)
    ignore_heuristic = self.params.get('blockalways', False)

    # Make sure loop blocking will span as many Iterations as possible
    fold = fold_blockable_tree(nodes, exclude_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Is the Iteration tree blockable ?
        iterations = [i for i in tree if i.is_Parallel]
        if exclude_innermost:
            iterations = [i for i in iterations if not i.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not tree[0].is_Sequential and not ignore_heuristic:
            # Heuristic: avoid polluting the generated code with blocked
            # nests (thus increasing JIT compilation time and affecting
            # readability) if the blockable tree isn't embedded in a
            # sequential loop (e.g., a timestepping loop)
            continue

        # Decorate intra-block iterations with an IterationProperty
        TAG = tagger(len(mapper))

        # Build all necessary Iteration objects, individually. These will
        # subsequently be composed to implement loop blocking.
        inter_blocks = []
        intra_blocks = []
        remainders = []
        for i in iterations:
            name = "%s%d_block" % (i.dim.name, len(mapper))

            # Build Iteration over blocks
            dim = blocked.setdefault(i, Dimension(name=name))
            bsize = dim.symbolic_size
            bstart = i.limits[0]
            binnersize = i.dim.symbolic_extent + (i.offsets[1] - i.offsets[0])
            bfinish = i.dim.symbolic_end - (binnersize % bsize) - 1
            inter_block = Iteration([], dim, [bstart, bfinish, bsize],
                                    offsets=i.offsets, properties=PARALLEL)
            inter_blocks.append(inter_block)

            # Build Iteration within a block
            limits = (dim, dim + bsize - 1, 1)
            intra_block = i._rebuild([], limits=limits, offsets=(0, 0),
                                     properties=i.properties + (TAG, ELEMENTAL))
            intra_blocks.append(intra_block)

            # Build unitary-increment Iteration over the 'leftover' region.
            # This will be used for remainder loops, executed when any
            # dimension size is not a multiple of the block size.
            remainder = i._rebuild([],
                                   limits=[bfinish + 1, i.dim.symbolic_end, 1],
                                   offsets=(i.offsets[1], i.offsets[1]))
            remainders.append(remainder)

        # Build blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                     [iterations[-1].nodes])

        # Build remainder Iterations
        remainder_trees = []
        for n in range(len(iterations)):
            for c in combinations([i.dim for i in iterations], n + 1):
                # First all inter-block Iterations
                nodes = [b._rebuild(properties=b.properties + (REMAINDER,))
                         for b, r in zip(inter_blocks, remainders)
                         if r.dim not in c]
                # Then intra-block or remainder, for each dim (in order)
                properties = (REMAINDER, TAG, ELEMENTAL)
                for b, r in zip(intra_blocks, remainders):
                    handle = r if b.dim in c else b
                    nodes.append(handle._rebuild(properties=properties))
                nodes.extend([iterations[-1].nodes])
                remainder_trees.append(compose_nodes(nodes))

        # Will replace with blocked loop tree
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    # All blocked dimensions
    if not blocked:
        return processed, {}

    # Determine the block shape
    blockshape = self.params.get('blockshape')
    if not blockshape:
        # Use trivial heuristic for a suitable blockshape
        def heuristic(dim_size):
            ths = 8  # FIXME: This really needs to be improved
            return ths if dim_size > ths else 1
        blockshape = {k: heuristic for k in blocked.keys()}
    else:
        try:
            nitems, nrequired = len(blockshape), len(blocked)
            blockshape = {k: v for k, v in zip(blocked, blockshape)}
            if nitems > nrequired:
                dle_warning("Provided 'blockshape' has more entries than "
                            "blocked loops; dropping entries ...")
            if nitems < nrequired:
                dle_warning("Provided 'blockshape' has fewer entries than "
                            "blocked loops; dropping dimensions ...")
        except TypeError:
            blockshape = {list(blocked)[0]: blockshape}
        blockshape.update({k: None for k in blocked.keys()
                           if k not in blockshape})

    # Track any additional arguments required to execute /state.nodes/
    arguments = [BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()]

    return processed, {'arguments': arguments, 'flags': 'blocking'}
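
# A minimal, self-contained sketch of the loop structure assembled above for a
# single blocked dimension: an inter-block loop over block starts, an
# intra-block loop within each block, and a unitary-increment remainder loop
# covering the leftover region when the extent is not a multiple of the block
# size. Plain Python with illustrative names, not the Iteration machinery itself.
def blocked_sum(a, bsize):
    n = len(a)
    nfull = n - n % bsize  # end of the last full block
    acc = 0
    # Inter-block loop (over blocks) + intra-block loop (within a block)
    for bstart in range(0, nfull, bsize):
        for i in range(bstart, bstart + bsize):
            acc += a[i]
    # Remainder loop over the leftover region
    for i in range(nfull, n):
        acc += a[i]
    return acc


assert blocked_sum(list(range(10)), 4) == sum(range(10))
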
def test_multiple_loops(self):
    grid = Grid(shape=(3, 3, 3))

    f = Function(name='f', grid=grid)
    g = Function(name='g', grid=grid)
    u = TimeFunction(name='u', grid=grid, space_order=2)
    v = TimeFunction(name='v', grid=grid, space_order=2)

    eqns = [Eq(f, g * 2),
            Eq(u.forward, u + v * f),
            Eq(v.forward, u.forward.dx + v * f + 4)]

    op = Operator(eqns, opt='noop')

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 3

    # All loop nests must have been parallelized
    assert trees[0][0].pragmas[0].value ==\
        'omp target teams distribute parallel for collapse(3)'
    assert trees[1][1].pragmas[0].value ==\
        'omp target teams distribute parallel for collapse(3)'
    assert trees[2][1].pragmas[0].value ==\
        'omp target teams distribute parallel for collapse(3)'

    # Check `u` and `v`
    for i, f in enumerate([u, v], 1):
        assert op.body[1].header[i].value ==\
            ('omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]]'
             '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
             {'n': f.name})
        assert op.body[1].footer[i+1].contents[0].value ==\
            ('omp target update from(%(n)s[0:%(n)s_vec->size[0]]'
             '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
             {'n': f.name})
        assert op.body[1].footer[i+1].contents[1].value ==\
            ('omp target exit data map(release: %(n)s[0:%(n)s_vec->size[0]]'
             '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
             {'n': f.name})

    # Check `f`
    assert op.body[1].header[0].value ==\
        ('omp target enter data map(to: f[0:f_vec->size[0]]'
         '[0:f_vec->size[1]][0:f_vec->size[2]])')
    assert op.body[1].footer[1].contents[0].value ==\
        ('omp target update from(f[0:f_vec->size[0]]'
         '[0:f_vec->size[1]][0:f_vec->size[2]])')
    assert op.body[1].footer[1].contents[1].value ==\
        ('omp target exit data map(release: f[0:f_vec->size[0]]'
         '[0:f_vec->size[1]][0:f_vec->size[2]])')

    # Check `g` -- note that unlike `f`, this one should be `delete` upon
    # exit, not `from`
    assert op.body[1].header[3].value ==\
        ('omp target enter data map(to: g[0:g_vec->size[0]]'
         '[0:g_vec->size[1]][0:g_vec->size[2]])')
    assert op.body[1].footer[4].value ==\
        ('omp target exit data map(delete: g[0:g_vec->size[0]]'
         '[0:g_vec->size[1]][0:g_vec->size[2]])'
         ' if((g_vec->size[0] != 0) && (g_vec->size[1] != 0)'
         ' && (g_vec->size[2] != 0))')
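
# Illustrative helper (hypothetical, mirroring the string templating used in the
# assertions above): build the expected `enter data` clause for a 3-dimensional
# Function from its name alone. The helper's name and signature are made up for
# this sketch; only the resulting clause format comes from the test.
def enter_data_clause(name, ndims=3):
    sections = ''.join('[0:%(n)s_vec->size[%(d)d]]' % {'n': name, 'd': d}
                       for d in range(ndims))
    return 'omp target enter data map(to: %s%s)' % (name, sections)


assert enter_data_clause('f') == ('omp target enter data map(to: f[0:f_vec->size[0]]'
                                  '[0:f_vec->size[1]][0:f_vec->size[2]])')
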
def create_profile(name, node):
    """
    Create a :class:`Profiler` for the Iteration/Expression tree ``node``.
    The following code sections are profiled: ::

        * The whole ``node``;
        * A sequence of perfectly nested loops that have common :class:`Iteration`
          dimensions, but possibly different extent. For example: ::

            for x = 0 to N
              ..
            for x = 1 to N-1
              ..

          Both Iterations have dimension ``x``, and will be profiled as a single
          section, though their extent is different.
        * Any perfectly nested loops.
    """
    profiler = Profiler(name)

    trees = retrieve_iteration_tree(node)
    if not trees:
        return node, profiler

    adjacents = [flatten(i) for i in FindAdjacentIterations().visit(node).values() if i]

    def are_adjacent(tree, last):
        for i, j in zip(tree, last):
            if i == j:
                continue
            try:
                return any(abs(a.index(j) - a.index(i)) == 1 for a in adjacents)
            except ValueError:
                return False

    # Group Iterations based on timing region
    key = lambda itspace: {i.defines for i in itspace}
    groups = []
    handle = [trees[0]]
    for tree in trees[1:]:
        last = handle[-1]
        if key(tree) == key(last) and are_adjacent(tree, last):
            handle.append(tree)
        else:
            groups.append(tuple(handle))
            handle = [tree]
    groups.append(tuple(handle))

    # Create and track C-level timers
    mapper = OrderedDict()
    for group in groups:
        # We time at the single timestep level
        for i in zip(*group):
            root = i[0]
            remainder = tuple(j for j in i if j is not root)

            if not root.dim.is_Time:
                break
            if root in mapper:
                continue

            # Prepare to transform the Iteration/Expression tree
            body = (root,) + remainder
            lname = 'section_%d' % len(mapper)
            mapper[root] = TimedList(gname=name, lname=lname, body=body)
            mapper.update(OrderedDict([(j, None) for j in remainder]))

            # Estimate computational properties of the profiled section
            expressions = FindNodes(Expression).visit(body)
            ops = estimate_cost([e.expr for e in expressions])
            memory = estimate_memory([e.expr for e in expressions])

            # Keep track of the new profiled section
            profiler.add(lname, group[0], ops, memory)

    # Transform the Iteration/Expression tree introducing the C-level timers
    processed = Transformer(mapper).visit(node)

    return processed, profiler
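
# A minimal, self-contained sketch of the grouping criterion used above: two
# loop nests fall into the same timed section when they define the same set of
# dimensions (adjacency is simplified here to consecutive positions in the
# input list). `group_nests` and the tuple-of-names encoding of a nest are
# hypothetical, for illustration only.
def group_nests(nests):
    groups = []
    handle = [nests[0]]
    for nest in nests[1:]:
        if set(nest) == set(handle[-1]):
            handle.append(nest)
        else:
            groups.append(tuple(handle))
            handle = [nest]
    groups.append(tuple(handle))
    return groups


nests = [('x', 'y'), ('x', 'y'), ('x', 'z')]
assert group_nests(nests) == [(('x', 'y'), ('x', 'y')), (('x', 'z'),)]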