def _make_partree(self, candidates, nthreads=None):
    """
    Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.

    Parameters
    ----------
    candidates : list
        Non-empty sequence of Iterations; the first one becomes the root of
        the parallel tree.
    nthreads : optional
        Number of threads to attach to the parallel loop. When None, it is
        taken from ``self.nthreads`` (all candidates affine) or from
        ``self.nthreads_nonaffine`` (otherwise). It must be None whenever at
        least one candidate is not affine.

    Returns
    -------
    tuple
        ``(root, partree, collapsed)`` where ``collapsed`` is the list made
        of ``partree`` followed by the collapsable Iterations.
    """
    assert candidates
    root = candidates[0]

    # Iterations that can be collapsed together with `root`
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = len(collapsable) + 1

    if not all(it.is_Affine for it in candidates):
        # pragma omp for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine
        chunk_size = Symbol(name='chunk_size')
        body = ParallelIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                 **root.args)

        # The chunk size is computed from the overall number of iterations
        sizes = [root.symbolic_size]
        sizes += [it.symbolic_size for it in collapsable]
        niters = prod(sizes)
        value = INT(Max(niters / (nthreads*self.CHUNKSIZE_NONAFFINE), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
    else:
        # The scheduling policy depends on the amount of work in the loop nest
        total_ops = sum(b.ops for b in FindNodes(ExpressionBundle).visit(root))
        schedule = 'dynamic' if total_ops >= self.DYNAMIC_WORK else 'static'

        if nthreads is not None:
            # pragma omp parallel for ... schedule(..., 1)
            body = ParallelIteration(schedule=schedule, parallel=True,
                                     ncollapse=ncollapse, nthreads=nthreads,
                                     **root.args)
        else:
            # pragma omp for ... schedule(..., 1)
            nthreads = self.nthreads
            body = ParallelIteration(schedule=schedule, ncollapse=ncollapse,
                                     **root.args)
        prefix = []

    # Put everything together into a ParallelTree
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    return root, partree, [partree] + collapsable
def _make_reductions(self, partree):
    """
    Turn the increments found in `partree` into reductions or atomic updates.

    If none of the collapsed Iterations is ParallelAtomic, `partree` is
    returned untouched. Otherwise, a reduction is attached to the root of the
    tree when all collapsed Iterations are affine or when no increment
    target is an Indexed; in the remaining case each increment is rebuilt
    with the ``atomic`` pragma taken from ``self.lang``.
    """
    if not any(it.is_ParallelAtomic for it in partree.collapsed):
        return partree

    # The increments are the expressions inducing a reduction
    increments = []
    for expr in FindNodes(Expression).visit(partree):
        if expr.is_Increment:
            increments.append(expr)
    targets = [expr.output for expr in increments]

    only_affine = all(it.is_Affine for it in partree.collapsed)
    if only_affine or not any(t.is_Indexed for t in targets):
        # Implement reduction
        mapper = {partree.root: partree.root._rebuild(reduction=targets)}
    else:
        # Make sure each increment is atomic
        mapper = {}
        for expr in increments:
            mapper[expr] = expr._rebuild(pragmas=self.lang['atomic'])

    return Transformer(mapper).visit(partree)
def test_codegen_quality0():
    """
    Check the code generated for a linearized second-derivative stencil:
    presence of the `uL0` macro, number and kind of Expressions, and number
    of headers attached to the Operator.
    """
    grid = Grid(shape=(4, 4))
    u = TimeFunction(name='u', grid=grid, space_order=2)

    options = {'linearize': True}
    op = Operator(Eq(u.forward, u.dx2 + 1.), opt=('advanced', options))

    generated = str(op)
    assert 'uL0' in generated

    exprs = FindNodes(Expression).visit(op)
    assert len(exprs) == 6
    # All but the last two Expressions are expected to declare `const long`s
    for expr in exprs[:-2]:
        assert 'const long' in str(expr)

    # Only four access macros necessary, namely `uL0`, `aL0`, `bufL0`, `bufL1`
    # (the other three obviously are _POSIX_C_SOURCE, START_TIMER, STOP_TIMER)
    assert len(op._headers) == 7
def _make_partree(self, candidates, nthreads=None):
    """
    Build a ParallelTree out of the `candidates` Iterations.

    Parameters
    ----------
    candidates : list
        Non-empty sequence of Iterations; the first one becomes the root of
        the parallel tree.
    nthreads : optional
        Number of threads to attach to the parallel loop. When None, it is
        taken from ``self.nthreads`` (all candidates affine) or from
        ``self.nthreads_nonaffine`` (otherwise). It must be None whenever at
        least one candidate is not affine.

    Returns
    -------
    tuple
        ``(root, partree)``.
    """
    assert candidates
    root = candidates[0]

    # Iterations that can be collapsed together with `root`
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = len(collapsable) + 1

    if not all(it.is_Affine for it in candidates):
        # pragma ... for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine
        chunk_size = Symbol(name='chunk_size')
        body = self.HostIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                  **root.args)

        # The chunk size is computed from the overall number of iterations
        sizes = [root.symbolic_size]
        sizes += [it.symbolic_size for it in collapsable]
        niters = prod(sizes)
        value = INT(Max(niters / (nthreads*self.chunk_nonaffine), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
    else:
        # The scheduling policy depends on the amount of work in the loop nest
        total_ops = sum(b.ops for b in FindNodes(ExpressionBundle).visit(root))
        schedule = 'dynamic' if total_ops >= self.dynamic_work else 'static'

        if nthreads is not None:
            # pragma ... parallel for ... schedule(..., 1)
            body = self.HostIteration(schedule=schedule, parallel=True,
                                      ncollapse=ncollapse, nthreads=nthreads,
                                      **root.args)
        else:
            # pragma ... for ... schedule(..., 1)
            nthreads = self.nthreads
            body = self.HostIteration(schedule=schedule, ncollapse=ncollapse,
                                      **root.args)
        prefix = []

    # Put everything together into a ParallelTree
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    return root, partree
def _make_parallel(self, iet):
    """
    Replace the parallelizable Iteration trees in `iet` with parallel regions.

    Returns
    -------
    tuple
        The transformed IET and a dict carrying the new arguments ('args')
        and the required header ('includes').
    """
    parrays = {}
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        # The parallelizable Iterations in `tree`, if any
        candidates = filter_iterations(tree, key=self.key)
        if not candidates:
            continue

        # Outer parallelism
        root, partree, collapsed = self._make_partree(candidates)
        if partree is None or root in mapper:
            continue

        # Nested parallelism, reductions, then single-thread prodders
        partree = self._make_nested_partree(partree)
        partree = self._make_reductions(partree, collapsed)
        partree = self._make_threaded_prodders(partree)

        # Wrap within a parallel region and protect it if necessary
        parregion = self._make_parregion(partree, parrays)
        mapper[root] = self._make_guard(parregion, collapsed)

    iet = Transformer(mapper).visit(iet)

    # The new arguments introduced by this pass
    args = []
    for symbol in FindSymbols().visit(iet):
        if isinstance(symbol, NThreadsMixin):
            args.append(symbol)
    for node in FindNodes(HeapGlobal).visit(iet):
        args.append((node.array, True))
        args.append(node.parray)

    summary = {'args': args, 'includes': [self.lang['header']]}
    return iet, summary
def _make_parallel(self, iet):
    """
    Replace the omp-parallelizable Iteration trees in `iet` with parallel
    regions.

    Returns
    -------
    tuple
        The transformed IET and a dict carrying the new arguments ('args')
        and the required header ('includes', here ``omp.h``).
    """
    parrays = {}
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        # The omp-parallelizable Iterations in `tree`, if any
        candidates = filter_iterations(tree, key=self.key)
        if not candidates:
            continue

        # Outer parallelism
        root, partree, collapsed = self._make_partree(candidates)
        if root in mapper:
            continue

        # Nested parallelism, reductions, then single-thread prodders
        partree = self._make_nested_partree(partree)
        partree = self._make_reductions(partree, collapsed)
        partree = self._make_threaded_prodders(partree)

        # Wrap within a parallel region, declaring private and shared
        # variables, then protect it in case of 0-valued step increments
        parregion = self._make_parregion(partree, parrays)
        mapper[root] = self._make_guard(parregion, collapsed)

    iet = Transformer(mapper).visit(iet)

    # The new arguments introduced by this pass
    args = []
    for symbol in FindSymbols().visit(iet):
        if isinstance(symbol, NThreadsMixin):
            args.append(symbol)
    for node in FindNodes(Dereference).visit(iet):
        args.append((node.array, True))
        args.append(node.parray)

    summary = {'args': args, 'includes': ['omp.h']}
    return iet, summary
def test_tasking_lock_placement(self):
    """
    Check where the wait-lock ends up in the code generated with the
    ('tasking', 'orchestrate') optimizations.
    """
    def descend(node, depth):
        # Follow `.body[0]` `depth` times starting from `node`
        for _ in range(depth):
            node = node.body[0]
        return node

    grid = Grid(shape=(10, 10, 10))

    f = Function(name='f', grid=grid, space_order=2)
    u = TimeFunction(name='u', grid=grid)
    usave = TimeFunction(name='usave', grid=grid, save=10)

    eqns = [
        Eq(f, u + 1),
        Eq(u.forward, f.dx + u + 1),
        Eq(usave, u),
    ]

    op = Operator(eqns, opt=('tasking', 'orchestrate'))

    # Check generated code -- the wait-lock is expected in section1
    assert len(retrieve_iteration_tree(op)) == 5

    locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
    assert len(locks) == 1

    sections = FindNodes(Section).visit(op)
    assert len(sections) == 3
    assert descend(sections[0], 3).is_Iteration
    assert str(descend(sections[1], 4)) == 'while(lock0[t1] == 0);'
def _make_reductions(self, partree, collapsed):
    """
    Turn the increments found in `partree` into reductions or atomic updates.

    If `partree` is not ParallelAtomic, it is returned untouched. Otherwise,
    a reduction is attached to the root of the tree when all `collapsed`
    Iterations are affine or when no increment target is an Indexed; in the
    remaining case each increment is wrapped in a List whose header is the
    ``atomic`` pragma taken from ``self.lang``.
    """
    if not partree.is_ParallelAtomic:
        return partree

    # Collect the expressions inducing reductions (foreign ones are ignored)
    increments = []
    for expr in FindNodes(Expression).visit(partree):
        if expr.is_Increment and not expr.is_ForeignExpression:
            increments.append(expr)
    targets = [expr.output for expr in increments]

    only_affine = all(it.is_Affine for it in collapsed)
    if only_affine or not any(t.is_Indexed for t in targets):
        # Introduce reduction clause
        mapper = {partree.root: partree.root._rebuild(reduction=targets)}
    else:
        # Introduce one `omp atomic` pragma for each increment
        mapper = {}
        for expr in increments:
            mapper[expr] = List(header=self.lang['atomic'], body=expr)

    return Transformer(mapper).visit(partree)
def place_casts(self, iet):
    """
    Create a new IET with the necessary type casts.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.

    Returns
    -------
    tuple
        The rebuilt IET, with the casts prepended to its body, and an empty
        dict.
    """
    # Keep the generated code terse: a non-Array parameter that never shows
    # up in an Expression (i.e., it is merely propagated down to another
    # Call) does not need a cast
    touched = set()
    for expr in FindNodes(Expression).visit(iet):
        touched.update(expr.functions)
    need_cast = {f for f in touched if f.is_Tensor}
    need_cast |= {p for p in iet.parameters if p.is_Array}

    # Casts are emitted following the order of the parameters
    casts = tuple(ArrayCast(p) for p in iet.parameters if p in need_cast)

    return iet._rebuild(body=casts + iet.body), {}
def test_tasking_fused(self):
    """
    Check the code generated with the ('tasking', 'fuse', 'orchestrate')
    optimizations, then run the Operator and check the computed data.
    """
    nt = 10

    bundle0 = Bundle()
    grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

    tmp = Function(name='tmp', grid=grid)
    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    w = TimeFunction(name='w', grid=grid)

    eqns = [
        Eq(w.forward, w + 1),
        Eq(tmp, w.forward),
        Eq(u.forward, tmp, subdomain=bundle0),
        Eq(v.forward, tmp, subdomain=bundle0),
    ]

    op = Operator(eqns, opt=('tasking', 'fuse', 'orchestrate'))

    # Check generated code
    assert len(retrieve_iteration_tree(op)) == 5
    locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
    # Only 1 because it's only `tmp` that needs protection
    assert len(locks) == 1
    assert len(op._func_table) == 2

    efunc = op._func_table['copy_device_to_host0'].root
    exprs = FindNodes(Expression).visit(efunc)
    assert len(exprs) == 20

    prologue = (
        'int id = sdata0->id;',
        'int deviceid = sdata0->deviceid;',
        'const int time = sdata0->time;',
        'lock0[0] = 1;',
    )
    for position, text in enumerate(prologue, start=12):
        assert str(exprs[position]) == text

    assert exprs[16].write is u
    assert exprs[17].write is v

    epilogue = ('lock0[0] = 2;', 'sdata0->flag = 1;')
    for position, text in enumerate(epilogue, start=18):
        assert str(exprs[position]) == text

    op.apply(time_M=nt - 2)

    last = nt - 1
    assert np.all(u.data[last] == 9)
    assert np.all(v.data[last] == 9)
def hoist_prodders(iet):
    """
    Move Prodders within the outer levels of an Iteration tree.

    Each periodic Prodder is removed from its current position and appended
    to the nodes of the innermost Iteration whose dimension is incremental
    with a non-unitary step; if there is no such Iteration, the root of the
    tree is used instead.

    Returns
    -------
    tuple
        The transformed IET and an empty dict.
    """
    def is_strided_incr(iteration):
        return iteration.dim.is_Incr and iteration.dim.step != 1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for prodder in FindNodes(Prodder).visit(tree.root):
            if not prodder._periodic:
                continue

            try:
                target = filter_iterations(tree, is_strided_incr)[-1]
            except IndexError:
                # Fallback: use the outermost Iteration
                target = tree.root

            # NOTE(review): if two distinct Prodders resolve to the same
            # `target`, the later entry replaces the earlier one -- confirm
            # this cannot happen in practice
            hoisted = target.nodes + (prodder._rebuild(),)
            mapper[target] = target._rebuild(nodes=hoisted)
            mapper[prodder] = None

    return Transformer(mapper, nested=True).visit(iet), {}
def test_tti_clusters_to_graph():
    """
    Clusterize the expressions of the 'centered' forward TTI operator and
    check the trace of the main cluster after a 'basic' rewrite.
    """
    solver = tti_operator()

    # Expressions from the elemental functions as well as the operator itself
    roots = solver.op_fwd('centered').elemental_functions
    roots = roots + (solver.op_fwd('centered'),)
    expressions = [node.expr for node in FindNodes(Expression).visit(roots)]

    stencils = solver.op_fwd('centered')._retrieve_stencils(expressions)
    clusters = clusterize(expressions, stencils)
    assert len(clusters) == 3

    main_cluster = clusters[0]
    n_output_tensors = len(main_cluster.trace)

    clusters = rewrite([main_cluster], mode='basic')
    assert len(clusters) == 1

    graph = clusters[0].trace
    tensors = [node for node in graph.values() if node.is_tensor]
    assert len(tensors) == n_output_tensors  # u and v
    for node in graph.values():
        assert node.reads or node.readby
def test_codegen_quality1():
    """
    Check the code generated for a linearized nested-derivative stencil:
    presence of the `uL0` macro, number and kind of Expressions, and number
    of headers attached to the Operator.
    """
    grid = Grid(shape=(4, 4, 4))
    u = TimeFunction(name='u', grid=grid, space_order=2)

    options = {'linearize': True, 'cire-mingain': 0}
    op = Operator(Eq(u.forward, u.dy.dy + 1.), opt=('advanced', options))

    generated = str(op)
    assert 'uL0' in generated

    # 11 expressions in total are expected, 8 of which are for the linearized
    # accesses
    exprs = FindNodes(Expression).visit(op)
    assert len(exprs) == 11
    for expr in exprs[:-3]:
        assert 'const long' in str(expr)
    for expr in exprs[-3:]:
        assert 'const long' not in str(expr)

    # Only two access macros necessary, namely `uL0` and `r1L0` (the other
    # five obviously are _POSIX_C_SOURCE, MIN, MAX, START_TIMER, STOP_TIMER)
    assert len(op._headers) == 7
def test_skewing_codegen(self, expr, expected, skewing, blockinner):
    """Tests code generation on skewed indices."""
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions
    time = grid.time_dim

    # The names bound above and below are visible to the `eval` of `expr`,
    # which is why they must not be renamed or removed
    u = TimeFunction(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid)  # noqa
    eqn = eval(expr)

    # List comprehension would need explicit locals/globals mappings to eval
    options = {'blocklevels': 0, 'skewing': skewing, 'blockinner': blockinner}
    op = Operator(eqn, opt=('blocking', options))
    op.apply(time_M=5)

    iters = FindNodes(Iteration).visit(op)
    assert len(iters) == 4
    for iteration, dim in zip(iters, (time, x, y, z)):
        assert iteration.dim is dim

    skewed = [i.expr for i in FindNodes(Expression).visit(op)]

    # Positions (within `iters`) of the Iterations whose bounds are expected
    # to be shifted by `time`; None means the bounds are not checked at all
    if skewing:
        shifted = (1, 2, 3) if blockinner else (1, 2)
    elif not blockinner:
        shifted = ()
    else:
        shifted = None

    if shifted is not None:
        for position in (1, 2, 3):
            iteration = iters[position]
            lower = iteration.dim.symbolic_min
            upper = iteration.dim.symbolic_max
            if position in shifted:
                lower = lower + time
                upper = upper + time
            assert iteration.symbolic_min == lower
            assert iteration.symbolic_max == upper

    assert str(skewed[0]).replace(' ', '') == expected
def _make_atomic_prodders(self, partree):
    """
    Replace every Prodder found in `partree` with a SingleThreadProdder
    built out of it, and return the transformed tree.
    """
    # Atomic-ize any single-thread Prodders in the parallel tree
    mapper = {}
    for prodder in FindNodes(Prodder).visit(partree):
        mapper[prodder] = SingleThreadProdder(prodder)

    return Transformer(mapper).visit(partree)
def place_definitions(self, iet, **kwargs):
    """
    Create a new IET with symbols allocated/deallocated in some memory space.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    **kwargs
        Only the 'efuncs' entry is read here: an iterable of elemental
        functions whose Expressions are scanned for written/read Functions.
        It defaults to an empty list.

    Returns
    -------
    tuple
        The rebuilt IET and an empty dict.
    """
    storage = Storage()

    # Collect and declare symbols
    for k, v in MapExprStmts().visit(iet).items():
        if k.is_Expression:
            if k.is_definition:
                # Allocate at the last entry of `v`, or at `iet` itself if
                # `v` is empty
                site = v[-1] if v else iet
                self._alloc_scalar_on_low_lat_mem(site, k, storage)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.arguments
        # NOTE(review): if `k` is neither an Expression nor a Call, `objs` is
        # either unbound or left over from the previous iteration --
        # presumably MapExprStmts never yields such nodes; TODO confirm

        for i in objs:
            try:
                if i.is_LocalObject:
                    site = v[-1] if v else iet
                    self._alloc_object_on_low_lat_mem(site, i, storage)
                elif i.is_Array:
                    if i in iet.parameters:
                        # The Array is passed as a Callable argument
                        continue
                    elif i._mem_stack:
                        self._alloc_array_on_low_lat_mem(iet, i, storage)
                    else:
                        self._alloc_array_on_high_bw_mem(i, storage)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Place symbols in a memory space
    if not iet.is_ElementalFunction:
        writes = set()
        reads = set()
        for efunc in kwargs.get('efuncs', []):
            for i in FindNodes(Expression).visit(efunc):
                if i.write.is_Function:
                    writes.add(i.write)
                # A Function that is (also) written never stays in `reads`
                reads = (reads | {r for r in i.reads if r.is_Function}) - writes

        for i in filter_sorted(writes):
            self._map_function_on_high_bw_mem(i, storage)
        for i in filter_sorted(reads):
            self._map_function_on_high_bw_mem(i, storage, read_only=True)

    # Introduce symbol definitions going in the low latency memory
    mapper = dict(storage._on_low_lat_mem)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce symbol definitions going in the high bandwidth memory
    header = []
    footer = []
    for decl, alloc, free in storage._on_high_bw_mem:
        if decl is None:
            header.append(alloc)
        else:
            header.extend([decl, alloc])
        footer.append(free)
    if header or footer:
        # Allocations go before the original body, deallocations after it
        body = List(header=header, body=iet.body, footer=footer)
        iet = iet._rebuild(body=body)

    return iet, {}
def _make_threaded_prodders(self, partree):
    """
    Replace every Prodder found in `partree` with the object obtained by
    calling ``self.Prodder`` on it, and return the transformed tree.
    """
    mapper = {}
    for prodder in FindNodes(Prodder).visit(partree):
        mapper[prodder] = self.Prodder(prodder)

    return Transformer(mapper).visit(partree)
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Parameters
    ----------
    unfolded : iterable
        The folded trees; each one must have the same length as `root`.
    root : sequence
        The Iterations of the root tree.

    Returns
    -------
    list
        The processed folded trees followed by the (possibly rewritten) root.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only iff:
        # test0 := they compute temporary arrays, but not if they compute input data
        # test1 := the outer Iterations have actually been blocked
        exprs = FindNodes(Expression).visit(tree)
        writes = [j.write for j in exprs if j.is_tensor]
        test0 = not all(j.is_Array for j in writes)
        test1 = any(not isinstance(j.limits[0], BlockDimension) for j in root)
        if test0 or test1:
            # Not optimizable: just compose the nodes as they are
            # NOTE(review): `root` is replaced by the composed nodes here and
            # is used again by the next iteration, if any -- confirm this is
            # intended when `unfolded` has more than one tree
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        modified_tree = []
        modified_root = []
        modified_dims = {}
        mapper = {}
        for t, r in zip(tree, root):
            # The folded Iteration is rebased to start from 0; `udim0` carries
            # the original lower bound
            udim0 = IncrDimension(t.dim, t.symbolic_min, 1, "%ss%d" % (t.index, i))
            modified_tree.append(t._rebuild(limits=(0, t.limits[1] - t.limits[0], t.step),
                                            uindices=t.uindices + (udim0,)))

            mapper[t.dim] = udim0

            # The root Iteration gets an extra index with the same name as
            # `udim0`, but starting from 0
            udim1 = IncrDimension(t.dim, 0, 1, "%ss%d" % (t.index, i))
            modified_root.append(r._rebuild(uindices=r.uindices + (udim1,)))

            d = r.limits[0]
            assert isinstance(d, BlockDimension)
            modified_dims[d.root] = d

        # Temporary arrays can now be moved onto the stack
        for w in writes:
            dims = tuple(modified_dims.get(d, d) for d in w.dimensions)
            shape = tuple(d.symbolic_size for d in dims)
            w.update(shape=shape, dimensions=dims, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        # The lambda's `i` is its own parameter, not the enumerate index above
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    lambda i: i.function not in writes, True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.

    Parameters
    ----------
    iet
        The input Iteration/Expression tree.
    cache
        Mapping from functions to entries exposing `stmts0`, `stmts1` and
        `cbk`; it is updated in place, so that information can be reused
        when processing other IETs.
    sregistry
        Provider of unique names through `make_name(prefix=...)`.

    Returns
    -------
    tuple
        The transformed IET and the list of headers, each one a pair
        ``(macro definition, macro expansion)`` of strings produced by
        `ccode`.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            # Fall back to the symbolic shape, which is expected to be
            # available for Arrays
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        # The same size expression is shared by all functions in `v`
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            # Product of the size symbols from position `n` onwards
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    # Functions without a fresh entry still need to go through the next step
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # Perhaps we've already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            # `pname` is bound as a default argument so that each callback
            # keeps its own macro name
            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                # No access macro for this function, leave the Indexed as is
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    iet = Transformer(mapper).visit(iet)
    # The size and slice expressions are placed ahead of the existing body
    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers
def linearize_accesses(iet, key, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.

    Parameters
    ----------
    iet
        The input Iteration/Expression tree.
    key : callable
        Predicate telling whether a Function is a candidate for linearization.
    cache
        Mapping from functions to entries exposing `stmts0`, `stmts1` and
        `cbk`; it is updated in place, so that information can be reused
        when processing other IETs.
    sregistry
        Provider of unique names through `make_name(prefix=...)`.

    Returns
    -------
    tuple
        ``(iet, headers, args)``: the transformed IET, the headers generated
        for the new access macros, and the symbols written by the stride
        expressions of those Functions that cannot be linearized within
        `iet` itself.
    """
    # `functions` are all Functions that `iet` may need to linearize
    functions = [f for f in FindSymbols().visit(iet) if key(f) and f.ndim > 1]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # `functions_unseen` are all Functions that `iet` may need to linearize
    # and have not been seen while processing other IETs
    functions_unseen = [f for f in functions if f not in cache]

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # For all unseen Functions, build the size exprs. For example:
    # `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(dict)
    for (d, halo, _), v in mapper.items():
        v_unseen = [f for f in v if f in functions_unseen]
        if not v_unseen:
            continue
        expr = _generate_fsz(v_unseen[0], d, sregistry)
        if expr:
            # The same size expression is shared by all functions in `v_unseen`
            for f in v_unseen:
                imapper[f][d] = expr.write
                cache[f].stmts0.append(expr)

    # For all unseen Functions, build the stride exprs. For example:
    # `y_stride0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(dict)
    for f, v in imapper.items():
        for d in v:
            n = f.dimensions.index(d)
            expr = prod(v[i] for i in f.dimensions[n:])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_stride' % d.name)
                s = Symbol(name=name, dtype=np.int64, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f][d] = stmt.write
            cache[f].stmts1.append(stmt)
    mapper.update([(f, {}) for f in functions_unseen if f not in mapper])

    # For all unseen Functions, build defines. For example:
    # `#define uL(t, x, y, z) u[(t)*t_stride0 + (x)*x_stride0 + (y)*y_stride0 + (z)]`
    headers = []
    findexeds = {}
    for f in functions:
        if cache[f].cbk is None:
            header, cbk = _generate_macro(f, mapper[f], sregistry)
            headers.append(header)
            cache[f].cbk = findexeds[f] = cbk
        else:
            findexeds[f] = cache[f].cbk

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    indexeds = FindSymbols('indexeds').visit(iet)
    for i in indexeds:
        try:
            mapper[i] = findexeds[i.function](i)
        except KeyError:
            # No access macro for this function, leave the Indexed as is
            pass

    # Introduce the linearized expressions
    iet = Uxreplace(mapper).visit(iet)

    # All Functions that actually require linearization in `iet`
    candidates = []

    candidates.extend(filter_ordered(i.function for i in indexeds))

    calls = FindNodes(Call).visit(iet)
    cfuncs = filter_ordered(flatten(i.functions for i in calls))
    candidates.extend(i for i in cfuncs if i.function.is_DiscreteFunction)

    # All Functions that can be linearized in `iet`
    defines = FindSymbols('defines-aliases').visit(iet)

    # Place the linearization expressions or delegate to ancestor efunc
    stmts0 = []
    stmts1 = []
    args = []
    for f in candidates:
        if f in defines:
            stmts0.extend(cache[f].stmts0)
            stmts1.extend(cache[f].stmts1)
        else:
            args.extend([e.write for e in cache[f].stmts1])
    if stmts0:
        assert len(stmts1) > 0
        stmts0 = filter_ordered(stmts0) + [BlankLine]
        stmts1 = filter_ordered(stmts1) + [BlankLine]
        body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
        iet = iet._rebuild(body=body)
    else:
        # Stride expressions are only ever created out of size expressions,
        # so without the latter there must be none of the former. (Checking
        # `stmts0` here would be vacuous, as this branch is only taken when
        # `stmts0` is empty)
        assert len(stmts1) == 0

    return iet, headers, args
def make_simd(self, iet):
    """
    Attach a SIMD pragma, along with the VECTORIZED property, to the
    innermost vectorizable Iteration of each Iteration tree in `iet`.

    Returns
    -------
    tuple
        The transformed IET and an empty dict.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        relaxed = [it for it in tree if it.is_ParallelRelaxed]

        # As long as there's an outer level of parallelism, the innermost
        # PARALLEL Iteration gets vectorized
        if len(relaxed) < 2:
            continue
        *outer, innermost = relaxed

        # Only fully-parallel Iterations will be SIMD-ized (ParallelRelaxed
        # might not be enough then)
        if not innermost.is_Parallel:
            continue

        # This check catches cases where an iteration appears as the
        # vectorizable candidate in tree A but has actually less priority
        # over a candidate in another tree B.
        #
        # Example:
        #
        # for (i = ... ) (End of tree A - i is the candidate for tree A)
        #   Expr1
        #   for (j = ...) (End of tree B - j is the candidate for tree B)
        #     Expr2
        #     ...
        if not IsPerfectIteration(depth=outer[-1]).visit(innermost):
            continue

        # If it's an array reduction, we need to be sure the backend compiler
        # actually supports it. For example, it may be possible to
        #
        # #pragma parallel reduction(a[...])
        # for (i = ...)
        #   #pragma simd
        #   for (j = ...)
        #     a[j] += ...
        #
        # While the following could be unsupported
        #
        # #pragma parallel  // compiler doesn't support array reduction
        # for (i = ...)
        #   #pragma simd
        #   for (j = ...)
        #     #pragma atomic  // cannot nest simd and atomic
        #     a[j] += ...
        if any(it.is_ParallelAtomic for it in outer) and \
                not self._support_array_reduction(self.compiler):
            increments = [e.output
                          for e in FindNodes(Expression).visit(innermost)
                          if e.is_Increment]
            if any(target.is_Indexed for target in increments):
                continue

        # Add SIMD pragma
        indexeds = FindSymbols('indexeds').visit(innermost)
        aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction}
        if not aligned:
            simd = as_tuple(self.lang['simd-for'])
        else:
            make_pragma = self.lang['simd-for-aligned']
            simd = as_tuple(make_pragma(','.join(sorted(aligned)),
                                        self.simd_reg_size))

        # Rebuild with the new pragma and the VECTORIZED property
        mapper[innermost] = innermost._rebuild(
            pragmas=innermost.pragmas + simd,
            properties=list(innermost.properties) + [VECTORIZED]
        )

    return Transformer(mapper).visit(iet), {}