def process(func, state):
    """
    Run ``func`` over every IET stored in ``state._efuncs`` and fold the
    metadata it returns back into ``state``.

    ``func`` is called with one IET and must return a pair ``(iet, metadata)``.
    ``metadata`` is queried for the keys ``'dimensions'``, ``'includes'``,
    ``'efuncs'`` and ``'args'``; missing keys are treated as empty.
    """
    # Build the Call graph with a breadth-first walk rooted at 'root'. Edges
    # go from callee to caller, so that a signature change applied by `func`
    # to an efunc can later be pushed onto the efuncs calling it
    dag = DAG(nodes=['root'])
    pending = ['root']
    while pending:
        caller = pending.pop(0)
        found = FindNodes(Call).visit(state._efuncs[caller])
        for callee in filter_ordered([call.name for call in found]):
            if callee not in state._efuncs:
                # Foreign Call (e.g., an MPI routine): not tracked
                continue
            try:
                dag.add_node(callee)
                pending.append(callee)
            except KeyError:
                # `callee` is already a node of `dag`
                pass
            dag.add_edge(callee, caller)
    assert dag.size == len(state._efuncs)

    # Apply `func` following the topological order of the Call graph
    for name in dag.topological_sort():
        state._efuncs[name], metadata = func(state._efuncs[name])

        # New Dimensions introduced by `func`
        state._dimensions.extend(list(metadata.get('dimensions', [])))

        # New #include's required by `func`, without duplicates
        state._includes.extend(list(metadata.get('includes', [])))
        state._includes = filter_ordered(state._includes)

        # New ElementalFunctions, registered by name
        fresh = OrderedDict()
        for efunc in metadata.get('efuncs', []):
            fresh[efunc.name] = efunc
        state._efuncs.update(fresh)

        # New arguments: both the parameters of the affected efuncs and the
        # arguments at their call sites must grow accordingly
        args = as_tuple(metadata.get('args'))
        if not args:
            continue

        def extend(values):
            # Append only what is missing, since several children may ask
            # for the same extra argument
            return list(values) + [a for a in args if a not in values]

        affected = [name] + dag.all_downstreams(name)
        for n in affected:
            efunc = state._efuncs[n]
            subs = {}
            for call in FindNodes(Call).visit(efunc):
                if call.name in affected:
                    subs[call] = call._rebuild(arguments=extend(call.arguments))
            efunc = Transformer(subs).visit(efunc)
            if efunc.is_Callable:
                efunc = efunc._rebuild(parameters=extend(efunc.parameters))
            state._efuncs[n] = efunc
def create_profile(name, iet):
    """
    Instrument ``iet`` for C-level performance profiling.

    Every Section found in ``iet`` is replaced by a TimedList wrapping the
    Section's body, and a SectionData summary (operation counts, written
    points, memory traffic, iteration shapes) is recorded for it.

    Returns the pair ``(iet, profiler)``, where ``profiler`` is the Profiler
    (built from ``name``) holding the per-Section data.
    """
    sections = FindNodes(Section).visit(iet)

    profiler = Profiler(name)
    for section in sections:
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Operation count, overall and per Section iteration
        total_ops = sum(bundle.ops for bundle in bundles)
        exprs = flatten(bundle.exprs for bundle in bundles)
        iteration_ops = sum(estimate_cost(e.expr) for e in exprs)

        # Memory traffic: gather the entries of all bundles by key, merge
        # those sharing a key, then add up the extents
        by_key = {}
        for bundle in bundles:
            for key, value in bundle.traffic.items():
                by_key.setdefault(key, []).append(value)
        merged = [IntervalGroup.generate('merge', *values)
                  for values in by_key.values()]
        traffic = sum(group.extent for group in merged)

        # Each ExpressionBundle lives in its own iteration space
        itershapes = [bundle.shape for bundle in bundles]

        # Grid points written within `section`: for each bundle, the size of
        # its shape times the number of distinct TimeFunctions it writes
        points = 0
        for bundle in bundles:
            written = {e.write for e in bundle.exprs
                       if e.is_tensor and e.write.is_TimeFunction}
            points = points + reduce(mul, bundle.shape) * len(written)

        data = SectionData(total_ops, iteration_ops, points, traffic, itershapes)
        profiler.add(section, data)

    # Introduce the C-level timers
    timers = {}
    for section in sections:
        timers[section] = TimedList(gname=name, lname=section.name,
                                    body=section.body)
    iet = Transformer(timers).visit(iet)

    return iet, profiler
def test_transformer_add_replace(exprs, block2, block3):
    """
    A Transformer can, in one pass, replace one expression with a comment
    line and prepend a comment line to another expression.
    """
    line1 = '// Replaced expression'
    line2 = '// Adding a simple line'

    substitutions = {
        exprs[0]: Block(c.Line(line1)),
        exprs[1]: Block(c.Line(line2), exprs[1]),
    }
    transformer = Transformer(substitutions)

    for block in [block2, block3]:
        before = str(block.ccode)
        after = str(transformer.visit(block).ccode)

        # At least one line must have been gained overall
        oldnumlines = len(before.split('\n'))
        newnumlines = len(after.split('\n'))
        assert newnumlines >= oldnumlines + 1

        # Both comment lines show up, the replaced expression is gone
        assert line1 in after
        assert line2 in after
        assert "a[i0] = a[i0] + b[i0] + 5.0F;" not in after
def _hoist_halospots(iet):
    """
    Move halo exchanges from inner HaloSpots up to enclosing Iterations
    whenever the flow dependences allow it, then merge any HaloSpots that
    end up directly nested. Returns the transformed IET.
    """
    # One Scope per key returned by `MapNodes`, built once up-front
    scopes = {}
    for key, exprs in MapNodes().visit(iet).items():
        scopes[key] = Scope([e.expr for e in exprs])

    # Analysis.
    # `remaining`: HaloSpot -> the part of its HaloScheme staying in place
    # `hoisted`: Iteration -> the HaloSchemes to be placed around it
    remaining = {}
    hoisted = defaultdict(list)
    grouped = MapNodes(Iteration, HaloSpot, 'groupby').visit(iet)
    for iters, halo_spots in grouped.items():
        for hs in halo_spots:
            remaining[hs] = hs.halo_scheme

            for f in hs.fmapper:
                # Scan the enclosing Iterations outermost-first and stop at
                # the first one where `f` may be hoisted
                for n, iteration in enumerate(iters):
                    inner_dims = set()
                    for inner in iters[n:]:
                        inner_dims = inner_dims.union(inner.dim._defines)
                    d_flow = scopes[iteration].d_flow.project(f)

                    blocked = any((dep.cause & inner_dims) and
                                  not dep.write.is_increment
                                  for dep in d_flow)
                    if blocked:
                        continue

                    remaining[hs] = remaining[hs].drop(f)
                    hoisted[iteration].append(hs.halo_scheme.project(f))
                    break

    # Turn the analysis into substitution rules
    mapper = {}
    for iteration, schemes in hoisted.items():
        mapper[iteration] = HaloSpot(HaloScheme.union(schemes),
                                     iteration._rebuild())
    for hs, scheme in remaining.items():
        if scheme.is_void:
            # Nothing left to exchange here: keep just the body
            mapper[hs] = hs.body
        else:
            mapper[hs] = hs._rebuild(halo_scheme=scheme)

    # Hoist/drop the HaloSpots
    iet = Transformer(mapper, nested=True).visit(iet)

    # Clean up: a HaloSpot whose body is a HaloSpot is fused with it
    fused = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if not hs.body.is_HaloSpot:
            continue
        union = HaloScheme.union([hs.halo_scheme, hs.body.halo_scheme])
        fused[hs] = hs._rebuild(halo_scheme=union, body=hs.body.body)
    iet = Transformer(fused, nested=True).visit(iet)

    return iet
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Adjacent Iterations over the same Dimension are grouped; for each group
    that passes the checks below, the first Iteration at each depth becomes
    an IterationFold carrying the others, which are dropped from the tree.
    With ``blockinner=False`` the innermost level is left unfolded.
    """
    mapper = {}
    for sequence in FindAdjacent(Iteration).visit(iet).values():
        # Split into runs of consecutive Iterations sharing a Dimension;
        # only runs with at least two Iterations are candidates
        groups = []
        for subsequence in sequence:
            for _, same_dim in groupby(subsequence, lambda it: it.dim):
                run = list(same_dim)
                if len(run) >= 2:
                    groups.append(run)

        for group in groups:
            # Pre-condition: they all must be perfect iterations
            if not all(IsPerfectIteration().visit(it) for it in group):
                continue

            # Only retain the leading trees having the same depth as the first
            trees = [retrieve_iteration_tree(it)[0] for it in group]
            same_depth = []
            for tree in trees:
                if len(tree) != len(trees[0]):
                    break
                same_depth.append(tree)
            trees = same_depth
            if not trees:
                continue

            # Pair up, level by level, the Iterations of the (reversed) trees
            # and check foldability of each level
            pairwise_folds = list(zip(*reversed(trees)))
            if not all(is_foldable(level) for level in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if not pairwise_folds:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding
            for level in pairwise_folds:
                head, others = level[0], level[1:]
                folds = []
                for other in others:
                    shift = tuple(y - x
                                  for x, y in zip(other.offsets, head.offsets))
                    folds.append((shift, other.nodes))
                mapper[head] = IterationFold(folds=folds, **head.args)
                for other in others:
                    mapper[other] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
def _specialize_iet(self, iet):
    """
    Transform the Iteration/Expression tree to offload the computation of
    one loop nest onto YASK.

    This involves calling the YASK compiler to generate YASK code, which is
    then called from within the transformed Iteration/Expression tree.

    ``self.context`` and ``self.yk_soln`` are always reset to their null
    counterparts first. If exactly one offloadable tree is found, an offload
    is attempted; if it fails with an ordinary exception, the failure is
    logged and the processing continues without offloading. Finding more
    than one offloadable tree terminates the program via ``exit``.

    Returns the (possibly transformed) IET after ``make_grid_accesses``.
    """
    log("Specializing a Devito Operator for YASK...")

    self.context = YaskNullContext()
    self.yk_soln = YaskNullKernel()

    offloadable = find_offloadable_trees(iet)
    if len(offloadable) == 0:
        log("No offloadable trees found")
    elif len(offloadable) == 1:
        tree, bundle, grid, dtype = offloadable[0]
        self.context = contexts.fetch(grid, dtype)

        # Create a YASK compiler solution for this Operator
        yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

        transform = sympy2yask(self.context, yc_soln)
        try:
            for i in bundle.exprs:
                transform(i.expr)

            funcall = make_sharedptr_funcall(namespace['code-soln-run'], ['time'],
                                             namespace['code-soln-name'])
            funcall = Element(c.Statement(ccode(funcall)))
            iet = Transformer({tree[1]: funcall}).visit(iet)

            # Track /funcall/ as an external function call
            self.func_table[namespace['code-soln-run']] = MetaCall(None, False)

            # JIT-compile the newly-created YASK kernel
            local_grids = [i for i in transform.mapper if i.is_Array]
            self.yk_soln = self.context.make_yk_solution(namespace['jit-yk-soln'],
                                                         yc_soln, local_grids)

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_grids(),
                 yc_soln.get_num_equations()))
        except Exception:
            # Best-effort offloading: any ordinary failure falls back to the
            # non-offloaded tree. `Exception` (rather than a bare `except`)
            # lets KeyboardInterrupt and SystemExit propagate.
            # NOTE(review): state assigned before the failure (e.g.
            # `self.context`) is not rolled back -- confirm this is intended
            log("Unable to offload a candidate tree.")
    else:
        exit("Found more than one offloadable trees in a single Operator")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the differences
    # in storage layout employed by Devito and YASK
    iet = make_grid_accesses(iet)

    # NOTE(review): this message is emitted on the fallback paths too
    log("Specialization successfully performed!")

    return iet
def test_nested_transformer(exprs, iters, block2): """When created with the kwarg ``nested=True``, a Transformer performs nested replacements. This test simultaneously replace an inner expression and an Iteration sorrounding it.""" target_loop = block2.nodes[1] target_expr = target_loop.nodes[0].nodes[0] mapper = {target_loop: iters[3](target_loop.nodes[0]), target_expr: exprs[3]} processed = Transformer(mapper, nested=True).visit(block2) assert printAST(processed) == """<Iteration i::i::(0, 3, 1)::(0, 0)>
def instrument(self, iet):
    """
    Surround every Section in ``iet`` with a Call to ``self._api_resume``
    before it and a Call to ``self._api_pause`` after it, so that data
    collection is resumed and stopped around each Section.

    Returns the transformed IET.
    """
    mapper = {}
    for section in FindNodes(Section).visit(iet):
        resume = Call(self._api_resume)
        pause = Call(self._api_pause)
        mapper[section] = List(body=[resume, section, pause])

    return Transformer(mapper).visit(iet)
def _make_compute(self, hs, key, msgs, callpoke):
    """
    Build the ``compute<key>`` efunc out of the body of the HaloSpot ``hs``,
    with ``callpoke`` placed right before each ExpressionBundle.

    Returns None if the body of ``hs`` is a Call. ``msgs`` is not used here.
    """
    if hs.body.is_Call:
        return None

    mapper = {}
    for bundle in FindNodes(ExpressionBundle).visit(hs.body):
        mapper[bundle] = List(body=[callpoke, bundle])
    body = Transformer(mapper).visit(hs.body)

    return make_efunc('compute%d' % key, body, hs.arguments)
def _make_parallel_tree(self, root, candidates):
    """
    Return a mapper to parallelize the Iterations within ``root``.

    The mapper has a single entry, ``root -> rebuilt root``, where the
    rebuilt Iteration carries the extra pragma produced by
    ``self._pragma_for``. If ``root`` is ParallelAtomic, its increment
    Expressions are additionally wrapped with the ``atomic`` header.
    """
    parallel = self._pragma_for(root, candidates)

    if root.is_ParallelAtomic:
        # Introduce the `omp atomic` pragmas
        subs = {}
        for expr in FindNodes(Expression).visit(root):
            if expr.is_increment:
                subs[expr] = List(header=self.lang['atomic'], body=expr)
        target = Transformer(subs).visit(root)
    else:
        target = root

    # Introduce the `omp for` pragma
    mapper = OrderedDict()
    mapper[root] = target._rebuild(pragmas=root.pragmas + (parallel,))

    return mapper
def _optimize_halo_updates(self, iet, state):
    """
    Drop the HaloSpots of ``iet`` flagged as redundant.

    Returns the pair ``(processed, {})``; ``state`` is not used here.
    """
    redundant = [hs for hs in FindNodes(HaloSpot).visit(iet) if hs.is_Redundant]

    # Mapping a node to None makes the Transformer remove it
    mapper = dict.fromkeys(redundant)

    return Transformer(mapper, nested=True).visit(iet), {}
def instrument(self, iet):
    """
    Instrument ``iet`` for C-level performance profiling.

    A SectionData summary is stored in ``self._sections`` for each Section
    found in ``iet``, and each Section is wrapped within a TimedList bound to
    ``self.timer``. Returns the transformed IET.
    """
    sections = FindNodes(Section).visit(iet)

    for section in sections:
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Operation count, overall and per Section iteration
        total_ops = sum(bundle.ops for bundle in bundles)
        exprs = flatten(bundle.exprs for bundle in bundles)
        iteration_ops = sum(estimate_cost(e.expr) for e in exprs)

        # Memory traffic: gather the entries of all bundles by key
        by_key = {}
        for bundle in bundles:
            for key, value in bundle.traffic.items():
                by_key.setdefault(key, []).append(value)
        traffic = 0
        for values in by_key.values():
            try:
                traffic += IntervalGroup.generate('union', *values).size
            except ValueError:
                # Over different iteration spaces: no union, add them up
                traffic += sum(value.size for value in values)

        # Each ExpressionBundle lives in its own iteration space
        itermaps = [bundle.ispace.dimension_map for bundle in bundles]

        # Grid points written within `section`: for each bundle, its size
        # times the number of distinct TimeFunctions it writes
        points = 0
        for bundle in bundles:
            written = {e.write for e in bundle.exprs
                       if e.is_tensor and e.write.is_TimeFunction}
            points = points + bundle.size * len(written)

        self._sections[section] = SectionData(total_ops, iteration_ops, points,
                                              traffic, itermaps)

    # Introduce the C-level timers
    timers = {}
    for section in sections:
        timers[section] = TimedList(timer=self.timer, lname=section.name,
                                    body=section)

    return Transformer(timers).visit(iet)
def iet_insert_C_decls(iet, func_table):
    """
    Build a new Iteration/Expression tree out of ``iet`` with the necessary
    symbol declarations.

    :param iet: The input Iteration/Expression tree.
    :param func_table: A mapper from callable names to the Callables called
                       from within ``iet``. Entries flagged as local are
                       replaced, in place, by their transformed counterparts.

    Scalars are declared inline; stack objects are declared at the chosen
    Iteration site (or ``iet`` itself); heap objects get declaration,
    allocation and free wrapped around the whole tree.
    """
    # Resolve function calls first: the expressions of a local callee are
    # analyzed as if they were inlined at the call site
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(iet).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = func_table[node.name]
        if func.local:
            scopes.extend(finder.visit(func.root, queue=list(enclosing)).items())

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for node, enclosing in scopes:
        if node.is_scalar:
            # Inline declaration
            mapper[node] = LocalExpression(**node.args)
            continue

        target = node.write
        if target is None or target._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue

        if target._mem_stack:
            # On the stack
            site = filter_iterations(enclosing, key=lambda i: not i.is_Parallel,
                                     stop='asap') or [iet]
            allocator.push_stack(site[-1], target)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(target)

    # Introduce declarations on the stack
    for site, decls in allocator.onstack:
        mapper[site] = tuple(Element(decl) for decl in decls)
    iet = NestedTransformer(mapper).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            root = Transformer(mapper).visit(meta.root)
            func_table[name] = MetaCall(root, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def _parallelize_dist(self, iet):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.

    Returns ``(iet, metadata)``. If ``self.params['mpi']`` is falsy, ``iet``
    is returned untouched along with empty metadata; otherwise the metadata
    carries the required include, the new efuncs and the new arguments.
    """
    mode = self.params['mpi']
    if not mode:
        return iet, {}

    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}
    sync_heb = HaloExchangeBuilder('basic', **generators)
    user_heb = HaloExchangeBuilder(self.params['mpi'], **generators)

    # Overlappable HaloSpots use the user-selected scheme, all others the
    # basic one
    exchanges = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.is_Overlappable:
            exchanges[hs] = user_heb.make(hs)
        else:
            exchanges[hs] = sync_heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = sync_heb.objs + user_heb.objs
    iet = Transformer(exchanges, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    untagged = {}
    for tree in retrieve_iteration_tree(iet):
        # Innermost-first: the first Iteration containing a Call determines
        # the (outer) portion of the tree losing the tag
        for iteration in reversed(tree):
            if iteration in untagged:
                # Already seen this subtree, skip
                break
            if not FindNodes(Call).visit(iteration):
                continue
            for n in tree[:tree.index(iteration) + 1]:
                untagged[n] = n._rebuild(properties=set(n.properties) - {PARALLEL})
            break
    iet = Transformer(untagged, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
def make_parallel(self, iet):
    """
    Transform ``iet`` by decorating its parallel Iterations with suitable
    ``#pragma omp ...`` for thread-level parallelism.

    Returns ``(iet, metadata)``. When at least one Iteration is transformed,
    the result is prefixed by a mock Expression assigning NThreads and the
    metadata exposes it under the ``'input'`` key.
    """
    # Group sequences of loops that should go within the same parallel region
    groups = OrderedDict()
    prev_tagged = False
    for tree in retrieve_iteration_tree(iet):
        # Determine the number of consecutive parallelizable Iterations
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if not candidates:
            prev_tagged = False
            continue

        # Consecutive tagged Iteration go in the same group, that is the
        # group opened by the previous tree
        tagged = any(i.tag is not None for i in tree)
        index = len(groups)
        if tagged and prev_tagged:
            index -= 1
        groups.setdefault(index, OrderedDict())[candidates[0]] = candidates
        prev_tagged = tagged

    mapper = OrderedDict()
    for group in groups.values():
        stack_arrays = []
        for root, candidates in group.items():
            mapper.update(self._make_parallel_tree(root, candidates))

            # Track the thread-private and thread-shared variables
            for symbol in FindSymbols('symbolics').visit(root):
                if symbol.is_Array and symbol._mem_stack:
                    stack_arrays.append(symbol)

        # Build the parallel region
        names = sorted({a.name for a in stack_arrays})
        if names:
            clause = 'private(%s)' % ','.join(names)
        else:
            clause = ''
        rebuilt = [v for k, v in mapper.items() if k in group]
        par_region = Block(header=self.lang['par-region'](clause), body=rebuilt)

        # Every Iteration still in the mapper is either dropped (remainder)
        # or replaced by the parallel region
        for k, v in list(mapper.items()):
            if not isinstance(v, Iteration):
                continue
            if v.is_Remainder:
                mapper[k] = None
            else:
                mapper[k] = par_region

    processed = Transformer(mapper).visit(iet)

    if not mapper:
        return List(body=processed), {}

    # Hack/workaround to the fact that the OpenMP pragmas are not true
    # IET nodes, so the `nthreads` variables won't be detected as a
    # Callable parameter unless inserted in a mock Expression
    nt = NThreads()
    eq = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32), nt))
    return List(body=[eq, processed]), {'input': [nt]}
def process(self, iet):
    """
    Lower the SyncSpots found in ``iet``.

    Each SyncSpot is processed by chaining the callbacks associated with the
    types of its SyncOps, in a fixed order. Initialization and finalization
    code gathered along the way is then placed at the beginning and at the
    end of the body of ``iet``.

    Returns ``(iet, metadata)``; the metadata is empty if there are no
    SyncSpots.
    """
    # The SyncOps are to be processed in the following order
    order = [WaitLock, WithLock, Delete, FetchWait, FetchWaitPrefetch]

    def rank(sync_type):
        return order.index(sync_type)

    def kind(sync_op):
        return type(sync_op)

    callbacks = {
        WaitLock: self._make_waitlock,
        WithLock: self._make_withlock,
        FetchWait: self._make_fetchwait,
        FetchWaitPrefetch: self._make_fetchwaitprefetch,
        Delete: self._make_delete
    }

    sync_spots = FindNodes(SyncSpot).visit(iet)
    if not sync_spots:
        return iet, {}

    Pieces = namedtuple('Pieces', 'init finalize funcs threads')
    pieces = Pieces([], [], [], [])

    subs = {}
    for spot in sync_spots:
        by_type = as_mapper(spot.sync_ops, kind)
        for sync_type in sorted(by_type, key=rank):
            # Each callback receives what the previous one produced
            current = subs.get(spot, spot)
            subs[spot] = callbacks[sync_type](current, by_type[sync_type],
                                              pieces, iet)

    iet = Transformer(subs).visit(iet)

    # Add initialization and finalization code
    init = List(body=pieces.init, footer=c.Line())
    finalize = List(header=c.Line(), body=pieces.finalize)
    iet = iet._rebuild(body=(init,) + iet.body + (finalize,))

    # Only the non-integer thread sizes become arguments
    sizes = [t.size for t in pieces.threads if not is_integer(t.size)]

    return iet, {'efuncs': pieces.funcs,
                 'includes': ['pthread.h'],
                 'args': sizes}
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create IterationFolds from sequences of nested Iterations.

    For each sequence of adjacent Iterations passing the checks below, the
    first Iteration at each depth becomes an IterationFold carrying the
    others, which are dropped from the tree. With ``exclude_innermost=True``
    the innermost level is left unfolded.
    """
    mapper = {}
    for sequences in FindAdjacent(Iteration).visit(node).values():
        for sequence in sequences:
            # Pre-condition: they all must be perfect iterations
            assert len(sequence) > 1
            if not all(IsPerfectIteration().visit(it) for it in sequence):
                continue

            # Only retain the leading trees having the same depth as the first
            trees = [retrieve_iteration_tree(it)[0] for it in sequence]
            same_depth = []
            for tree in trees:
                if len(tree) != len(trees[0]):
                    break
                same_depth.append(tree)
            trees = same_depth
            if not trees:
                continue

            # Pair up, level by level, the Iterations of the (reversed) trees
            # and check foldability of each level
            pairwise_folds = list(zip(*reversed(trees)))
            if not all(is_foldable(level) for level in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding
            for level in pairwise_folds:
                head, others = level[0], level[1:]
                folds = []
                for other in others:
                    shift = tuple(y - x
                                  for x, y in zip(other.offsets, head.offsets))
                    folds.append((shift, other.nodes))
                mapper[head] = IterationFold(folds=folds, **head.args)
                for other in others:
                    mapper[other] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(mapper, nested=True).visit(node)
def _simdize(self, iet):
    """
    Strip the VECTOR property off all vectorizable Iterations in ``iet``.

    No SIMD-ization is performed for devices; the property is dropped so
    that later passes can perform more aggressive transformations.
    Returns ``(iet, {})``.
    """
    vectorizable = [i for i in FindNodes(Iteration).visit(iet)
                    if i.is_Vectorizable]

    mapper = {
        i: i._rebuild(properties=[p for p in i.properties if p is not VECTOR])
        for i in vectorizable
    }

    return Transformer(mapper).visit(iet), {}
def _make_parallel_tree(self, root, candidates):
    """
    Return a mapper to parallelize the Iterations within ``root``.

    The mapper has a single entry, ``root -> rebuilt root``. The rebuilt
    Iteration carries the extra ``for`` pragma and the ``COLLAPSED``
    property, both derived from ``self._ncollapse(root, candidates)``. If
    ``root`` is ParallelAtomic, its increment Expressions are additionally
    wrapped with the ``atomic`` header.
    """
    ncollapse = self._ncollapse(root, candidates)

    # What the rebuilt `root` gains
    changes = {
        'pragmas': root.pragmas + (self.lang['for'](ncollapse),),
        'properties': root.properties + (COLLAPSED(ncollapse),),
    }

    if root.is_ParallelAtomic:
        # Introduce the `omp atomic` pragmas
        subs = {}
        for expr in FindNodes(Expression).visit(root):
            if expr.is_Increment:
                subs[expr] = List(header=self.lang['atomic'], body=expr)
        target = Transformer(subs).visit(root)
    else:
        target = root

    # Introduce the `omp for` pragma
    mapper = OrderedDict()
    mapper[root] = target._rebuild(**changes)

    return mapper
def mpiize(iet, **kwargs):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.

    The keyword arguments ``mode``, ``language`` and ``sregistry`` are
    mandatory. Returns ``(iet, metadata)``, with the metadata carrying the
    required include, the new efuncs and the new arguments.
    """
    mode = kwargs.pop('mode')
    language = kwargs.pop('language')
    sregistry = kwargs.pop('sregistry')

    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}
    sync_heb = HaloExchangeBuilder('basic', language, sregistry, **generators)
    user_heb = HaloExchangeBuilder(mode, language, sregistry, **generators)

    # Overlappable HaloSpots use the user-selected scheme, all others the
    # basic one
    exchanges = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if isinstance(hs, OverlappableHaloSpot):
            exchanges[hs] = user_heb.make(hs)
        else:
            exchanges[hs] = sync_heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = filter_sorted(sync_heb.objs + user_heb.objs)
    iet = Transformer(exchanges, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    untagged = {}
    for tree in retrieve_iteration_tree(iet):
        # Innermost-first: the first Iteration containing a Call determines
        # the (outer) portion of the tree losing the tag
        for iteration in reversed(tree):
            if iteration in untagged:
                # Already seen this subtree, skip
                break
            if not FindNodes(Call).visit(iteration):
                continue
            for n in tree[:tree.index(iteration) + 1]:
                untagged[n] = n._rebuild(properties=set(n.properties) - {PARALLEL})
            break
    iet = Transformer(untagged, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
def _dist_parallelize(self, iet):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.

    Returns ``(iet, metadata)``, with the metadata carrying the required
    include and the call trees produced by the HaloExchangeBuilder.
    """
    halo_spots = FindNodes(HaloSpot).visit(iet)

    # Build send/recv Callables and Calls
    builder = HaloExchangeBuilder(self.params['mpi'])
    call_trees, calls = builder.make(halo_spots)

    # Transform the IET by adding in the `haloupdate` Calls
    transformed = Transformer(calls, nested=True).visit(iet)

    metadata = {'includes': ['mpi.h'], 'call_trees': call_trees}
    return transformed, metadata
def mpi_gpu_direct(iet, **kwargs):
    """
    Modify MPI Callables to enable multiple GPUs performing GPU-Direct
    communication.

    Each IsendCall/IrecvCall in ``iet`` is wrapped within a Block whose
    header is an ``omp target data use_device_ptr(...)`` pragma naming the
    first argument of the call. Returns ``(iet, {})``; ``kwargs`` is unused.
    """
    def wrap(call):
        pragma = c.Pragma('omp target data use_device_ptr(%s)' %
                          call.arguments[0].name)
        return Block(header=pragma, body=call)

    calls = FindNodes((IsendCall, IrecvCall)).visit(iet)
    mapper = {call: wrap(call) for call in calls}

    return Transformer(mapper).visit(iet), {}
def iet_analyze(iet):
    """
    Run the HaloSpot analysis passes over ``iet`` and return a new tree in
    which the analyzed nodes are rebuilt with the found properties appended
    to those they already had.
    """
    # Each pass consumes the outcome of the previous one
    analysis = iet
    for analyze in (mark_halospot_useless,
                    mark_halospot_hoistable,
                    mark_halospot_overlappable):
        analysis = analyze(analysis)

    # Decorate the Iteration/Expression tree with the found properties
    mapper = OrderedDict()
    for node, found in list(analysis.properties.items()):
        args = node.args
        current = args.pop('properties')
        mapper[node] = node._rebuild(
            properties=as_tuple(current) + as_tuple(found), **args
        )

    return Transformer(mapper, nested=True).visit(iet)
def instrument(self, iet, timer):
    """
    Wrap the first time-stepping Iteration found in ``iet`` between the
    ``self._api_resume`` and ``self._api_pause`` statements.

    The calls to Advisor's Collection Control API are only introduced for
    Operators with a time loop; otherwise ``iet`` is returned intact.
    ``timer`` is not used here.
    """
    for iteration in FindNodes(Iteration).visit(iet):
        if not iteration.dim.is_Time:
            continue

        # Only the first time loop gets instrumented
        wrapped = List(header=c.Statement('%s()' % self._api_resume),
                       body=iteration,
                       footer=c.Statement('%s()' % self._api_pause))
        return Transformer({iteration: wrapped}).visit(iet)

    # No time loop found
    return iet
def instrument(self, iet, timer):
    """
    Instrument the given IET for C-level performance profiling.

    The body of each Section is replaced by a TimedList bound to ``timer``
    and labelled with the Section name, which must be one of the fields of
    ``timer``. If there are no Sections, ``iet`` is returned as is.
    """
    sections = FindNodes(Section).visit(iet)
    if not sections:
        return iet

    mapper = {}
    for section in sections:
        label = section.name
        assert label in timer.fields
        timed = TimedList(timer=timer, lname=label, body=section.body)
        mapper[section] = section._rebuild(body=timed)

    return Transformer(mapper, nested=True).visit(iet)
def _generate_mpi(self, iet, **kwargs):
    """
    Replace the HaloSpots of ``iet`` with the calls performing the halo
    updates, and register what such calls require (callables, globals,
    includes) in the Operator.

    ``iet`` is returned untouched if ``configuration['mpi']`` is False.
    ``kwargs`` is not used here.
    """
    if configuration['mpi'] is False:
        return iet

    halo_spots = FindNodes(HaloSpot).visit(iet)

    # For each MPI-distributed TensorFunction, generate all necessary
    # C-level routines to perform a halo update. If a function appears in
    # more than one HaloSpot, the routines of the last occurrence are kept
    routines = OrderedDict()
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            routines[f] = [
                update_halo(f, v.loc_indices),
                sendrecv(f, v.loc_indices),
                copy(f, v.loc_indices),
                copy(f, v.loc_indices, True),
            ]
    callables = flatten(routines.values())

    # One `halo_exchange_<name>` Call per function in each HaloSpot
    calls = {}
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            distributor = f.grid.distributor
            stencil = [int(i) for i in hs.mask[f].values()]
            comm = distributor._C_comm
            nb = f.grid.distributor._C_neighbours.obj
            loc_indices = list(v.loc_indices.values())
            dsizes = [d.symbolic_size for d in f.dimensions]
            parameters = [f] + stencil + [comm, nb] + loc_indices + dsizes
            calls.setdefault(hs, []).append(
                Call('halo_exchange_%s' % f.name, parameters)
            )

    # Sorting is for deterministic code generation. However, in practice,
    # we don't expect `cstructs` to contain more than one element because
    # there should always be one grid per Operator (though we're not really
    # enforcing it)
    functions = flatten(hs.fmapper for hs in halo_spots)
    cstructs = {f.grid.distributor._C_neighbours.cdef for f in functions}
    self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

    self._includes.append('mpi.h')

    metacalls = OrderedDict()
    for callable_ in callables:
        metacalls[callable_.name] = MetaCall(callable_, True)
    self._func_table.update(metacalls)

    # Add in the halo update calls, right before the body of each HaloSpot
    mapper = {}
    for hs, hs_calls in calls.items():
        mapper[hs] = List(body=hs_calls + list(hs.body))
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
def __init__(self, expressions, **kwargs):
    """
    Build the Operator as the parent class does, then rework its body for
    debugging: sequential Iterations get new limits, and each Iteration
    tree gets a print mark before its first non-sequential Iteration.
    """
    super(OperatorDebug, self).__init__(expressions, **kwargs)
    self._includes.append('stdio.h')

    # Minimize the trip count of the sequential loops
    shortened = {}
    for iteration in set(flatten(retrieve_iteration_tree(self.body))):
        if iteration.is_Sequential:
            shortened[iteration] = iteration._rebuild(
                limits=(max(iteration.offsets) + 2)
            )
    self.body = Transformer(shortened).visit(self.body)

    # Mark entry/exit points of each non-sequential Iteration tree in the body
    entries = []
    for tree in retrieve_iteration_tree(self.body):
        parallel = filter_iterations(tree, lambda i: not i.is_Sequential, 'any')
        if parallel:
            entries.append(parallel[0])
    marked = {}
    for n, iteration in enumerate(entries):
        marked[iteration] = List(header=printmark('In nest %d' % n),
                                 body=iteration)
    self.body = Transformer(marked).visit(self.body)
def _process(self, func):
    """
    Apply ``func`` to all tracked IETs, in the topological order of the
    call graph, and track the metadata it returns.

    ``func`` is called with one IET and must return a pair
    ``(iet, metadata)``. ``metadata`` is queried for the keys
    ``'dimensions'``, ``'includes'``, ``'input'`` and ``'efuncs'``.
    """
    graph = self._call_graph

    for name in graph.topological_sort():
        self._efuncs[name], metadata = func(self._efuncs[name])

        # Track any new Dimensions and includes introduced by `func`
        self._dimensions.extend(list(metadata.get('dimensions', [])))
        self._includes.extend(list(metadata.get('includes', [])))

        # If there's a change to the `input` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        _input = as_tuple(metadata.get('input'))
        if _input:
            def extend(values):
                # Append only what is missing, since several children may
                # ask for the same input argument
                return list(values) + [e for e in _input if e not in values]

            affected = [name] + graph.all_downstreams(name)
            for n in affected:
                efunc = self._efuncs[n]
                subs = {}
                for call in FindNodes(Call).visit(efunc):
                    if call.name in affected:
                        subs[call] = call._rebuild(
                            arguments=extend(call.arguments)
                        )
                efunc = Transformer(subs).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extend(efunc.parameters))
                self._efuncs[n] = efunc
            self._input.extend(list(_input))

        for efunc, targets in metadata.get('efuncs', {}).items():
            # Update the efuncs
            if efunc.is_Callable:
                self._efuncs[efunc.name] = efunc

            # Update the call graph; with no explicit targets, the new node
            # is attached to 'main'
            graph.add_node(efunc.name, ignore_existing=True)
            for target in (targets or [None]):
                graph.add_edge(efunc.name, target or 'main', force_add=True)
def test_create_elemental_functions_simple(simple_function):
    """
    Tag the last Iteration of each Iteration tree in ``simple_function`` as
    ELEMENTAL, apply ``transform(..., mode='split')`` and compare the C code
    generated for the result (pragmas and comments filtered out) against the
    expected string.
    """
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    # NOTE(review): the expected code below is compared against text joined
    # with '\n', yet the literals contain no line breaks in this view --
    # confirm the whitespace of these literals against the original file
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec; for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) { f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size); } } } void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i, const int i_size, const int j, const int j_size, const int k_size) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec; for (int k = k_start; k < k_finish; k += 1) { a[i] = a[i] + b[i] + 5.0F; a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k]; } }""")
def make_grid_accesses(node, yk_grid_objs):
    """
    Construct a new Iteration/Expression tree based on ``node``, in which
    all Indexed accesses to YASK-backed functions have been converted into
    YASK grid accesses.

    ``yk_grid_objs`` is indexed by function name and provides the object on
    which each grid access is invoked. ForeignExpressions are left alone.
    """

    def make_grid_gets(expr):
        """Replace, recursively through the indices, the reads in ``expr``."""
        subs = {}
        for indexed in retrieve_indexed(expr):
            function = indexed.base.function
            if not function.from_YASK:
                continue
            indices = [INT(make_grid_gets(index)) for index in indexed.indices]
            args = [ListInitializer(indices)]
            subs[indexed] = make_sharedptr_funcall(namespace['code-grid-get'],
                                                   args,
                                                   yk_grid_objs[function.name])
        return expr.xreplace(subs)

    mapper = {}
    for e in FindNodes(Expression).visit(node):
        if e.is_ForeignExpression:
            continue

        lhs, rhs = e.expr.args

        # RHS translation
        rhs = make_grid_gets(rhs)

        # LHS translation
        if not e.write.from_YASK:
            # Writing to a scalar temporary
            mapper[e] = e._rebuild(expr=e.expr.func(lhs, rhs))
            continue

        indices = [INT(make_grid_gets(index)) for index in lhs.indices]
        args = [rhs, ListInitializer(indices)]
        if e.is_Increment:
            call = namespace['code-grid-add']
        else:
            call = namespace['code-grid-put']
        handle = make_sharedptr_funcall(call, args, yk_grid_objs[e.write.name])
        mapper[e] = ForeignExpression(handle, e.dtype,
                                      is_Increment=e.is_Increment)

    return Transformer(mapper).visit(node)
def _insert_declarations(self, nodes):
    """
    Populate the Operator's body with the necessary variable declarations.

    Scalars are declared inline; stack objects are declared at the chosen
    Iteration site (or ``nodes`` itself); heap objects get declaration,
    allocation and free wrapped around the whole tree. The local entries of
    ``self.func_table`` are replaced by their transformed counterparts.
    Returns the new tree.
    """
    # Resolve function calls first: the expressions of a local callee are
    # analyzed as if they were inlined at the call site
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(nodes).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = self.func_table[node.name]
        if func.local:
            scopes.extend(finder.visit(func.root, queue=list(enclosing)).items())

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for node, enclosing in scopes:
        if node.is_scalar:
            # Inline declaration
            mapper[node] = LocalExpression(**node.args)
            continue

        target = node.write
        if target._mem_external:
            # Nothing to do, variable passed as kernel argument
            continue

        if target._mem_stack:
            # On the stack, as established by the DLE
            site = filter_iterations(enclosing, key=lambda i: not i.is_Parallel,
                                     stop='asap') or [nodes]
            allocator.push_stack(site[-1], target)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(target)

    # Introduce declarations on the stack
    for site, decls in allocator.onstack:
        mapper[site] = tuple(Element(decl) for decl in decls)
    nodes = NestedTransformer(mapper).visit(nodes)
    for name, meta in list(self.func_table.items()):
        if meta.local:
            root = Transformer(mapper).visit(meta.root)
            self.func_table[name] = FunMeta(root, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        nodes = List(header=decls + allocs, body=nodes, footer=frees)

    return nodes