# NOTE: indicative imports, assuming Devito's internal layout; the exact
# module paths have shifted across versions
from collections import defaultdict

from anytree import LevelOrderIter, findall
from sympy import S

from devito.ir.iet import (Expression, HaloSpot, Iteration, List, FindAdjacent,
                           FindNodes, MapNodes, Transformer,
                           retrieve_iteration_tree)
from devito.ir.stree.tree import NodeHalo, insert
from devito.ir.support import Scope
from devito.logger import warning
from devito.mpi import HaloScheme, HaloSchemeException
from devito.parameters import configuration
from devito.tools import flatten


def st_make_halo(stree):
    """
    Add :class:`NodeHalo`s to a :class:`ScheduleTree`. A NodeHalo captures
    the halo exchanges that should take place before executing the sub-tree;
    these are described by means of a :class:`HaloScheme`.
    """
    # Build a HaloScheme for each expression bundle
    halo_schemes = {}
    for n in findall(stree, lambda i: i.is_Exprs):
        try:
            halo_schemes[n] = HaloScheme(n.exprs, n.ispace, n.dspace)
        except HaloSchemeException as e:
            if configuration['mpi']:
                raise RuntimeError(str(e))

    # Insert the HaloScheme at a suitable level in the ScheduleTree
    mapper = {}
    for k, hs in halo_schemes.items():
        for f, v in hs.fmapper.items():
            spot = k
            ancestors = [n for n in k.ancestors if n.is_Iteration]
            for n in ancestors:
                test0 = any(n.dim is i.dim for i in v.halos)
                test1 = n.dim not in [i.root for i in v.loc_indices]
                if test0 or test1:
                    spot = n
                    break
            mapper.setdefault(spot, []).append((f, v))
    for spot, entries in mapper.items():
        insert(NodeHalo(HaloScheme(fmapper=dict(entries))), spot.parent, [spot])

    return stree
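
# A minimal, self-contained sketch (plain dicts and strings; not Devito's API)
# of the per-spot bucketing performed above: entries resolving to the same
# insertion point are grouped, so that a single NodeHalo carrying one fused
# HaloScheme is inserted per spot. The `_demo_*` name is hypothetical.
def _demo_spot_bucketing():
    entries = [('iter_x', ('u', 'hs_u')), ('iter_x', ('v', 'hs_v')),
               ('iter_y', ('w', 'hs_w'))]
    mapper = {}
    for spot, entry in entries:
        mapper.setdefault(spot, []).append(entry)
    # One insertion per spot, covering all of its Functions at once
    assert mapper == {'iter_x': [('u', 'hs_u'), ('v', 'hs_v')],
                      'iter_y': [('w', 'hs_w')]}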

def _hoist_halospots(iet):
    """
    Hoist HaloSpots from inner to outer Iterations where all data dependencies
    would be honored.
    """
    # Precompute scopes to save time
    scopes = {i: Scope([e.expr for e in v])
              for i, v in MapNodes().visit(iet).items()}

    # Analysis
    hsmapper = {}
    imapper = defaultdict(list)
    for iters, halo_spots in MapNodes(Iteration, HaloSpot, 'groupby').visit(iet).items():
        for hs in halo_spots:
            hsmapper[hs] = hs.halo_scheme
            for f in hs.fmapper:
                for n, i in enumerate(iters):
                    maybe_hoistable = set().union(*[j.dim._defines for j in iters[n:]])
                    d_flow = scopes[i].d_flow.project(f)
                    if all(not (dep.cause & maybe_hoistable) or dep.write.is_increment
                           for dep in d_flow):
                        hsmapper[hs] = hsmapper[hs].drop(f)
                        imapper[i].append(hs.halo_scheme.project(f))
                        break

    # Post-process analysis
    mapper = {i: HaloSpot(HaloScheme.union(hss), i._rebuild())
              for i, hss in imapper.items()}
    mapper.update({i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
                   for i, hs in hsmapper.items()})

    # Transform the IET hoisting/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    # Clean up: de-nest HaloSpots if necessary
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.body.is_HaloSpot:
            halo_scheme = HaloScheme.union([hs.halo_scheme, hs.body.halo_scheme])
            mapper[hs] = hs._rebuild(halo_scheme=halo_scheme, body=hs.body.body)
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
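
# A hedged toy check (mock namedtuples; Devito's real flow dependences carry
# `cause` and `write` attributes) of the hoisting condition used above: a
# dependence only blocks hoisting if it is carried by one of the hoisted-over
# Dimensions *and* is not an increment. The `_demo_*` name is hypothetical.
def _demo_hoisting_test():
    from collections import namedtuple
    Write = namedtuple('Write', 'is_increment')
    Dep = namedtuple('Dep', 'cause write')

    deps = [Dep(cause={'x'}, write=Write(is_increment=True)),      # reduction over x
            Dep(cause={'time'}, write=Write(is_increment=False))]  # time-carried

    maybe_hoistable = {'x', 'y'}  # Dimensions of the Iterations hoisted over
    assert all(not (dep.cause & maybe_hoistable) or dep.write.is_increment
               for dep in deps)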

def stree_make_halo(stree):
    """
    Add NodeHalos to a ScheduleTree. A NodeHalo captures the halo exchanges
    that should take place before executing the sub-tree; these are described
    by means of a HaloScheme.
    """
    # Build a HaloScheme for each expression bundle
    halo_schemes = {}
    for n in findall(stree, lambda i: i.is_Exprs):
        try:
            halo_schemes[n] = HaloScheme(n.exprs, n.ispace)
        except HaloSchemeException as e:
            if configuration['mpi']:
                raise RuntimeError(str(e))

    # Split a HaloScheme based on where it should be inserted
    # For example, it's possible that, for a given HaloScheme, a Function's
    # halo needs to be exchanged at a certain `stree` depth, while another
    # Function's halo needs to be exchanged before some other nodes
    mapper = {}
    for k, hs in halo_schemes.items():
        for f, v in hs.fmapper.items():
            spot = k
            ancestors = [n for n in k.ancestors if n.is_Iteration]
            for n in ancestors:
                # Place the halo exchange right before the first
                # distributed Dimension which requires it
                if any(i.dim in n.dim._defines for i in v.halos):
                    spot = n
                    break
            mapper.setdefault(spot, []).append(hs.project(f))

    # Now fuse the HaloSchemes at the same `stree` depth and perform the insertion
    for spot, halo_schemes in mapper.items():
        insert(NodeHalo(HaloScheme.union(halo_schemes)), spot.parent, [spot])

    return stree
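
# A minimal, self-contained sketch (mock classes; not Devito's API) of the
# ancestor walk above. Note that, like anytree's `ancestors`, the chain is
# ordered root-first, so the *outermost* Iteration along a halo Dimension is
# selected. The `_demo_*` name is hypothetical.
def _demo_ancestor_walk():
    class MockNode:
        def __init__(self, dim=None, parent=None):
            self.dim = dim
            self.parent = parent
            self.is_Iteration = dim is not None

        @property
        def ancestors(self):
            # Root-first chain of ancestors, as in anytree
            chain, n = [], self.parent
            while n is not None:
                chain.append(n)
                n = n.parent
            return tuple(reversed(chain))

    time = MockNode(dim='time')
    x = MockNode(dim='x', parent=time)
    y = MockNode(dim='y', parent=x)
    exprs = MockNode(parent=y)  # the leaf holding the expression bundle

    halo_dims = {'x'}  # Dimensions along which the halo must be exchanged
    spot = exprs
    for n in [i for i in exprs.ancestors if i.is_Iteration]:
        if n.dim in halo_dims:  # stand-in for `i.dim in n.dim._defines`
            spot = n
            break
    assert spot is x  # the exchange lands right before the x Iteration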

def _merge_halospots(iet):
    """
    Merge HaloSpots on the same Iteration tree level where all data dependencies
    would be honored.
    """
    # Analysis
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot, 'immediate').visit(iet).items():
        if i is None or len(halo_spots) <= 1:
            continue

        scope = Scope([e.expr for e in FindNodes(Expression).visit(i)])

        hs0 = halo_spots[0]
        mapper[hs0] = hs0.halo_scheme

        for hs in halo_spots[1:]:
            mapper[hs] = hs.halo_scheme
            for f in hs.fmapper:
                test = True
                for dep in scope.d_flow.project(f):
                    if not (dep.cause & set(hs.dimensions)):
                        continue
                    if dep.is_regular and all(not any(dep.read.touched_halo(c.root))
                                              for c in dep.cause):
                        continue
                    test = False
                    break
                if test:
                    mapper[hs0] = HaloScheme.union([mapper[hs0],
                                                    hs.halo_scheme.project(f)])
                    mapper[hs] = mapper[hs].drop(f)

    # Post-process analysis
    mapper = {i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
              for i, hs in mapper.items()}

    # Transform the IET merging/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
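
# A simplified sketch (plain dicts as stand-ins for HaloSchemes) of the
# bookkeeping above: when the dependence test passes for a Function `f`,
# `f`'s halo migrates from the later HaloSpot's scheme into the first one's,
# so a single, fused exchange is ultimately emitted. `_demo_*` is hypothetical.
def _demo_merge_bookkeeping():
    hs0_scheme = {'u': 'halo(u)'}
    hs_scheme = {'v': 'halo(v)', 'w': 'halo(w)'}

    mergeable = {'v'}  # Functions for which all flow dependences passed the test
    for f in list(hs_scheme):
        if f in mergeable:
            hs0_scheme[f] = hs_scheme.pop(f)  # union into hs0, drop from hs

    assert hs0_scheme == {'u': 'halo(u)', 'v': 'halo(v)'}
    assert hs_scheme == {'w': 'halo(w)'}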

def st_make_halo(stree):
    """
    Add :class:`NodeHalo` to a :class:`ScheduleTree`. A halo node describes
    what halo exchanges should take place before executing the sub-tree.
    """
    if not configuration['mpi']:
        # TODO: This will be dropped as soon as stronger analysis has been
        # implemented
        return stree

    processed = {}
    for n in LevelOrderIter(stree, stop=lambda i: i.parent in processed):
        if not n.is_Iteration:
            continue
        exprs = flatten(i.exprs for i in findall(n, lambda i: i.is_Exprs))
        try:
            halo_scheme = HaloScheme(exprs)
            if n.dim in halo_scheme.dmapper:
                processed[n] = NodeHalo(halo_scheme)
        except HaloSchemeException:
            # We should get here only when trying to compute a halo scheme
            # for a group of expressions that belong to different iteration
            # spaces. We expect proper halo schemes to be built as the
            # `stree` visit proceeds.
            # TODO: However, at the end, we should check that a halo scheme,
            # possibly even a "void" one, has been built for *all* of the
            # expressions, and error out otherwise.
            continue
        except RuntimeError as e:
            if configuration['mpi'] is True:
                raise RuntimeError(str(e))

    for k, v in processed.items():
        insert(v, k.parent, [k])

    return stree
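
# A small usage sketch of anytree's `LevelOrderIter` with a `stop` predicate,
# which is what the visit above relies on: once a node's parent has been
# assigned a NodeHalo, that node (and its subtree) is not visited again.
# Assumes anytree's documented `stop` behaviour; `_demo_*` is hypothetical.
def _demo_levelorder_stop():
    from anytree import Node
    time = Node('time')
    x = Node('x', parent=time)
    y = Node('y', parent=x)

    processed = {x}  # pretend `x` has already been given a NodeHalo
    visited = [n.name for n in LevelOrderIter(time,
                                              stop=lambda i: i.parent in processed)]
    assert visited == ['time', 'x']  # `y` is skipped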

def _optimize_halospots(self, iet):
    """
    Optimize the HaloSpots in ``iet``.

    * Remove all ``useless`` HaloSpots;
    * Merge all ``hoistable`` HaloSpots with their root HaloSpot, thus removing
      redundant communications and anticipating communications that will be
      required by later Iterations.
    """
    # Drop `useless` HaloSpots
    mapper = {hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
              for hs in FindNodes(HaloSpot).visit(iet)}
    iet = Transformer(mapper, nested=True).visit(iet)

    # Handle `hoistable` HaloSpots
    # First, we merge `hoistable` HaloSpots together, to anticipate communications
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        halo_spots = FindNodes(HaloSpot).visit(tree.root)
        if not halo_spots:
            continue
        root = halo_spots[0]
        if root in mapper:
            continue
        hss = [root.halo_scheme]
        hss.extend([hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]])
        try:
            mapper[root] = root._rebuild(halo_scheme=HaloScheme.union(hss))
        except ValueError:
            # HaloSpots have non-matching `loc_indices` and therefore can't
            # be merged
            warning("Found hoistable HaloSpots with disjoint loc_indices, "
                    "skipping optimization")
            continue
        for hs in halo_spots[1:]:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Then, we make sure the halo exchanges get performed *before* the first
    # distributed Dimension. Again, we do this to anticipate communications,
    # which hopefully pays off in performance
    #
    # <Iteration x>                     <HaloSpot(u)>, in y
    #   <HaloSpot(u)>, in y    ---->    <Iteration x>
    #     <Iteration y>                   <Iteration y>
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
        hoistable = [hs for hs in halo_spots if hs.hoistable]
        if not hoistable:
            continue
        elif len(hoistable) > 1:
            # We should never end up here, but for now we can't prove it formally
            warning("Found multiple hoistable HaloSpots, skipping optimization")
            continue
        hs = hoistable.pop()
        if hs in mapper:
            continue
        if i.dim.root in hs.dimensions:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)

            halo_scheme = hs.halo_scheme.project(hs.hoistable)
            mapper[i] = hs._rebuild(halo_scheme=halo_scheme, body=i._rebuild())
    iet = Transformer(mapper, nested=True).visit(iet)

    # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
    # subtrees, to overlap as much computation as possible. The HaloSpot-free
    # Iteration nests must be fully affine, otherwise we wouldn't be able to
    # honour the data dependences along the halo
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # Here, <B> doesn't require any halo exchange, but it might still need the
    # output of <A>; thus, if we do computation/communication overlap over <A>
    # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
    # will have to be split as well. For this, <B> must be affine.
    mapper = {}
    for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for g in v:
            root = None
            for i in g:
                if i.is_HaloSpot:
                    root = i
                    mapper[root] = [root.body]
                elif root and all(j.is_Affine for j in FindNodes(Iteration).visit(i)):
                    mapper[root].append(i)
                    mapper[i] = None
                else:
                    root = None
    mapper = {k: k._rebuild(body=List(body=v)) if v else v
              for k, v in mapper.items()}
    iet = Transformer(mapper).visit(iet)

    return iet, {}
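
# A toy walk-through (mock namedtuples; not Devito's IET types) of the final
# left-to-right scan above: affine HaloSpot-free Iterations are appended to
# the body of the HaloSpot preceding them, while a non-affine nest breaks the
# chain. The `_demo_*` name is hypothetical.
def _demo_embedding_scan():
    from collections import namedtuple
    N = namedtuple('N', 'name is_HaloSpot is_Affine')
    group = [N('hs', True, True), N('A', False, True), N('B', False, False)]

    mapper = {}
    root = None
    for i in group:
        if i.is_HaloSpot:
            root = i
            mapper[root] = []        # nodes to embed within `root`
        elif root and i.is_Affine:   # stand-in for "all nested Iterations affine"
            mapper[root].append(i)
        else:
            root = None              # chain broken: nothing else gets embedded

    assert [n.name for n in mapper[group[0]]] == ['A']  # `B` stays outside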

def _hoist_halospots(iet):
    """
    Hoist HaloSpots from inner to outer Iterations where all data dependencies
    would be honored.
    """
    # Hoisting rules -- if the retval is True, then it means the input `dep` is
    # not a stopper to halo hoisting
    def rule0(dep, candidates, loc_dims):
        # E.g., `dep=W<f,[x]> -> R<f,[x-1]>` and `candidates=({time}, {x})` => False
        # E.g., `dep=W<f,[t1, x, y]> -> R<f,[t0, x-1, y+1]>`, `dep.cause={t,time}`
        # and `candidates=({x},)` => True
        return (all(i & set(dep.distance_mapper) for i in candidates) and
                not any(i & dep.cause for i in candidates) and
                not any(i & loc_dims for i in candidates))

    def rule1(dep, candidates, loc_dims):
        # An increment isn't a stopper to hoisting
        return dep.write.is_increment

    hoist_rules = [rule0, rule1]

    # Precompute scopes to save time
    scopes = {i: Scope([e.expr for e in v])
              for i, v in MapNodes().visit(iet).items()}

    # Analysis
    hsmapper = {}
    imapper = defaultdict(list)
    for iters, halo_spots in MapNodes(Iteration, HaloSpot, 'groupby').visit(iet).items():
        for hs in halo_spots:
            hsmapper[hs] = hs.halo_scheme

            for f, (loc_indices, _) in hs.fmapper.items():
                loc_dims = frozenset().union([q for d in loc_indices
                                              for q in d._defines])

                for n, i in enumerate(iters):
                    candidates = [j.dim._defines for j in iters[n:]]

                    test = True
                    for dep in scopes[i].d_flow.project(f):
                        if any(rule(dep, candidates, loc_dims)
                               for rule in hoist_rules):
                            continue
                        test = False
                        break
                    if test:
                        hsmapper[hs] = hsmapper[hs].drop(f)
                        imapper[i].append(hs.halo_scheme.project(f))
                        break

    # Post-process analysis
    mapper = {i: HaloSpot(HaloScheme.union(hss), i._rebuild())
              for i, hss in imapper.items()}
    mapper.update({i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
                   for i, hs in hsmapper.items()})

    # Transform the IET hoisting/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    # Clean up: de-nest HaloSpots if necessary
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.body.is_HaloSpot:
            halo_scheme = HaloScheme.union([hs.halo_scheme, hs.body.halo_scheme])
            mapper[hs] = hs._rebuild(halo_scheme=halo_scheme, body=hs.body.body)
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
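
# A hedged toy check of `rule0` (mock dependence; Devito's real `dep` objects
# carry `distance_mapper` and `cause`), reproducing the docstring examples:
# a time-carried dependence allows hoisting over `x`, but not over `time`.
# The `_demo_*` name is hypothetical.
def _demo_rule0():
    from collections import namedtuple
    Dep = namedtuple('Dep', 'distance_mapper cause')
    dep = Dep(distance_mapper={'time': 1, 'x': 1, 'y': 1}, cause={'time'})

    def rule0(dep, candidates, loc_dims):
        return (all(c & set(dep.distance_mapper) for c in candidates) and
                not any(c & dep.cause for c in candidates) and
                not any(c & loc_dims for c in candidates))

    assert rule0(dep, candidates=[{'x'}], loc_dims=frozenset())
    assert not rule0(dep, candidates=[{'time'}, {'x'}], loc_dims=frozenset())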

def _merge_halospots(iet):
    """
    Merge HaloSpots on the same Iteration tree level where all data dependencies
    would be honored.
    """
    # Merge rules -- if the retval is True, then it means the input `dep` is not
    # a stopper to halo merging
    def rule0(dep, hs, loc_indices):
        # E.g., `dep=W<f,[t1, x]> -> R<f,[t0, x-1]>` => True
        return not any(d in hs.dimensions or dep.distance_mapper[d] is S.Infinity
                       for d in dep.cause)

    def rule1(dep, hs, loc_indices):
        # TODO: This is apparently never hit, but we are not yet confident
        # enough to remove it
        return (dep.is_regular and
                all(not any(dep.read.touched_halo(d.root)) for d in dep.cause))

    def rule2(dep, hs, loc_indices):
        # E.g., `dep=W<f,[t1, x+1]> -> R<f,[t1, xl+1]>` and `loc_indices={t: t0}`
        # => True
        return any(dep.distance_mapper[d] == 0 and dep.source[d] is not v
                   for d, v in loc_indices.items())

    merge_rules = [rule0, rule1, rule2]

    # Analysis
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot, 'immediate').visit(iet).items():
        if i is None or len(halo_spots) <= 1:
            continue

        scope = Scope([e.expr for e in FindNodes(Expression).visit(i)])

        hs0 = halo_spots[0]
        mapper[hs0] = hs0.halo_scheme

        for hs in halo_spots[1:]:
            mapper[hs] = hs.halo_scheme

            for f, (loc_indices, _) in hs.fmapper.items():
                test = True
                for dep in scope.d_flow.project(f):
                    if any(rule(dep, hs, loc_indices) for rule in merge_rules):
                        continue
                    test = False
                    break
                if test:
                    try:
                        mapper[hs0] = HaloScheme.union([mapper[hs0],
                                                        hs.halo_scheme.project(f)])
                        mapper[hs] = mapper[hs].drop(f)
                    except ValueError:
                        # E.g., `hs.loc_indices={t: t1}` while
                        # `hs0.loc_indices={t: t0}`
                        pass

    # Post-process analysis
    mapper = {i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
              for i, hs in mapper.items()}

    # Transform the IET merging/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
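
# A hedged toy check of `rule2` (mock dependence, simplified signature): with
# `loc_indices={t: t0}`, a dependence whose source accesses `t1` (not the
# fetched `t0`) at distance 0 along `t` does not block merging. The `_demo_*`
# name is hypothetical.
def _demo_rule2():
    from collections import namedtuple
    Dep = namedtuple('Dep', 'distance_mapper source')

    def rule2(dep, loc_indices):
        return any(dep.distance_mapper[d] == 0 and dep.source[d] is not v
                   for d, v in loc_indices.items())

    t0, t1 = object(), object()  # stand-ins for the actual time indices
    dep = Dep(distance_mapper={'t': 0}, source={'t': t1})
    assert rule2(dep, loc_indices={'t': t0})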