def topological_sort(exprs):
    """Topologically sort the temporaries in a list of equations."""
    lhs2expr = {eq.lhs: eq for eq in exprs}
    assert len(lhs2expr) == len(exprs)  # Expect SSA

    # Only the scalar (temporary) equations get ordered via a DAG; the
    # tensor equations are left in user-provided order
    scalars, tensors = split(exprs, lambda eq: not eq.lhs.is_Indexed)

    dag = DAG(nodes=scalars)
    for eq in scalars:
        for term in retrieve_terminals(eq.rhs):
            writer = lhs2expr.get(term)
            if writer is None or writer is eq or term.is_Indexed:
                # Skip: unknown symbol, self-reference (e.g., Eq(f, f + 1),
                # which would induce a cycle), or an Indexed -- only scalars
                # enforce an ordering
                continue
            dag.add_edge(writer, eq, force_add=True)

    retval = dag.topological_sort()

    # The tensor equations are appended at the end, in user-provided order
    retval.extend(tensors)

    return retval
def process(func, state): """ Apply ``func`` to the IETs in ``state._efuncs``, and update ``state`` accordingly. """ # Create a Call graph. `func` will be applied to each node in the Call graph. # `func` might change an `efunc` signature; the Call graph will be used to # propagate such change through the `efunc` callers dag = DAG(nodes=['root']) queue = ['root'] while queue: caller = queue.pop(0) callees = FindNodes(Call).visit(state._efuncs[caller]) for callee in filter_ordered([i.name for i in callees]): if callee in state._efuncs: # Exclude foreign Calls, e.g., MPI calls try: dag.add_node(callee) queue.append(callee) except KeyError: # `callee` already in `dag` pass dag.add_edge(callee, caller) assert dag.size == len(state._efuncs) # Apply `func` for i in dag.topological_sort(): state._efuncs[i], metadata = func(state._efuncs[i]) # Track any new Dimensions introduced by `func` state._dimensions.extend(list(metadata.get('dimensions', []))) # Track any new #include required by `func` state._includes.extend(list(metadata.get('includes', []))) state._includes = filter_ordered(state._includes) # Track any new ElementalFunctions state._efuncs.update( OrderedDict([(i.name, i) for i in metadata.get('efuncs', [])])) # If there's a change to the `args` and the `iet` is an efunc, then # we must update the call sites as well, as the arguments dropped down # to the efunc have just increased args = as_tuple(metadata.get('args')) if args: # `extif` avoids redundant updates to the parameters list, due # to multiple children wanting to add the same input argument extif = lambda v: list(v) + [e for e in args if e not in v] stack = [i] + dag.all_downstreams(i) for n in stack: efunc = state._efuncs[n] calls = [ c for c in FindNodes(Call).visit(efunc) if c.name in stack ] mapper = { c: c._rebuild(arguments=extif(c.arguments)) for c in calls } efunc = Transformer(mapper).visit(efunc) if efunc.is_Callable: efunc = efunc._rebuild(parameters=extif(efunc.parameters)) state._efuncs[n] = 
efunc
def _build_dag(self, cgroups, prefix):
    """
    A DAG capturing dependences between *all* ClusterGroups within an
    iteration space.

    Examples
    --------
    When do we need to sequentialize two ClusterGroups `cg0` and `cg1`?

    Essentially any time there's a dependence between them, apart from when
    it's a carried flow-dependence within the given iteration space.

    Let's consider two ClusterGroups `cg0` and `cg1` within the iteration
    space identified by the Dimension `i`.

    1) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j] ...
    Non-carried flow-dependence, so `cg1` must go after `cg0`

    2) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j+1] ...
    Anti-dependence in `j`, so `cg1` must go after `cg0`

    3) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i-1, j+1] ...
    Flow-dependence in `i`, so `cg1` can safely go before or after `cg0`
    (but clearly still within the `i` iteration space).
    Note: the `j+1` in `cg1` has no impact -- the dependence is in `i`.

    4) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j-1] ...
    Flow-dependence in `j`, so `cg1` must go after `cg0`. Unlike case 3),
    the flow-dependence is along an inner Dimension, so `cg0` and `cg1`
    need to be sequentialized.
    """
    # The set of Dimensions spanned by the given iteration space
    prefix = {i.dim for i in as_tuple(prefix)}

    dag = DAG(nodes=cgroups)
    for n, cg0 in enumerate(cgroups):
        for cg1 in cgroups[n + 1:]:
            # All dependences across the two ClusterGroups
            scope = Scope(exprs=cg0.exprs + cg1.exprs)

            # Handle anti-dependences
            # Dependences internal to either ClusterGroup are irrelevant here
            local_deps = cg0.scope.d_anti + cg1.scope.d_anti
            if scope.d_anti - local_deps:
                dag.add_edge(cg0, cg1)
                # NOTE(review): the `break` stops pairing `cg0` with any
                # later ClusterGroup once an edge is added -- presumably
                # relying on transitivity of the ordering; confirm
                break

            # Flow-dependences along one of the `prefix` Dimensions can
            # be ignored; all others require sequentialization
            local_deps = cg0.scope.d_flow + cg1.scope.d_flow
            if any(not i.cause or not (i.cause & prefix)
                   for i in scope.d_flow - local_deps):
                dag.add_edge(cg0, cg1)
                break

    return dag
def process(func, state): """ Apply ``func`` to the IETs in ``state._efuncs``, and update ``state`` accordingly. """ # Create a Call graph. `func` will be applied to each node in the Call graph. # `func` might change an `efunc` signature; the Call graph will be used to # propagate such change through the `efunc` callers dag = DAG(nodes=['root']) queue = ['root'] while queue: caller = queue.pop(0) callees = FindNodes(Call).visit(state._efuncs[caller]) for callee in filter_ordered([i.name for i in callees]): if callee in state._efuncs: # Exclude foreign Calls, e.g., MPI calls try: dag.add_node(callee) queue.append(callee) except KeyError: # `callee` already in `dag` pass dag.add_edge(callee, caller) assert dag.size == len(state._efuncs) # Apply `func` for i in dag.topological_sort(): state._efuncs[i], metadata = func(state._efuncs[i]) # Track any new Dimensions introduced by `func` state._dimensions.extend(list(metadata.get('dimensions', []))) # Track any new #include required by `func` state._includes.extend(list(metadata.get('includes', []))) state._includes = filter_ordered(state._includes) # Track any new ElementalFunctions state._efuncs.update(OrderedDict([(i.name, i) for i in metadata.get('efuncs', [])])) # If there's a change to the `args` and the `iet` is an efunc, then # we must update the call sites as well, as the arguments dropped down # to the efunc have just increased args = as_tuple(metadata.get('args')) if args: # `extif` avoids redundant updates to the parameters list, due # to multiple children wanting to add the same input argument extif = lambda v: list(v) + [e for e in args if e not in v] stack = [i] + dag.all_downstreams(i) for n in stack: efunc = state._efuncs[n] calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack] mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls} efunc = Transformer(mapper).visit(efunc) if efunc.is_Callable: efunc = efunc._rebuild(parameters=extif(efunc.parameters)) state._efuncs[n] = efunc
def _create_call_graph(self):
    """
    Build the Call graph of `self.efuncs`: one node per efunc, rooted at
    'root', with an edge from each callee to its caller.
    """
    dag = DAG(nodes=['root'])

    # Breadth-first walk of the call sites, starting from 'root'
    worklist = ['root']
    while worklist:
        parent = worklist.pop(0)
        found = FindNodes(Call).visit(self.efuncs[parent])
        for child in filter_ordered([c.name for c in found]):
            if child not in self.efuncs:
                # A foreign Call, e.g., an MPI call -- not part of the graph
                continue
            try:
                dag.add_node(child)
                worklist.append(child)
            except KeyError:
                # `child` already in `dag`
                pass
            dag.add_edge(child, parent)

    # Sanity check
    assert dag.size == len(self.efuncs)

    return dag
def topological_sort(exprs):
    """Topologically sort a list of equations."""
    lhs2expr = {eq.lhs: eq for eq in exprs}
    assert len(lhs2expr) == len(exprs)  # Expect SSA

    dag = DAG(nodes=exprs)
    for eq in exprs:
        for term in retrieve_terminals(eq.rhs):
            writer = lhs2expr.get(term)
            if writer is None or writer is eq or term.is_Indexed:
                # Skip: unknown symbol, self-reference (e.g., Eq(f, f + 1),
                # which would induce a cycle), or an Indexed -- only scalars
                # enforce an ordering
                continue
            dag.add_edge(writer, eq, force_add=True)

    return dag.topological_sort()
def _build_dag(self, cgroups, prefix):
    """
    A DAG captures data dependences between ClusterGroups up to the
    iteration space depth dictated by ``prefix``.

    Examples
    --------
    Consider two ClusterGroups `cg0` and `cg1`, and ``prefix=[i]``.

    1) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j] ...
    Non-carried flow-dependence, so `cg1` must go after `cg0`.

    2) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j-1] ...
    Carried flow-dependence in `j`, so `cg1` must go after `cg0`.

    3) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j+1] ...
    Carried anti-dependence in `j`, so `cg1` must go after `cg0`.

    4) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i-1, j+1] ...
    Carried flow-dependence in `i`, so `cg1` can safely go before or after
    `cg0`. Note: the `j+1` in `cg1` has no impact -- the actual dependence
    between `b[i, j]` and `b[i-1, j+1]` is along `i`.
    """
    # The set of Dimensions spanned by the given iteration space
    prefix = {i.dim for i in as_tuple(prefix)}

    dag = DAG(nodes=cgroups)
    for n, cg0 in enumerate(cgroups):
        for cg1 in cgroups[n + 1:]:
            # All dependences across the two ClusterGroups
            scope = Scope(exprs=cg0.exprs + cg1.exprs)

            # Handle anti-dependences (dependences internal to either
            # ClusterGroup are irrelevant, hence subtracted away)
            deps = scope.d_anti - (cg0.scope.d_anti + cg1.scope.d_anti)
            if any(i.cause & prefix for i in deps):
                # Anti-dependences break the execution flow
                # i) ClusterGroups between `cg0` and `cg1` must precede `cg1`
                for cg2 in cgroups[n:cgroups.index(cg1)]:
                    dag.add_edge(cg2, cg1)
                # ii) ClusterGroups after `cg1` cannot precede `cg1`
                for cg2 in cgroups[cgroups.index(cg1) + 1:]:
                    dag.add_edge(cg1, cg2)
                break
            elif deps:
                # Anti-dependence along an inner Dimension: plain ordering
                dag.add_edge(cg0, cg1)

            # Flow-dependences along one of the `prefix` Dimensions can
            # be ignored; all others require sequentialization
            deps = scope.d_flow - (cg0.scope.d_flow + cg1.scope.d_flow)
            if any(not (i.cause and i.cause & prefix) for i in deps):
                dag.add_edge(cg0, cg1)
                continue

            # Handle increment-after-write dependences
            deps = scope.d_output - (cg0.scope.d_output + cg1.scope.d_output)
            if any(i.is_iaw for i in deps):
                dag.add_edge(cg0, cg1)
                continue

    return dag
def _build_dag(self, cgroups, prefix):
    """
    A DAG representing the data dependences across the ClusterGroups within
    a given scope.

    Nodes are the given `cgroups`; an edge `cg0 -> cg1` means `cg1` must be
    scheduled after `cg0`.
    """
    # The set of Dimensions spanned by the given iteration space
    prefix = {i.dim for i in as_tuple(prefix)}

    dag = DAG(nodes=cgroups)
    for n, cg0 in enumerate(cgroups):
        for cg1 in cgroups[n + 1:]:
            # A Scope to compute all cross-ClusterGroup anti-dependences
            rule = lambda i: i.is_cross
            scope = Scope(exprs=cg0.exprs + cg1.exprs, rules=rule)

            # Optimization: we exploit the following property:
            # no prefix => (edge <=> at least one (any) dependence)
            # to jump out of this potentially expensive loop as quickly as possible
            if not prefix and any(scope.d_all_gen()):
                dag.add_edge(cg0, cg1)

            # Anti-dependences along `prefix` break the execution flow
            # (intuitively, "the loop nests are to be kept separated")
            # * All ClusterGroups between `cg0` and `cg1` must precede `cg1`
            # * All ClusterGroups after `cg1` cannot precede `cg1`
            elif any(i.cause & prefix for i in scope.d_anti_gen()):
                for cg2 in cgroups[n:cgroups.index(cg1)]:
                    dag.add_edge(cg2, cg1)
                for cg2 in cgroups[cgroups.index(cg1) + 1:]:
                    dag.add_edge(cg1, cg2)
                break

            # Any anti- and iaw-dependences impose that `cg1` follows `cg0`
            # while not being its immediate successor (unless it already is),
            # to avoid they are fused together (thus breaking the dependence)
            # TODO: the "not being its immediate successor" part *seems* to be
            # a work around to the fact that any two Clusters characterized
            # by anti-dependence should have been given a different stamp,
            # and same for guarded Clusters, but that is not the case (yet)
            elif any(scope.d_anti_gen()) or\
                    any(i.is_iaw for i in scope.d_output_gen()):
                dag.add_edge(cg0, cg1)
                # Interpose `cgroups[index]` between `cg0` and `cg1` so that
                # `cg0` and `cg1` cannot end up adjacent
                index = cgroups.index(cg1) - 1
                if index > n and self._key(cg0) == self._key(cg1):
                    dag.add_edge(cg0, cgroups[index])
                    dag.add_edge(cgroups[index], cg1)

            # Any flow-dependences along an inner Dimension (i.e., a Dimension
            # that doesn't appear in `prefix`) impose that `cg1` follows `cg0`
            elif any(not (i.cause and i.cause & prefix)
                     for i in scope.d_flow_gen()):
                dag.add_edge(cg0, cg1)

            # Clearly, output dependences must be honored
            elif any(scope.d_output_gen()):
                dag.add_edge(cg0, cg1)

    return dag
def _build_dag(self, cgroups, prefix):
    """
    A DAG captures data dependences between ClusterGroups up to the
    iteration space depth dictated by ``prefix``.

    Examples
    --------
    Consider two ClusterGroups `cg0` and `cg1`, and ``prefix=[i]``.

    1) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j] ...
    Non-carried flow-dependence, so `cg1` must go after `cg0`.

    2) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j-1] ...
    Carried flow-dependence in `j`, so `cg1` must go after `cg0`.

    3) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j+1] ...
    Carried anti-dependence in `j`, so `cg1` must go after `cg0`.

    4) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i-1, j+1] ...
    Carried flow-dependence in `i`, so `cg1` can safely go before or after
    `cg0`. Note: the `j+1` in `cg1` has no impact -- the actual dependence
    between `b[i, j]` and `b[i-1, j+1]` is along `i`.
    """
    # The set of Dimensions spanned by the given iteration space
    prefix = {i.dim for i in as_tuple(prefix)}

    dag = DAG(nodes=cgroups)
    for n, cg0 in enumerate(cgroups):
        for cg1 in cgroups[n + 1:]:
            rule = lambda i: i.is_cross  # Only retain dep if cross-ClusterGroup
            scope = Scope(exprs=cg0.exprs + cg1.exprs, rules=rule)

            # Optimization: we exploit the following property:
            # no prefix => (edge <=> at least one (any) dependence)
            # To jump out of this potentially expensive loop as quickly as possible
            if not prefix and any(scope.d_all_gen()):
                dag.add_edge(cg0, cg1)
                continue

            # Handle anti-dependences
            if any(i.cause & prefix for i in scope.d_anti_gen()):
                # Anti-dependences break the execution flow
                # i) ClusterGroups between `cg0` and `cg1` must precede `cg1`
                for cg2 in cgroups[n:cgroups.index(cg1)]:
                    dag.add_edge(cg2, cg1)
                # ii) ClusterGroups after `cg1` cannot precede `cg1`
                for cg2 in cgroups[cgroups.index(cg1) + 1:]:
                    dag.add_edge(cg1, cg2)
                break
            elif any(scope.d_anti_gen()):
                # Anti-dependence along an inner Dimension: plain ordering
                dag.add_edge(cg0, cg1)
                continue

            # Flow-dependences along one of the `prefix` Dimensions can
            # be ignored; all others require sequentialization
            if any(not (i.cause and i.cause & prefix)
                   for i in scope.d_flow_gen()):
                dag.add_edge(cg0, cg1)
                continue

            # Handle increment-after-write dependences
            if any(i.is_iaw for i in scope.d_output_gen()):
                dag.add_edge(cg0, cg1)
                continue

    return dag
class State(object):

    """
    Mutable container tracking the IETs ('efuncs') produced during a pass
    pipeline, together with their Call graph and any Dimensions, inputs and
    #includes the passes introduce. The root IET is registered as 'main'.
    """

    def __init__(self, iet):
        # name -> IET; 'main' is the root IET
        self._efuncs = OrderedDict([('main', iet)])
        # New Dimensions introduced by the applied passes
        self._dimensions = []
        # New input arguments introduced by the applied passes
        self._input = []
        # #include's required by the applied passes
        self._includes = []

        # Callee -> caller edges; a topological sort visits callees first
        self._call_graph = DAG(nodes=['main'])

    def _process(self, func):
        """Apply ``func`` to all tracked ``IETs``."""
        for i in self._call_graph.topological_sort():
            self._efuncs[i], metadata = func(self._efuncs[i])

            # Track any new Dimensions and includes introduced by `func`
            self._dimensions.extend(list(metadata.get('dimensions', [])))
            self._includes.extend(list(metadata.get('includes', [])))

            # If there's a change to the `input` and the `iet` is an efunc, then
            # we must update the call sites as well, as the arguments dropped down
            # to the efunc have just increased
            _input = as_tuple(metadata.get('input'))
            if _input:
                # `extif` avoids redundant updates to the parameters list, due
                # to multiple children wanting to add the same input argument
                extif = lambda v: list(v) + [e for e in _input if e not in v]
                # `i` plus everything that (transitively) calls `i`
                stack = [i] + self._call_graph.all_downstreams(i)
                for n in stack:
                    efunc = self._efuncs[n]
                    # Extend the argument lists at the call sites...
                    calls = [c for c in FindNodes(Call).visit(efunc)
                             if c.name in stack]
                    mapper = {c: c._rebuild(arguments=extif(c.arguments))
                              for c in calls}
                    efunc = Transformer(mapper).visit(efunc)
                    # ... and the parameters in the efunc signature itself
                    if efunc.is_Callable:
                        efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                    self._efuncs[n] = efunc
                self._input.extend(list(_input))

            # `efuncs` metadata maps new Callables to their call targets;
            # NOTE(review): `v` appears to be an iterable of target names, with
            # falsy entries defaulting to 'main' -- confirm against callers
            for k, v in metadata.get('efuncs', {}).items():
                # Update the efuncs
                if k.is_Callable:
                    self._efuncs[k.name] = k
                # Update the call graph
                self._call_graph.add_node(k.name, ignore_existing=True)
                for target in (v or [None]):
                    self._call_graph.add_edge(k.name, target or 'main',
                                              force_add=True)

    @property
    def root(self):
        # The root IET
        return self._efuncs['main']

    @property
    def efuncs(self):
        # All non-root IETs
        return tuple(v for k, v in self._efuncs.items() if k != 'main')

    @property
    def dimensions(self):
        return self._dimensions

    @property
    def input(self):
        return self._input

    @property
    def includes(self):
        return self._includes