def guard(clusters):
    """
    Break up Clusters so that conditional expressions live in guarded Clusters.

    For every input Cluster, consecutive expressions sharing the same
    ``conditionals`` are gathered into one output Cluster. A run without
    conditionals is emitted unguarded; otherwise the output Cluster carries a
    ``guards`` mapping from each conditional's ``parent`` to the (unevaluated)
    conjunction of the conditions collected for that parent.

    Returns the list of resulting Clusters, in the order they were produced.
    """
    result = []
    for cluster in clusters:
        # Only *adjacent* expressions with equal conditionals end up together
        runs = groupby(cluster.exprs, key=lambda expr: expr.conditionals)
        for conditionals, run in runs:
            exprs = list(run)

            if not conditionals:
                result.append(Cluster(exprs, cluster.ispace, cluster.dspace))
                continue

            # Gather, parent by parent, the conditions that must hold
            terms = {}
            for cd in conditionals:
                bucket = terms.setdefault(cd.parent, [])
                if cd.condition is not None:
                    bucket.append(cd.condition)
                else:
                    # No explicit condition: fall back to `parent % factor == 0`
                    bucket.append(CondEq(cd.parent % cd.factor, 0))

            guards = {}
            for parent, conds in terms.items():
                guards[parent] = sympy.And(*conds, evaluate=False)

            result.append(Cluster(exprs, cluster.ispace, cluster.dspace, guards))

    return result
def guard(clusters):
    """
    Break up Clusters so that conditional expressions live in guarded Clusters.

    Expressions are visited in order. Each expression with non-empty
    ``conditionals`` becomes its own guarded Cluster; the unconditional
    expressions seen since the previous guarded one are flushed, together,
    into a single unguarded Cluster placed right before it.

    Returns the list of resulting Clusters, in the order they were produced.
    """
    result = []
    for cluster in clusters:
        pending = []
        for expr in cluster.exprs:
            conditionals = expr.conditionals
            if not conditionals:
                pending.append(expr)
                continue

            # Flush what needs no guarding before emitting the guarded Cluster
            if pending:
                result.append(Cluster(pending, cluster.ispace, cluster.dspace))
                pending = []

            # Gather, parent by parent, the conditions that must hold
            terms = {}
            for d in conditionals:
                bucket = terms.setdefault(d.parent, [])
                if d.condition is not None:
                    bucket.append(d.condition)
                else:
                    # No explicit condition: fall back to `parent % factor == 0`
                    bucket.append(CondEq(d.parent % d.factor, 0))

            guards = {}
            for parent, conds in terms.items():
                guards[parent] = sympy.And(*conds, evaluate=False)

            result.append(Cluster(expr, cluster.ispace, cluster.dspace, guards))

        # Whatever is still pending goes into a trailing unguarded Cluster
        if pending:
            result.append(Cluster(pending, cluster.ispace, cluster.dspace))

    return result
def clusterize(exprs, dse_mode=None):
    """
    Build a ClusterGroup out of a sequence of LoweredEqs.

    ``dse_mode`` is forwarded untouched to ``optimize``.
    """
    # One Cluster per input expression to begin with
    seed = []
    for expr in exprs:
        seed.append(Cluster(expr, expr.ispace, expr.dspace))

    # A topological ordering honouring flow- and anti-dependences is required
    # before iteration directions can be enforced
    ordered = Toposort().process(seed)

    # Enforcing the iteration directions turns anti- into flow-dependences by
    # reversing the iteration direction (Backward instead of Forward)
    directed = Enforce().process(ordered)

    # Sort again, to expose more fusion opportunities to `optimize`
    reordered = Toposort().process(directed)

    optimized = optimize(reordered, dse_mode)

    # Conditional Clusters are introduced last
    guarded = guard(optimized)

    return ClusterGroup(guarded)
def callback(self, clusters, prefix):
    """
    Lift Clusters out of the Dimensions listed in ``prefix``.

    A Cluster is a lifting candidate if at least one of its expressions is a
    tensor expression, none is an increment, and it uses none of the prefix
    Dimensions. A candidate is actually lifted only if none of its functions
    is written by any other Cluster in ``clusters``. Lifted Clusters are
    rebuilt with iteration and data spaces projected onto the Dimensions
    outside of ``prefix`` and are returned ahead of the untouched ones.

    ``clusters`` itself is returned when ``prefix`` is empty or when there
    are no candidates.
    """
    if not prefix:
        # No iteration space to be lifted from
        return clusters

    invariant_dims = {entry.dim for entry in prefix}

    def eligible(cluster):
        # Not just scalar exprs
        if not any(e.is_Tensor for e in cluster.exprs):
            return False
        # No reductions
        if any(e.is_Increment for e in cluster.exprs):
            return False
        # The iteration space must be invariant in the prefix Dimensions
        return not cluster.used_dimensions & invariant_dims

    candidates = [cluster for cluster in clusters if eligible(cluster)]
    if not candidates:
        return clusters

    def outside(dim):
        return dim not in invariant_dims

    # Now check data dependences
    hoisted = []
    kept = []
    for cluster in clusters:
        others = set(clusters) - {cluster}
        if cluster not in candidates or \
                any(set(cluster.functions) & set(o.scope.writes) for o in others):
            kept.append(cluster)
            continue

        # Lifting requires contracting the iteration space
        hoisted.append(Cluster(cluster.exprs,
                               cluster.ispace.project(outside),
                               cluster.dspace.project(outside),
                               guards=cluster.guards))

    return hoisted + kept
def clusterize(exprs, stencils, atomics=None):
    """
    Derive :class:`Cluster` objects from an iterable of expressions.

    A stencil must be supplied for each expression (``exprs`` and ``stencils``
    must have the same length). Optionally, a list of atomic dimensions (see
    description in Cluster.__doc__) may be provided; it is handed over to
    every Cluster created here.

    The Clusters built are finally passed through ``merge``, whose result is
    returned.
    """
    assert len(exprs) == len(stencils)

    exprs, stencils = aggregate(exprs, stencils)

    Info = namedtuple('Info', 'trace stencil')

    # Build a dependence graph; each tensor node is paired with its Stencil
    graph = TemporariesGraph(exprs)
    mapper = OrderedDict()
    for (key, node), stencil in zip(graph.items(), stencils):
        if not node.is_tensor:
            continue
        trace = graph.trace(key)
        trace += tuple(i for i in graph.trace(key, readby=True) if i not in trace)
        mapper[key] = Info(trace, stencil)

    # A cluster stencil is determined iteratively: start from the "local"
    # stencil, then fold in the stencils of the other clusters appearing in
    # its trace. Updates are propagated until a fixed point is reached.
    worklist = list(mapper)
    while worklist:
        target = worklist.pop(0)
        current = mapper[target]

        others = [i.lhs for i in current.trace if i.lhs != target]

        updated = Stencil(current.stencil.entries)
        for lhs in others:
            if lhs in mapper:
                updated = updated.add(mapper[lhs].stencil)

        mapper[target] = Info(current.trace, updated)

        if updated != current.stencil:
            # Something has changed, need to propagate the update
            to_revisit = [lhs for lhs in others if lhs not in worklist]
            worklist.extend(to_revisit)

    clusters = []
    for target, info in mapper.items():
        # Drop all non-output tensors, as computed by other clusters
        retained = [i for i in info.trace if i.lhs.is_Symbol or i.lhs == target]
        clusters.append(Cluster(retained, info.stencil.frozen, atomics))

    return merge(clusters)
def fuse(clusters):
    """
    Fuse sub-sequences of Clusters with compatible IterationSpace.

    Consecutive Clusters with equal ``itintervals`` are replaced by a single
    Cluster built via ``Cluster.from_clusters``. A sub-sequence is left as is
    when it consists of one Cluster only or when any of its Clusters has
    guards.
    """
    result = []
    for _, run in groupby(clusters, key=lambda cluster: cluster.itintervals):
        group = list(run)

        fusible = len(group) != 1 and not any(c.guards for c in group)
        if fusible:
            result.append(Cluster.from_clusters(*group))
        else:
            result.extend(group)

    return result
def clusterize(exprs):
    """
    Build a ClusterGroup out of a sequence of LoweredEqs.
    """
    # One Cluster per input expression to begin with
    seed = []
    for expr in exprs:
        seed.append(Cluster(expr, expr.ispace, expr.dspace))

    # The IterationSpaces are set up based on data dependence analysis
    scheduled = Schedule().process(seed)

    # ConditionalDimensions are handled next
    guarded = guard(scheduled)

    # Finally, derive relevant computational properties (e.g., parallelism)
    analyzed = analyze(guarded)

    return ClusterGroup(analyzed)
def clusterize(exprs, dse_mode=None):
    """
    Build a ClusterGroup out of a sequence of LoweredEqs.

    ``dse_mode`` is forwarded untouched to ``optimize``.
    """
    # One Cluster per input expression to begin with
    seed = []
    for expr in exprs:
        seed.append(Cluster(expr, expr.ispace, expr.dspace))

    # Topological ordering honouring flow- and anti-dependences
    ordered = Toposort().process(seed)

    # The IterationSpaces are set up based on data dependence analysis
    scheduled = Schedule().process(ordered)

    # Conditional Clusters are introduced before optimizing
    guarded = guard(scheduled)

    optimized = optimize(guarded, dse_mode)

    return ClusterGroup(optimized)
def clusterize(exprs):
    """
    Build a ClusterGroup out of a sequence of LoweredEqs.
    """
    # One Cluster per input expression to begin with
    seed = []
    for expr in exprs:
        seed.append(Cluster(expr, expr.ispace, expr.dspace))

    # Topological ordering honouring flow- and anti-dependences
    ordered = Toposort().process(seed)

    # The IterationSpaces are set up based on data dependence analysis
    scheduled = Schedule().process(ordered)

    # Conditional Clusters are introduced next
    guarded = guard(scheduled)

    # Finally, derive relevant computational properties (e.g., parallelism)
    analyzed = analyze(guarded)

    return ClusterGroup(analyzed)
def fuse(clusters):
    """
    Fuse sub-sequences of Clusters with compatible IterationSpace.

    Consecutive Clusters whose ``itintervals`` are equal as sets are replaced
    by a single Cluster built via ``Cluster.from_clusters``. A sub-sequence is
    left as is when it consists of one Cluster only, when any of its Clusters
    has guards, or when ``Cluster.from_clusters`` raises ``ValueError``.
    """
    result = []
    for _, run in groupby(clusters, key=lambda cluster: set(cluster.itintervals)):
        group = list(run)

        if len(group) == 1 or any(c.guards for c in group):
            result.extend(group)
            continue

        try:
            fused = Cluster.from_clusters(*group)
        except ValueError:
            # We end up here if, for example, some Clusters have same
            # iteration Dimensions but different (partial) orderings
            result.extend(group)
        else:
            result.append(fused)

    return result
def callback(self, clusters, prefix):
    """
    Lift Clusters out of the Dimensions listed in ``prefix``.

    Clusters are visited in order; one is lifted only if it has no
    increments, uses none of the prefix Dimensions, none of its functions is
    written by an "impacted" Cluster, and none of the scalars it writes is
    read by an impacted Cluster. The impacted Clusters are those already
    rejected plus those still to be visited. Lifted Clusters are rebuilt with
    iteration and data spaces projected onto the Dimensions outside of
    ``prefix`` (and then ``reset()``), and are returned ahead of the rest.

    ``clusters`` itself is returned when ``prefix`` is empty.
    """
    if not prefix:
        # No iteration space to be lifted from
        return clusters

    invariant_dims = {entry.dim for entry in prefix}

    def outside(dim):
        return dim not in invariant_dims

    hoisted = []
    kept = []

    def blocked(position, cluster):
        # Increments prevent lifting
        if cluster.has_increments:
            return True

        # A real candidate must not use any of the prefix Dimensions
        if cluster.used_dimensions & invariant_dims:
            return True

        others = set(kept) | set(clusters[position + 1:])

        # None of the Functions appearing in a lifted Cluster can be written to
        if any(cluster.functions & set(o.scope.writes) for o in others):
            return True

        # Scalars prevent lifting if they are read by another Cluster
        scalars = {f for f in cluster.scope.writes if f.is_Scalar}
        return any(scalars & set(o.scope.reads) for o in others)

    for position, cluster in enumerate(clusters):
        if blocked(position, cluster):
            kept.append(cluster)
            continue

        # Lifting requires contracting the iteration space
        ispace = cluster.ispace.project(outside).reset()
        dspace = cluster.dspace.project(outside).reset()
        hoisted.append(Cluster(cluster.exprs, ispace, dspace, cluster.guards))

    return hoisted + kept
def merge(clusters):
    """
    Squash together the :class:`Cluster` objects sharing the same stencil.

    Given an ordered collection of Clusters, return a (potentially) smaller
    list in which all Clusters with identical ``(stencil.entries, atomics)``
    have been merged into a single :class:`Cluster`. Within a merged Cluster,
    a temporary appearing in more than one input trace is kept only once --
    the first occurrence wins.
    """
    groups = OrderedDict()
    for cluster in clusters:
        signature = (cluster.stencil.entries, cluster.atomics)
        groups.setdefault(signature, []).append(cluster)

    merged = []
    for (entries, atomics), members in groups.items():
        # Eliminate redundant temporaries
        unique = OrderedDict()
        for member in members:
            for lhs, expr in member.trace.items():
                unique.setdefault(lhs, expr)

        merged.append(Cluster(unique.values(), Stencil(entries), atomics))

    return merged
def callback(self, clusters, prefix, backlog=None, known_break=None):
    """
    Enforce an iteration direction on ``clusters`` along the innermost
    Dimension of ``prefix``.

    Clusters that cannot be scheduled together with the others are moved to
    ``backlog``; once the remaining ones have been processed, the backlog
    Clusters are lifted along the ``known_break`` Dimensions, given direction
    ``Any`` along them, and processed through a new call to this method.

    ``clusters`` itself is returned when ``prefix`` is empty; otherwise a list
    of rebuilt Clusters is returned.
    """
    if not prefix:
        return clusters

    known_break = known_break or set()
    backlog = backlog or []

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    flow_cause = scope.d_flow.cause
    anti_cause = scope.d_anti.cause

    # The nastiest case:
    # eq0 := u[t+1, x] = ... u[t, x]
    # eq1 := v[t+1, x] = ... v[t, x] ... u[t, x] ... u[t+1, x] ... u[t+2, x]
    # Here, `eq0` marches forward along `t`, while `eq1` has both a flow and an
    # anti dependence with `eq0`, which ultimately will require `eq1` to go in
    # a separate t-loop
    require_break = (flow_cause & anti_cause) & candidates
    if require_break and len(clusters) > 1:
        # Try with increasingly smaller Cluster groups until the ambiguity
        # is solved; the Cluster peeled off goes to the front of the backlog
        return self.callback(clusters[:-1], prefix,
                             [clusters[-1]] + backlog, require_break)

    # If the flow- or anti-dependences are not coupled, one or more Clusters
    # might be scheduled separately, to increase parallelism (this is basically
    # what low-level compilers call "loop fission")
    for n in range(len(clusters)):
        d_cross = scope.d_from_access(scope.a_query(n, 'R')).cross()
        if any(d.is_storage_volatile(candidates) for d in d_cross):
            break
        carried = d_cross.cause & candidates
        if not carried:
            continue
        if n > 0:
            return self.callback(clusters[:n], prefix, clusters[n:] + backlog,
                                 carried | known_break)
        break

    # Compute iteration direction: Backward where an anti-dependence is
    # carried, overridden by Forward where a flow-dependence is carried;
    # Forward for anything left
    idir = {d: Backward for d in candidates if d.root in anti_cause}
    for d in candidates:
        if d.root in flow_cause:
            idir[d] = Forward
    for d in candidates:
        idir.setdefault(d, Forward)

    # Enforce iteration direction on each Cluster
    scheduled = []
    for cluster in clusters:
        directions = {**cluster.ispace.directions, **idir}
        ispace = IterationSpace(cluster.ispace.intervals,
                                cluster.ispace.sub_iterators,
                                directions)
        scheduled.append(Cluster(cluster.exprs, ispace, cluster.dspace))

    if not backlog:
        return scheduled

    # Handle the backlog -- the Clusters characterized by flow- and
    # anti-dependences along one or more Dimensions
    any_dir = {d: Any for d in known_break}
    for pos, cluster in enumerate(list(backlog)):
        directions = {**cluster.ispace.directions, **any_dir}
        ispace = IterationSpace(cluster.ispace.intervals.lift(known_break),
                                cluster.ispace.sub_iterators,
                                directions)
        dspace = cluster.dspace.lift(known_break)
        backlog[pos] = Cluster(cluster.exprs, ispace, dspace)

    return scheduled + self.callback(backlog, prefix)
def callback(self, clusters, prefix, backlog=None, known_break=None):
    """
    Enforce an iteration direction on ``clusters`` along the innermost
    Dimension of ``prefix``.

    Clusters that cannot be scheduled together with the others are moved to
    ``backlog``; once the remaining ones have been processed, the backlog
    Clusters are lifted along the ``known_break`` Dimensions, given direction
    ``Any`` along them, and processed through a new call to this method.

    ``clusters`` itself is returned when ``prefix`` is empty; otherwise a list
    of rebuilt Clusters is returned.
    """
    if not prefix:
        return clusters

    known_break = known_break or set()
    backlog = backlog or []

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    # Handle the nastiest case -- ambiguity due to the presence of both a
    # flow- and an anti-dependence.
    #
    # Note: in most cases, `scope.d_anti.cause == {}` -- either because
    # `scope.d_anti == {}` or because the few anti dependences are not carried
    # in any Dimension. We exploit this observation so that we only compute
    # `d_flow`, which instead may be expensive, when strictly necessary
    maybe_break = scope.d_anti.cause & candidates
    if len(clusters) > 1 and maybe_break:
        require_break = scope.d_flow.cause & maybe_break
        if require_break:
            # Try with increasingly smaller ClusterGroups until the ambiguity
            # is gone; the Cluster peeled off goes to the front of the backlog
            return self.callback(clusters[:-1], prefix,
                                 [clusters[-1]] + backlog, require_break)

    # Schedule Clusters over different IterationSpaces if this increases
    # parallelism
    for split in range(1, len(clusters)):
        if not self._break_for_parallelism(scope, candidates, split):
            continue
        return self.callback(clusters[:split], prefix,
                             clusters[split:] + backlog,
                             candidates | known_break)

    # Compute iteration direction; `d_flow` is only touched if `maybe_break`
    idir = {d: Backward for d in candidates if d.root in scope.d_anti.cause}
    if maybe_break:
        for d in candidates:
            if d.root in scope.d_flow.cause:
                idir[d] = Forward
    for d in candidates:
        idir.setdefault(d, Forward)

    # Enforce iteration direction on each Cluster
    scheduled = []
    for cluster in clusters:
        directions = {**cluster.ispace.directions, **idir}
        ispace = IterationSpace(cluster.ispace.intervals,
                                cluster.ispace.sub_iterators,
                                directions)
        scheduled.append(Cluster(cluster.exprs, ispace, cluster.dspace))

    if not backlog:
        return scheduled

    # Handle the backlog -- the Clusters characterized by flow- and
    # anti-dependences along one or more Dimensions
    any_dir = {d: Any for d in known_break}
    for pos, cluster in enumerate(list(backlog)):
        directions = {**cluster.ispace.directions, **any_dir}
        ispace = IterationSpace(cluster.ispace.intervals.lift(known_break),
                                cluster.ispace.sub_iterators,
                                directions)
        dspace = cluster.dspace.lift(known_break)
        backlog[pos] = Cluster(cluster.exprs, ispace, dspace)

    return scheduled + self.callback(backlog, prefix)
def callback(self, clusters, prefix, backlog=None, known_flow_break=None):
    """
    Enforce an iteration direction on ``clusters`` along the innermost
    Dimension of ``prefix``.

    When both a flow- and an anti-dependence are carried along that Dimension,
    the trailing Clusters are progressively moved to ``backlog``. Once the
    remaining ones have been processed, the intervals of the backlog Clusters
    are lifted along the ``known_flow_break`` Dimensions, direction ``Any`` is
    set along them, and the backlog is processed through a new call to this
    method.

    ``clusters`` itself is returned when ``prefix`` is empty; otherwise a list
    of rebuilt Clusters is returned.
    """
    if not prefix:
        return clusters

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    flow_cause = scope.d_flow.cause
    anti_cause = scope.d_anti.cause

    # The most nasty case:
    # eq0 := u[t+1, x] = ... u[t, x]
    # eq1 := v[t+1, x] = ... v[t, x] ... u[t, x] ... u[t+1, x] ... u[t+2, x]
    # Here, `eq0` marches forward along `t`, while `eq1` has both a flow and an
    # anti dependence with `eq0`, which ultimately will require `eq1` to go in
    # a separate t-loop
    require_flow_break = (flow_cause & anti_cause) & candidates
    if require_flow_break and len(clusters) > 1:
        # Try with increasingly smaller Cluster groups until the ambiguity
        # is solved; the Cluster peeled off goes to the front of the backlog
        return self.callback(clusters[:-1], prefix,
                             [clusters[-1]] + (backlog or []),
                             require_flow_break)

    # Compute iteration direction: Backward where an anti-dependence is
    # carried, overridden by Forward where a flow-dependence is carried;
    # Forward for anything left
    idir = {d: Backward for d in candidates if d.root in anti_cause}
    for d in candidates:
        if d.root in flow_cause:
            idir[d] = Forward
    for d in candidates:
        idir.setdefault(d, Forward)

    # Enforce iteration direction on each Cluster
    scheduled = []
    for cluster in clusters:
        directions = {**cluster.ispace.directions, **idir}
        ispace = IterationSpace(cluster.ispace.intervals,
                                cluster.ispace.sub_iterators,
                                directions)
        scheduled.append(Cluster(cluster.exprs, ispace, cluster.dspace))

    if backlog is None:
        return scheduled

    # Handle the backlog -- the Clusters characterized by flow+anti dependences
    # along one or more Dimensions
    any_dir = {d: Any for d in known_flow_break}
    for pos, cluster in enumerate(as_tuple(backlog)):
        directions = {**cluster.ispace.directions, **any_dir}
        ispace = IterationSpace(cluster.ispace.intervals.lift(known_flow_break),
                                cluster.ispace.sub_iterators,
                                directions)
        backlog[pos] = Cluster(cluster.exprs, ispace, cluster.dspace)

    return scheduled + self.callback(backlog, prefix)