def callback(self, clusters, prefix):
    if not prefix:
        return clusters

    d = prefix[-1].dim

    # Create the block Dimensions (in total `self.levels` Dimensions)
    name = self.template % (d.name, self.nblocked[d], '%d')

    bd = IncrDimension(name % 0, d, d.symbolic_min, d.symbolic_max)
    size = bd.step
    block_dims = [bd]

    for i in range(1, self.levels):
        bd = IncrDimension(name % i, bd, bd, bd + bd.step - 1, size=size)
        block_dims.append(bd)

    bd = IncrDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=size)
    block_dims.append(bd)

    processed = []
    for c in clusters:
        if TILABLE in c.properties[d]:
            ispace = decompose(c.ispace, d, block_dims)

            # Use the innermost IncrDimension in place of `d`
            exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

            # The new Cluster properties.
            # The TILABLE property is dropped after blocking.
            # SKEWABLE is dropped as well, but only from the new
            # block dimensions.
            properties = dict(c.properties)
            properties.pop(d)
            properties.update({bd: c.properties[d] - {TILABLE} for bd in block_dims})
            properties.update({bd: c.properties[d] - {SKEWABLE}
                               for bd in block_dims[:-1]})

            processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                       properties=properties))
        else:
            processed.append(c)

    # Make sure to use unique IncrDimensions
    self.nblocked[d] += int(any(TILABLE in c.properties[d] for c in clusters))

    return processed

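# A minimal, self-contained sketch (not Devito code; all names are hypothetical)
# of the loop structure that the hierarchy of IncrDimensions built above
# corresponds to, assuming `levels=2` and a 1-D iteration space: an outer block
# Dimension strides over the full range, an inner block Dimension strides within
# each outer block, and the innermost Dimension (which reuses the original name,
# e.g. `x`) has unit stride.
def _blocked_loop_sketch(i_min, i_max, step0=16, step1=4):
    """Visit [i_min, i_max] with two levels of blocking."""
    visited = []
    for b0 in range(i_min, i_max + 1, step0):                         # e.g. `x0_blk0`
        for b1 in range(b0, min(b0 + step0 - 1, i_max) + 1, step1):   # e.g. `x0_blk1`
            for x in range(b1, min(b1 + step1 - 1, i_max) + 1):       # innermost `x`
                visited.append(x)
    return visited

# Every point is visited exactly once, in blocked order
assert _blocked_loop_sketch(0, 99) == list(range(100))
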
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ...  # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only if:
        # test0 := they compute temporary arrays, but not if they compute input data
        # test1 := the outer Iterations have actually been blocked
        exprs = FindNodes(Expression).visit(tree)
        writes = [j.write for j in exprs if j.is_tensor]
        test0 = not all(j.is_Array for j in writes)
        test1 = any(not isinstance(j.limits[0], BlockDimension) for j in root)
        if test0 or test1:
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        modified_tree = []
        modified_root = []
        modified_dims = {}
        mapper = {}
        for t, r in zip(tree, root):
            udim0 = IncrDimension(t.dim, t.symbolic_min, 1, "%ss%d" % (t.index, i))
            modified_tree.append(t._rebuild(limits=(0, t.limits[1] - t.limits[0], t.step),
                                            uindices=t.uindices + (udim0,)))

            mapper[t.dim] = udim0

            udim1 = IncrDimension(t.dim, 0, 1, "%ss%d" % (t.index, i))
            modified_root.append(r._rebuild(uindices=r.uindices + (udim1,)))

            d = r.limits[0]
            assert isinstance(d, BlockDimension)
            modified_dims[d.root] = d

        # Temporary arrays can now be moved onto the stack
        for w in writes:
            dims = tuple(modified_dims.get(d, d) for d in w.dimensions)
            shape = tuple(d.symbolic_size for d in dims)
            w.update(shape=shape, dimensions=dims, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    lambda i: i.function not in writes, True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]

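# A minimal sketch (not Devito code; names are hypothetical) of the
# memory-footprint reduction that `optimize_unfolded_tree` performs: once the
# iteration space is blocked, the temporary only needs to cover one block, and
# the shifted indices play the role of the IncrDimension `uindices` introduced
# above. Shown here with plain Python lists over a 1-D domain.
def _blocked_scratch_sketch(a, bs):
    """Compute out[i] = tmp[i] + tmp[i+1] with tmp[i] = 2*a[i], blockwise."""
    n = len(a)
    out = [0] * (n - 1)
    for i_blk in range(0, n - 1, bs):
        width = min(bs, n - 1 - i_blk)
        # Block-sized temporary (at most `bs + 1` entries, not `n`)
        tmp = [2 * a[i_blk + k] for k in range(width + 1)]
        for k in range(width):
            out[i_blk + k] = tmp[k] + tmp[k + 1]
    return out

# Same result as the unblocked computation with a domain-sized temporary
assert _blocked_scratch_sketch(list(range(10)), 4) == [4*i + 2 for i in range(9)]
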
def lower_aliases(cluster, aliases, in_writeto, maxpar):
    """
    Create a Schedule from an AliasMapper.
    """
    dmapper = {}
    processed = []
    for alias, v in aliases.items():
        imapper = {**{i.dim: i for i in v.intervals},
                   **{i.dim.parent: i for i in v.intervals
                      if i.dim.is_NonlinearDerived}}

        intervals = []
        writeto = []
        sub_iterators = {}
        indicess = [[] for _ in v.distances]

        for i in cluster.ispace.intervals:
            try:
                interval = imapper[i.dim]
            except KeyError:
                # E.g., `x0_blk0` or (`a[y_m+1]` => `y not in imapper`)
                intervals.append(i)
                continue

            assert i.stamp >= interval.stamp

            if not (writeto or interval != interval.zero() or in_writeto(i.dim, cluster)):
                # The alias doesn't require a temporary Dimension along `i.dim`
                intervals.append(i)
                continue

            assert not i.dim.is_NonlinearDerived

            # `i.dim` is necessarily part of the write-to region, so
            # we have to adjust the Interval's stamp. For example, consider
            # `i=x[0,0]<1>` and `interval=x[-4,4]<0>`; here we need to
            # use `<1>` as stamp, which is what appears in `cluster`
            interval = interval.lift(i.stamp)

            # We further bump the interval stamp if we were requested to trade
            # fusion for more collapse-parallelism
            interval = interval.lift(interval.stamp + int(maxpar))

            writeto.append(interval)
            intervals.append(interval)

            if i.dim.is_Incr:
                # Suitable IncrDimensions must be used to avoid OOB accesses.
                # E.g., r[xs][ys][z] => both `xs` and `ys` must be initialized such
                # that all accesses are within bounds. This requires traversing the
                # hierarchy of IncrDimensions to set `xs` (`ys`) in a way that
                # consecutive blocks access consecutive regions in `r` (e.g.,
                # `xs=x0_blk1-x0_blk0` with `blocklevels=2`; `xs=0` with
                # `blocklevels=1`, that is it degenerates in this case)
                try:
                    d = dmapper[i.dim]
                except KeyError:
                    dd = i.dim.parent
                    assert dd.is_Incr
                    if dd.parent.is_Incr:
                        # An IncrDimension in between IncrDimensions
                        m = i.dim.symbolic_min - i.dim.parent.symbolic_min
                    else:
                        m = 0
                    d = dmapper[i.dim] = IncrDimension("%ss" % i.dim.name, i.dim, m,
                                                       dd.symbolic_size, 1, dd.step)
                sub_iterators[i.dim] = d
            else:
                d = i.dim

            # Given the iteration `interval`, lower distances to indices
            for distance, indices in zip(v.distances, indicess):
                indices.append(d - interval.lower + distance[interval.dim])

        # The alias write-to space
        writeto = IterationSpace(IntervalGroup(writeto), sub_iterators)

        # The alias iteration space
        intervals = IntervalGroup(intervals, cluster.ispace.relations)
        ispace = IterationSpace(intervals, cluster.sub_iterators, cluster.directions)
        ispace = ispace.augment(sub_iterators)

        processed.append(ScheduledAlias(alias, writeto, ispace, v.aliaseds, indicess))

    # The ScheduledAliases must be ordered so as to reuse as many of the
    # `cluster`'s IterationIntervals as possible in order to honor the
    # write-to region. Another fundamental reason for ordering is to ensure
    # deterministic code generation
    processed = sorted(processed, key=lambda i: cit(cluster.ispace, i.ispace))

    return Schedule(*processed, dmapper=dmapper)

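# A minimal sketch (not Devito code; names are hypothetical) of the
# "lower distances to indices" step above: given the lower bound of the
# write-to Interval along a Dimension and the distance of each aliased
# expression from the alias prototype, the index into the temporary is
# `d - lower + distance`.
def _lower_distances_sketch(lower, distances):
    """Return, per distance, a function mapping loop index -> temporary index."""
    return [lambda d, dist=dist: d - lower + dist for dist in distances]

# A temporary covering the interval [-1, 1] around the loop index, accessed by
# three aliased expressions at distances -1, 0 and +1:
_idx = _lower_distances_sketch(-1, [-1, 0, 1])
assert [f(0) for f in _idx] == [0, 1, 2]
assert [f(5) for f in _idx] == [5, 6, 7]
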
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ...  # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only if they compute temporary
        # arrays, but not if they compute input data
        exprs = FindNodes(Expression).visit(tree[-1])
        writes = [j.write for j in exprs if j.is_tensor]
        if not all(j.is_Array for j in writes):
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        modified_tree = []
        modified_root = []
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            t1_udim = IncrDimension(t1.dim, t1.limits[0], 1, "%ss%d" % (t1.index, i))
            limits = (0, t1.limits[1] - t1.limits[0], t1.symbolic_incr)
            modified_tree.append(t1._rebuild(limits=limits,
                                             uindices=t1.uindices + (t1_udim,)))

            t2_udim = IncrDimension(t1.dim, -t1.limits[0], 1, "%ss%d" % (t1.index, i))
            modified_root.append(t2._rebuild(uindices=t2.uindices + (t2_udim,)))

            mapper[t1.dim] = t1_udim

        # Temporary arrays can now be moved onto the stack
        if all(not j.is_Remainder for j in modified_tree):
            dimensions = tuple(j.limits[0] for j in modified_root)
            for j in writes:
                if j.is_Array:
                    j_dimensions = dimensions + j.dimensions[len(modified_root):]
                    j_shape = tuple(k.symbolic_size for k in j_dimensions)
                    j.update(shape=j_shape, dimensions=j_dimensions, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper, only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]

def _create_efuncs(self, nodes, state):
    """
    Extract Iteration sub-trees and turn them into Calls+Callables.

    Currently, only tagged, elementizable Iteration objects are targeted.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    efuncs = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Build a new Iteration/Expression tree with free bounds
        free = []
        defined_args = {}  # Map of argument values defined by loop bounds
        for i in target:
            name, bounds = i.dim.name, i.symbolic_bounds

            # Iteration bounds
            _min = Scalar(name='%sf_m' % name, dtype=np.int32, is_const=True)
            _max = Scalar(name='%sf_M' % name, dtype=np.int32, is_const=True)
            defined_args[_min.name] = bounds[0]
            defined_args[_max.name] = bounds[1]

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                     for j in range(len(i.uindices))]
            defined_args.update({uf.name: j.symbolic_min
                                 for uf, j in zip(ufunc, i.uindices)})
            uindices = [IncrDimension(j.parent, i.dim + as_symbol(k), 1, j.name)
                        for j, k in zip(i.uindices, ufunc)]

            free.append(i._rebuild(limits=(_min, _max, 1), offsets=None,
                                   uindices=uindices))

        # Construct elemental function body
        free = Transformer(dict(zip(target, free)), nested=True).visit(root)
        items = FindSymbols().visit(free)

        # Insert array casts
        casts = [ArrayCast(i) for i in items if i.is_Tensor]
        free = List(body=casts + [free])

        # Insert declarations
        external = [i for i in items if i.is_Array]
        free = iet_insert_C_decls(free, external)

        # Create the Callable
        name = "f_%d" % root.tag
        params = derive_parameters(free)
        efuncs.setdefault(name, Callable(name, free, 'void', params, 'static'))

        # Create the Call
        args = [defined_args.get(i.name, i) for i in params]
        mapper[root] = List(header=noinline, body=Call(name, args))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'efuncs': efuncs.values()}

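# A minimal sketch (not Devito code; names are hypothetical) of the outlining
# performed by `_create_efuncs`, reduced to plain Python: a tagged loop nest is
# lifted into a standalone "elemental function" whose bounds become parameters
# (the `<dim>f_m`/`<dim>f_M` Scalars above), and the original site becomes a
# call passing the concrete bounds collected in `defined_args`.
def _f_0_sketch(u, xf_m, xf_M):
    """Elemental function: the loop body with free, parameterised bounds."""
    for x in range(xf_m, xf_M + 1):
        u[x] = u[x] + 1.0
    return u

def _kernel_sketch(u):
    """Call site: the inlined loop nest replaced by a call with defined args."""
    return _f_0_sketch(u, 1, len(u) - 2)

assert _kernel_sketch([0.0] * 5) == [0.0, 1.0, 1.0, 1.0, 0.0]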