Example #1
def iet_lower_dimensions(iet):
    """
    Replace all DerivedDimensions within the ``iet``'s expressions with
    lower-level symbolic objects (other Dimensions or Symbols).

        * Array indices involving SteppingDimensions are turned into ModuloDimensions.
          Example: ``u[t+1, x] = u[t, x] + 1 >>> u[t1, x] = u[t0, x] + 1``
        * Array indices involving ConditionalDimensions are turned into
          integer-division expressions.
          Example: ``u[t_sub, x] = u[time, x] >>> u[time / 4, x] = u[time, x]``
    """
    # Lower SteppingDimensions
    for i in FindNodes(Iteration).visit(iet):
        if not i.uindices:
            # Be quick: avoid uselessly reconstructing nodes
            continue
        # In an expression, there could be `u[t+1, ...]` and `v[t+1, ...]`, where
        # `u` and `v` are TimeFunctions with circular time buffers (save=None) *but*
        # different modulo extent. The `t+1` indices above are therefore conceptually
        # different, so they will be replaced with the proper ModuloDimension through
        # two different calls to `xreplace`
        groups = as_mapper(i.uindices, lambda d: d.modulo)
        for k, v in groups.items():
            mapper = {d.origin: d for d in v}
            rule = lambda i: i.function.is_TimeFunction and i.function._time_size == k
            replacer = lambda i: xreplace_indices(i, mapper, rule)
            iet = XSubs(replacer=replacer).visit(iet)

    # Lower ConditionalDimensions
    cdims = [d for d in FindSymbols('free-symbols').visit(iet)
             if isinstance(d, ConditionalDimension)]
    mapper = {d: IntDiv(d.index, d.factor) for d in cdims}
    iet = XSubs(mapper).visit(iet)

    return iet
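
Throughout these examples, ``xreplace_indices`` substitutes a mapper only inside the indices of array accesses, optionally filtered by a rule. Below is a minimal, self-contained sketch of that behaviour in plain SymPy; the helper name and its exact signature are illustrative assumptions, not Devito's implementation.

from sympy import Indexed, IndexedBase, symbols

def xreplace_indices_sketch(expr, mapper, rule=None):
    # Apply `mapper` inside the indices of every Indexed in `expr` that
    # satisfies `rule`; the rest of the expression is left untouched.
    subs = {}
    for i in expr.atoms(Indexed):
        if rule is None or rule(i):
            subs[i] = i.func(i.base, *[a.xreplace(mapper) for a in i.indices])
    return expr.xreplace(subs)

t, t0, t1, x = symbols('t t0 t1 x')
u = IndexedBase('u')
eq = u[t + 1, x] - u[t, x]
print(xreplace_indices_sketch(eq, {t + 1: t1, t: t0}))  # -> u[t1, x] - u[t0, x]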
Example #2
    def callback(self, clusters, prefix):
        if not prefix:
            return clusters

        d = prefix[-1].dim

        processed = []
        for c in clusters:
            if SKEWABLE not in c.properties[d]:
                return clusters

            skew_dims = {
                i.dim
                for i in c.ispace if SEQUENTIAL in c.properties[i.dim]
            }
            if len(skew_dims) > 1:
                return clusters
            skew_dim = skew_dims.pop()

            # Since we are here, prefix is skewable and nested under a SEQUENTIAL loop
            intervals = []
            for i in c.ispace:
                if i.dim is d and (not d.is_Block or d._depth == 1):
                    intervals.append(Interval(d, skew_dim, skew_dim))
                else:
                    intervals.append(i)
            intervals = IntervalGroup(intervals, relations=c.ispace.relations)
            ispace = IterationSpace(intervals, c.ispace.sub_iterators,
                                    c.ispace.directions)

            exprs = xreplace_indices(c.exprs, {d: d - skew_dim})
            processed.append(c.rebuild(exprs=exprs, ispace=ispace))

        return processed
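
The substitution ``{d: d - skew_dim}``, combined with the Interval shifted by ``(skew_dim, skew_dim)``, is classic loop skewing. A hedged sketch of why the rewrite is sound, in plain Python with toy bounds (no Devito objects):

# Skew the inner loop of a time loop: iterate xs in [t, t + N) and access
# x = xs - t. The set of (t, x) points executed is unchanged.
N, T = 8, 3
original = [(t, x) for t in range(T) for x in range(N)]
skewed = [(t, xs - t) for t in range(T) for xs in range(t, t + N)]
assert sorted(original) == sorted(skewed)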
Example #3
def _lower_stepping_dims(iet):
    """
    Lower SteppingDimensions: index functions involving SteppingDimensions are
    turned into ModuloDimensions.

    Examples
    --------
    u[t+1, x] = u[t, x] + 1

    becomes

    u[t1, x] = u[t0, x] + 1
    """
    for i in FindNodes(Iteration).visit(iet):
        if not i.uindices:
            # Be quick: avoid uselessly reconstructing nodes
            continue
        # In an expression, there could be `u[t+1, ...]` and `v[t+1, ...]`, where
        # `u` and `v` are TimeFunctions with circular time buffers (save=None) *but*
        # different modulo extent. The `t+1` indices above are therefore conceptually
        # different, so they will be replaced with the proper ModuloDimension through
        # two different calls to `xreplace`
        mindices = [d for d in i.uindices if d.is_Modulo]
        groups = as_mapper(mindices, lambda d: d.modulo)
        for k, v in groups.items():
            mapper = {d.origin: d for d in v}
            rule = lambda i: i.function.is_TimeFunction and i.function._time_size == k
            replacer = lambda i: xreplace_indices(i, mapper, rule)
            iet = XSubs(replacer=replacer).visit(iet)

    return iet
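
At runtime, the lowered ModuloDimensions amount to circular-buffer indexing. A minimal NumPy sketch of the docstring's example; the buffer depth of 2 mirrors a ``_time_size`` of 2 and is an assumption of this toy:

import numpy as np

nx, T = 5, 4
u = np.zeros((2, nx))                # circular time buffer of depth 2
for t in range(T):
    t0, t1 = t % 2, (t + 1) % 2      # lowered counterparts of t and t+1
    u[t1, :] = u[t0, :] + 1          # u[t+1, x] = u[t, x] + 1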
Example #4
def scalarize(clusters, template):
    """
    Turn local "isolated" Arrays, that is Arrays appearing only in one Cluster,
    into Scalars.
    """
    processed = []
    for c in clusters:
        # Get any Arrays appearing only in `c`
        impacted = set(clusters) - {c}
        arrays = {i for i in c.scope.writes if i.is_Array}
        arrays -= set().union(*[i.scope.reads for i in impacted])

        # Turn them into scalars
        #
        # r[x,y,z] = g(b[x,y,z])                 t0 = g(b[x,y,z])
        # ... = r[x,y,z] + r[x,y,z+1]   ---->    t1 = g(b[x,y,z+1])
        #                                        ... = t0 + t1
        mapper = {}
        exprs = []
        for n, e in enumerate(c.exprs):
            f = e.lhs.function
            if f in arrays:
                indexeds = [i.indexed for i in c.scope[f] if i.timestamp > n]
                for i in filter_ordered(indexeds):
                    mapper[i] = Scalar(name=template(), dtype=f.dtype)

                    assert len(f.indices) == len(e.lhs.indices) == len(
                        i.indices)
                    shifting = {
                        idx: idx + (o2 - o1)
                        for idx, o1, o2 in zip(f.indices, e.lhs.indices,
                                               i.indices)
                    }

                    handle = e.func(mapper[i], e.rhs.xreplace(mapper))
                    handle = xreplace_indices(handle, shifting)
                    exprs.append(handle)
            else:
                exprs.append(e.func(e.lhs, e.rhs.xreplace(mapper)))

        processed.append(c.rebuild(exprs))

    return processed
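
The ``shifting`` mapper is the crux of index bumping: for each read of the Array at an offset access point, it re-derives the defining RHS with every index shifted by the write-to-read offset. A hedged SymPy sketch (the indices here are bare symbols, so a plain ``xreplace`` stands in for ``xreplace_indices``):

from sympy import IndexedBase, symbols

x, y, z = symbols('x y z')
r, b = IndexedBase('r'), IndexedBase('b')

lhs, rhs = r[x, y, z], 2*b[x, y, z]    # the defining expression of r
read = r[x, y, z + 1]                  # a read of r at a bumped index

shifting = {i: i + (o2 - o1)
            for i, o1, o2 in zip((x, y, z), lhs.indices, read.indices)}
print(rhs.xreplace(shifting))          # -> 2*b[x, y, z + 1]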
Example #5
    def callback(self, clusters, prefix):
        if not prefix:
            return clusters

        d = prefix[-1].dim

        processed = []
        for c in clusters:
            if SKEWABLE not in c.properties[d]:
                return clusters

            if d is c.ispace[-1].dim and not self.skewinner:
                return clusters

            skew_dims = {i.dim for i in c.ispace if SEQUENTIAL in c.properties[i.dim]}
            if len(skew_dims) > 1:
                return clusters
            skew_dim = skew_dims.pop()

            # The level of a given Dimension in the hierarchy of block Dimensions, used
            # to skew over the outer level of loops.
            level = lambda dim: len([i for i in dim._defines if i.is_Incr])

            # Since we are here, prefix is skewable and nested under a
            # SEQUENTIAL loop.
            intervals = []
            for i in c.ispace:
                if i.dim is d and level(d) <= 1:  # Skew only at level 0 or 1
                    intervals.append(Interval(d, skew_dim, skew_dim))
                else:
                    intervals.append(i)
            intervals = IntervalGroup(intervals, relations=c.ispace.relations)
            ispace = IterationSpace(intervals, c.ispace.sub_iterators,
                                    c.ispace.directions)

            exprs = xreplace_indices(c.exprs, {d: d - skew_dim})
            processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                       properties=c.properties))

        return processed
Example #6
def optimize(clusters):
    """
    Attempt scalar promotion. Candidates are tensors that do not appear in
    any other clusters.
    """
    clusters = merge(clusters)

    processed = []
    for c1 in clusters:
        mapper = {}
        temporaries = []
        for k, v in c1.trace.items():
            if v.function.is_Array and\
                    not any(v.function in c2.unknown for c2 in clusters):
                for i in c1.tensors[v.function]:
                    # LHS scalarization
                    scalarized = Scalar(name='s%d' % len(mapper)).indexify()
                    mapper[i] = scalarized

                    # May have to "unroll" some tensor expressions for scalarization;
                    # e.g., if we have two occurrences of r0, say r0[x,y,z] and
                    # r0[x+1,y,z], and r0 is to be scalarized, this will require a
                    # different scalar for each unique set of indices.
                    assert len(v.function.indices) == len(k.indices) == len(i.indices)
                    shifting = {idx: idx + (o2 - o1) for idx, o1, o2 in
                                zip(v.function.indices, k.indices, i.indices)}

                    # Transform /v/, introducing (i) a scalarized LHS and (ii) shifted
                    # indices if necessary
                    handle = v.func(scalarized, v.rhs.xreplace(mapper))
                    handle = xreplace_indices(handle, shifting)
                    temporaries.append(handle)
            else:
                temporaries.append(v.func(k, v.rhs.xreplace(mapper)))
        processed.append(c1.rebuild(temporaries))

    return processed
Example #7
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only if:
        # test0 := they compute temporary arrays, but not if they compute input data
        # test1 := the outer Iterations have actually been blocked
        exprs = FindNodes(Expression).visit(tree)
        writes = [j.write for j in exprs if j.is_tensor]
        test0 = not all(j.is_Array for j in writes)
        test1 = any(not isinstance(j.limits[0], BlockDimension) for j in root)
        if test0 or test1:
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        modified_tree = []
        modified_root = []
        modified_dims = {}
        mapper = {}
        for t, r in zip(tree, root):
            udim0 = IncrDimension(t.dim, t.symbolic_min, 1, "%ss%d" % (t.index, i))
            modified_tree.append(t._rebuild(limits=(0, t.limits[1] - t.limits[0], t.step),
                                            uindices=t.uindices + (udim0,)))

            mapper[t.dim] = udim0

            udim1 = IncrDimension(t.dim, 0, 1, "%ss%d" % (t.index, i))
            modified_root.append(r._rebuild(uindices=r.uindices + (udim1,)))

            d = r.limits[0]
            assert isinstance(d, BlockDimension)
            modified_dims[d.root] = d

        # Temporary arrays can now be moved onto the stack
        for w in writes:
            dims = tuple(modified_dims.get(d, d) for d in w.dimensions)
            shape = tuple(d.symbolic_size for d in dims)
            w.update(shape=shape, dimensions=dims, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    lambda i: i.function not in writes, True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
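
A hedged numeric sketch of the footprint reduction described in the docstring: once the iteration space is blocked, the temporary needs only one block's worth of storage plus the halo, and the folded tree refills it per block. Bounds and names are toy assumptions, not Devito output.

import numpy as np

N, bs = 16, 4
a = np.arange(N + 1, dtype=float)
out = np.empty(N)
tmp = np.empty(bs + 1)                   # shrunk from N+1 to bs+1
for ib in range(0, N, bs):               # blocked outer loop
    for i in range(bs + 1):              # "folded tree": fill block-local tmp
        tmp[i] = a[ib + i] * 2.0         # i' = i + ib
    for i in range(bs):                  # "root": consume tmp via local indices
        out[ib + i] = tmp[i] + tmp[i + 1]
assert np.allclose(out, 2.0*a[:-1] + 2.0*a[1:])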
Example #8
    def visit_Expression(self, o):
        return o._rebuild(expr=xreplace_indices(o.expr, self.subs, self.rule))
Example #9
def bump_and_contract(targets, source, sink):
    """
    Transform the PartialClusters ``source`` and ``sink`` in place by turning
    the :class:`Array`s in ``targets`` into :class:`Scalar`s. This is
    implemented through index bumping and array contraction.

    :param targets: The :class:`Array` objects that will be contracted.
    :param source: The source :class:`PartialCluster`.
    :param sink: The sink :class:`PartialCluster`.

    Examples
    ========
    Index bumping
    -------------
    Given: ::

        r[x,y,z] = b[x,y,z]*2

    Produce: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Array contraction
    -----------------
    Given: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Produce: ::

        tmp0 = b[x,y,z]*2
        tmp1 = b[x,y,z+1]*2

    Full example (bump+contraction)
    -------------------------------
    Given: ::

        source: [r[x,y,z] = b[x,y,z]*2]
        sink: [a = ... r[x,y,z] ... r[x,y,z+1] ...]
        targets: r

    Produce: ::

        source: [tmp0 = b[x,y,z]*2, tmp1 = b[x,y,z+1]*2]
        sink: [a = ... tmp0 ... tmp1 ...]
    """
    if not targets:
        return

    mapper = {}

    # source
    processed = []
    for k, v in source.trace.items():
        if any(v.function not in i for i in [targets, sink.tensors]):
            processed.append(v.func(k, v.rhs.xreplace(mapper)))
        else:
            for i in sink.tensors[v.function]:
                scalarized = Scalar(name='s%d' % len(mapper)).indexify()
                mapper[i] = scalarized

                # Index bumping
                assert len(v.function.indices) == len(k.indices) == len(
                    i.indices)
                shifting = {
                    idx: idx + (o2 - o1)
                    for idx, o1, o2 in zip(v.function.indices, k.indices,
                                           i.indices)
                }

                # Array contraction
                handle = v.func(scalarized, v.rhs.xreplace(mapper))
                handle = xreplace_indices(handle, shifting)
                processed.append(handle)
    source.exprs = processed

    # sink
    processed = [
        v.func(k, v.rhs.xreplace(mapper)) for k, v in sink.trace.items()
    ]
    sink.exprs = processed
Example #10
def bump_and_contract(targets, source, sink):
    """
    Transform the PartialClusters ``source`` and ``sink`` in place by turning
    the Arrays in ``targets`` into Scalars. This is implemented through index
    bumping and array contraction.

    Parameters
    ----------
    targets : list of Array
        The Arrays that will be contracted.
    source : PartialCluster
        The PartialCluster in which the Arrays are initialized.
    sink : PartialCluster
        The PartialCluster that consumes (i.e., reads) the Arrays.

    Examples
    --------
    1) Index bumping
    Given: ::

        r[x,y,z] = b[x,y,z]*2

    Produce: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    2) Array contraction
    Given: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Produce: ::

        tmp0 = b[x,y,z]*2
        tmp1 = b[x,y,z+1]*2

    3) Full example (bump+contraction)
    Given: ::

        source: [r[x,y,z] = b[x,y,z]*2]
        sink: [a = ... r[x,y,z] ... r[x,y,z+1] ...]
        targets: r

    Produce: ::

        source: [tmp0 = b[x,y,z]*2, tmp1 = b[x,y,z+1]*2]
        sink: [a = ... tmp0 ... tmp1 ...]
    """
    if not targets:
        return
    mapper = {}

    # Source
    processed = []
    for e in source.exprs:
        function = e.lhs.function
        if any(function not in i for i in [targets, sink.tensors]):
            processed.append(e.func(e.lhs, e.rhs.xreplace(mapper)))
        else:
            for i in sink.tensors[function]:
                scalar = Scalar(name='s%s%d' %
                                (i.function.name, len(mapper))).indexify()
                mapper[i] = scalar

                # Index bumping
                assert len(function.indices) == len(e.lhs.indices) == len(
                    i.indices)
                shifting = {
                    idx: idx + (o2 - o1)
                    for idx, o1, o2 in zip(function.indices, e.lhs.indices,
                                           i.indices)
                }

                # Array contraction
                handle = e.func(scalar, e.rhs.xreplace(mapper))
                handle = xreplace_indices(handle, shifting)
                processed.append(handle)
    source.exprs = processed

    # Sink
    processed = [e.func(e.lhs, e.rhs.xreplace(mapper)) for e in sink.exprs]
    sink.exprs = processed
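
On the sink side, once ``mapper`` relates each read of ``r`` to its scalar, contraction reduces to a plain substitution on the RHS. A minimal SymPy sketch of the docstring's full example; all names are illustrative:

from sympy import Eq, IndexedBase, symbols

x, y, z, a, tmp0, tmp1 = symbols('x y z a tmp0 tmp1')
r = IndexedBase('r')

mapper = {r[x, y, z]: tmp0, r[x, y, z + 1]: tmp1}
sink = Eq(a, r[x, y, z] + r[x, y, z + 1])
print(Eq(sink.lhs, sink.rhs.xreplace(mapper)))   # -> Eq(a, tmp0 + tmp1)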
Example #11
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    ========
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)
        modified_tree = []
        modified_root = []
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            index = Symbol('%ss%d' % (t1.index, i))
            mapper[t1.dim] = index

            t1_uindex = (UnboundedIndex(index, t1.limits[0]), )
            t2_uindex = (UnboundedIndex(index, -t1.limits[0]), )

            limits = (0, t1.limits[1] - t1.limits[0], t1.incr_symbolic)
            modified_tree.append(
                t1._rebuild(limits=limits, uindices=t1.uindices + t1_uindex))

            modified_root.append(t2._rebuild(uindices=t2.uindices + t2_uindex))

        # Temporary arrays can now be moved onto the stack
        exprs = FindNodes(Expression).visit(modified_tree[-1])
        if all(not j.is_Remainder for j in modified_tree):
            dimensions = tuple(j.limits[0] for j in modified_root)
            for j in exprs:
                if j.write.is_Array:
                    j_dimensions = dimensions + j.write.dimensions[
                        len(modified_root):]
                    j_shape = tuple(k.symbolic_size for k in j_dimensions)
                    j.write.update(shape=j_shape,
                                   dimensions=j_dimensions,
                                   onstack=True)

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs],
                                    mapper,
                                    only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(
            Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
Example #12
def _bump_and_scalarize(arrays, cluster, template):
    """
    Scalarize local Arrays.

    Parameters
    ----------
    arrays : list of Array
        The Arrays that will be scalarized.
    cluster : Cluster
        The Cluster where the local Arrays are used.

    Examples
    --------
    This transformation consists of two steps, "index bumping" and the
    actual "scalarization".

    Index bumping. Given:

        r[x,y,z] = b[x,y,z]*2

    Produce:

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Scalarization. Given:

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Produce:

        t0 = b[x,y,z]*2
        t1 = b[x,y,z+1]*2

    An Array being scalarized could be Indexed multiple times. Before proceeding
    with scalarization, therefore, we perform index bumping. For example, given:

        r0[x,y,z] = b[x,y,z]*2
        r1[x,y,z] = ... r0[x,y,z] ... r0[x,y,z-1] ...

    This function will produce:

        t0 = b[x,y,z]*2
        t1 = b[x,y,z-1]*2
        r1[x,y,z] = ... t0 ... t1 ...
    """
    if not arrays:
        return cluster

    mapper = {}
    processed = []
    for e in cluster.exprs:
        f = e.lhs.function
        if f in arrays:
            indexeds = filter_ordered(i.indexed for i in cluster.scope[f])
            for i in indexeds:
                mapper[i] = Scalar(name=template(), dtype=f.dtype)

                # Index bumping
                assert len(f.indices) == len(e.lhs.indices) == len(i.indices)
                shifting = {
                    idx: idx + (o2 - o1)
                    for idx, o1, o2 in zip(f.indices, e.lhs.indices, i.indices)
                }

                # Scalarization
                handle = e.func(mapper[i], e.rhs.xreplace(mapper))
                handle = xreplace_indices(handle, shifting)
                processed.append(handle)
        else:
            processed.append(e.func(e.lhs, e.rhs.xreplace(mapper)))

    return cluster.rebuild(processed)
Example #13
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only if they compute temporary
        # arrays, but not if they compute input data
        exprs = FindNodes(Expression).visit(tree[-1])
        writes = [j.write for j in exprs if j.is_tensor]
        if not all(j.is_Array for j in writes):
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        modified_tree = []
        modified_root = []
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            t1_udim = IncrDimension(t1.dim, t1.symbolic_min, 1, "%ss%d" % (t1.index, i))
            limits = (0, t1.limits[1] - t1.limits[0], t1.step)
            modified_tree.append(t1._rebuild(limits=limits,
                                             uindices=t1.uindices + (t1_udim,)))

            t2_udim = IncrDimension(t1.dim, 0, 1, "%ss%d" % (t1.index, i))
            modified_root.append(t2._rebuild(uindices=t2.uindices + (t2_udim,)))

            mapper[t1.dim] = t1_udim

        # Temporary arrays can now be moved onto the stack
        dimensions = tuple(j.limits[0] for j in modified_root)
        for j in writes:
            if j.is_Array:
                j_dimensions = dimensions + j.dimensions[len(modified_root):]
                j_shape = tuple(k.symbolic_size for k in j_dimensions)
                j.update(shape=j_shape, dimensions=j_dimensions, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper, only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]