Example #1
    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)

        # Avoid import loop
        from dace.transformation.dataflow.local_storage import LocalStorage

        local_storage_subgraph = {
            LocalStorage._node_a:
            self.subgraph[AccumulateTransient._map_exit],
            LocalStorage._node_b:
            self.subgraph[AccumulateTransient._outer_map_exit]
        }
        sdfg_id = sdfg.sdfg_list.index(sdfg)
        in_local_storage = LocalStorage(
            sdfg_id, self.state_id, local_storage_subgraph, self.expr_index)
        in_local_storage.array = self.array
        in_local_storage.apply(sdfg)

        # Initialize transient to zero in case of summation
        # TODO: Initialize transient in other WCR types
        memlet = graph.in_edges(in_local_storage._data_node)[0].data
        if detect_reduction_type(memlet.wcr) == dtypes.ReductionType.Sum:
            in_local_storage._data_node.setzero = True
        else:
            warnings.warn('AccumulateTransient did not properly initialize '
                          'newly-created transient!')
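
Usage note: a minimal sketch of driving this transformation through the standard DaCe API. The rowsum program is illustrative and an assumption; whether AccumulateTransient actually matches depends on the dataflow DaCe generates for it.

    import dace
    import numpy as np
    from dace.transformation.dataflow import AccumulateTransient

    @dace.program
    def rowsum(A: dace.float64[64, 64], out: dace.float64[64]):
        out[:] = np.sum(A, axis=1)

    sdfg = rowsum.to_sdfg()
    # apply_transformations returns how many times the pattern matched
    # and was applied; 0 means the pattern was not found.
    applied = sdfg.apply_transformations(AccumulateTransient)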
Example #2
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        """ Create a map around the BlockReduce node
            with in and out transients in registers
            and an if tasklet that redirects the output
            of thread 0 to a shared memory transient
        """
        ### define some useful vars
        graph = state
        reduce_node = node
        in_edge = graph.in_edges(reduce_node)[0]
        out_edge = graph.out_edges(reduce_node)[0]

        axes = reduce_node.axes
        ### add a map that encloses the reduce node
        (new_entry, new_exit) = graph.add_map(
                      name = 'inner_reduce_block',
                      ndrange = {'i'+str(i): f'{rng[0]}:{rng[1]+1}:{rng[2]}'  \
                                for (i,rng) in enumerate(in_edge.data.subset) \
                                if i in axes},
                      schedule = dtypes.ScheduleType.Default)

        ExpandReduceCUDABlockAll.redirect_edge(graph,
                                               in_edge,
                                               new_dst=new_entry)
        ExpandReduceCUDABlockAll.redirect_edge(graph,
                                               out_edge,
                                               new_src=new_exit)

        subset_in = subsets.Range([
            in_edge.data.subset[i] if i not in axes else
            (new_entry.map.params[0], new_entry.map.params[0], 1)
            for i in range(len(in_edge.data.subset))
        ])
        memlet_in = dace.Memlet(data=in_edge.data.data,
                                volume=1,
                                subset=subset_in)
        memlet_out = dcpy(out_edge.data)
        graph.add_edge(u=new_entry,
                       u_connector=None,
                       v=reduce_node,
                       v_connector=None,
                       memlet=memlet_in)
        graph.add_edge(u=reduce_node,
                       u_connector=None,
                       v=new_exit,
                       v_connector=None,
                       memlet=memlet_out)

        ### add in and out local storage
        from dace.transformation.dataflow.local_storage import LocalStorage

        in_local_storage_subgraph = {
            LocalStorage._node_a: graph.nodes().index(new_entry),
            LocalStorage._node_b: graph.nodes().index(reduce_node)
        }
        out_local_storage_subgraph = {
            LocalStorage._node_a: graph.nodes().index(reduce_node),
            LocalStorage._node_b: graph.nodes().index(new_exit)
        }

        local_storage = LocalStorage(sdfg.sdfg_id,
                                     sdfg.nodes().index(state),
                                     in_local_storage_subgraph, 0)

        local_storage.array = in_edge.data.data
        local_storage.apply(sdfg)
        in_transient = local_storage._data_node
        sdfg.data(in_transient.data).storage = dtypes.StorageType.Register

        local_storage = LocalStorage(sdfg.sdfg_id,
                                     sdfg.nodes().index(state),
                                     out_local_storage_subgraph, 0)
        local_storage.array = out_edge.data.data
        local_storage.apply(sdfg)
        out_transient = local_storage._data_node
        sdfg.data(out_transient.data).storage = dtypes.StorageType.Register

        # hack: swap edges as local_storage does not work correctly here
        # as subsets and data get assigned wrongly (should be swapped)
        # NOTE: If local_storage ever changes, this will not work any more
        e1 = graph.in_edges(out_transient)[0]
        e2 = graph.out_edges(out_transient)[0]
        e1.data.data = dcpy(e2.data.data)
        e1.data.subset = dcpy(e2.data.subset)

        ### add an if tasklet and diverge
        code = 'if '
        for (i, param) in enumerate(new_entry.map.params):
            code += (param + ' == 0')
            if i < len(axes) - 1:
                code += ' and '
        code += ':\n'
        code += '\tout=inp'

        tasklet_node = graph.add_tasklet(name='block_reduce_write',
                                         inputs=['inp'],
                                         outputs=['out'],
                                         code=code)

        edge_out_outtrans = graph.out_edges(out_transient)[0]
        edge_out_innerexit = graph.out_edges(new_exit)[0]
        ExpandReduceCUDABlockAll.redirect_edge(graph,
                                               edge_out_outtrans,
                                               new_dst=tasklet_node,
                                               new_dst_conn='inp')
        e = graph.add_edge(u=tasklet_node,
                           u_connector='out',
                           v=new_exit,
                           v_connector=None,
                           memlet=dcpy(edge_out_innerexit.data))
        # set dynamic with volume 0 FORNOW
        e.data.volume = 0
        e.data.dynamic = True

        ### set reduce_node axes to all (needed)
        reduce_node.axes = None

        # fill scope connectors, done.
        sdfg.fill_scope_connectors()

        # finally, change the implementation to cuda (block)
        # itself and expand again.
        reduce_node.implementation = 'CUDA (block)'
        sub_expansion = ExpandReduceCUDABlock(0, 0, {}, 0)
        return sub_expansion.expansion(node=node, state=state, sdfg=sdfg)
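
Usage note: this expansion targets the Reduce library node; below is a hedged sketch of selecting it. The implementation-name string 'CUDA (block allreduce)' is an assumption taken from the class name, and the Reduce import path may differ by DaCe version.

    import dace
    from dace.libraries.standard.nodes import Reduce  # assumed path

    sdfg = rowsum.to_sdfg()  # reusing the illustrative program above
    for state in sdfg.nodes():
        for node in state.nodes():
            if isinstance(node, Reduce):
                node.implementation = 'CUDA (block allreduce)'  # assumed name
    # Expands every library node using its selected implementation.
    sdfg.expand_library_nodes()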
Example #3
    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        out_storage_node = graph.out_edges(reduce_node)[0].dst
        in_storage_node = graph.in_edges(reduce_node)[0].src
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        schedule = reduce_node.schedule
        implementation = reduce_node.implementation
        if implementation and 'warp' in implementation:
            raise NotImplementedError(
                "WIP: warp reductions are not implemented yet.")

        # remove the reduce identity
        # we will reassign it later after expanding
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which
        # we perform the reduction:
        # do BFS, best complexity O(V+E)

        queue = [nsdfg]
        array_closest_ancestor = None
        while len(queue) > 0:
            current = queue.pop(0)
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            queue.extend([in_edge.src for in_edge in graph.in_edges(current)])

        # if ancestor doesn't exist:
        #           if non-transient: create data node accessing it
        #           if transient: ancestor_node = none, set_zero on outer node

        shortcut = False
        if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
                                        or identity is not None:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map")
            # we are lucky
            shortcut = True
            nstate.out_edges(outer_exit)[0].data.wcr = None

        else:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map "
                      "and introducing update Tasklet, "
                      "connecting with ancestor.")
            if not array_closest_ancestor:
                array_closest_ancestor = nodes.AccessNode(
                    out_storage_node.data, access=dtypes.AccessType.ReadOnly)
                graph.add_node(array_closest_ancestor)
                # array_closest_ancestor now points to the node we want to connect
                # to the map entry

            # always have to create out transient in this case
            self.create_out_transient = True

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register
            if shortcut:
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # first, inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)
        if not shortcut:
            reduction_type = detect_reduction_type(wcr)
            try:
                code = ReduceExpansion.reduction_type_update[reduction_type]
            except KeyError:
                raise NotImplementedError(
                    "Not yet implemented for custom reduction")

            new_tasklet = graph.add_tasklet(
                name="reduction_transient_update",
                inputs={"reduction_in", "array_in"},
                outputs={"out"},
                code=code)

            edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
                             if self.create_out_transient \
                             else graph.out_edges(inner_exit)[0]

            new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                            volume=1,
                                            subset=edge_to_remove.data.subset)
            new_memlet_array_outer = Memlet(
                data=array_closest_ancestor.data,
                volume=graph.in_edges(outer_entry)[0].data.volume,
                subset=subsets.Range.from_array(
                    sdfg.data(out_storage_node.data)))

            new_memlet_reduction = Memlet(
                data=graph.out_edges(inner_exit)[0].data.data,
                volume=1,
                subset=graph.out_edges(inner_exit)[0].data.subset)
            new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                          volume=1,
                                          subset=edge_to_remove.data.subset)
            new_memlet_out_outer = dcpy(new_memlet_array_outer)

            # remove old edges

            outer_edge_to_remove = None
            for edge in graph.out_edges(outer_exit):
                if edge.src == edge_to_remove.dst:
                    outer_edge_to_remove = edge

            graph.remove_edge_and_connectors(edge_to_remove)
            graph.remove_edge_and_connectors(outer_edge_to_remove)


            graph.add_edge(out_transient_node_inner if self.create_out_transient \
                                                    else inner_exit,
                           None,
                           new_tasklet,
                           "reduction_in",
                           new_memlet_reduction)

            graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                           new_memlet_array_inner)
            graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                           new_memlet_array_outer)
            graph.add_edge(new_tasklet, "out", outer_exit, None,
                           new_memlet_out_inner)
            graph.add_edge(outer_exit, None, out_storage_node, None,
                           new_memlet_out_outer)

            # fill map scope connectors
            graph.fill_scope_connectors()
            graph._clear_scopedict_cache()
            # wcr is already removed

        # FORNOW: choose default schedule and implementation
        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)
        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._new_reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deduced from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return
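
Usage note: a hedged sketch of invoking this expand method, assuming ReduceExpansion is registered as a pattern transformation (in recent DaCe versions it lives under dace.transformation.subgraph; the import path is an assumption).

    import dace
    from dace.transformation.subgraph import ReduceExpansion  # assumed path

    sdfg = rowsum.to_sdfg()  # reusing the illustrative program above
    # Pattern-matches Reduce nodes and calls expand() on each match.
    sdfg.apply_transformations(ReduceExpansion)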
Example #4
    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        # get out storage node, might be hidden behind view node
        out_data = graph.out_edges(reduce_node)[0].data
        out_storage_node = reduce_node
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

        if isinstance(sdfg.data(out_storage_node.data), View):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst
            while not isinstance(out_storage_node, nodes.AccessNode):
                out_storage_node = graph.out_edges(out_storage_node)[0].dst

        # get other useful quantities from the original reduce node
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        implementation = reduce_node.implementation

        # remove the reduce identity, will get reassigned after expansion
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which the reduction is performed: BFS

        if self.create_out_transient:
            queue = [nsdfg]
            enqueued = set()
            array_closest_ancestor = None

            while len(queue) > 0:
                current = queue.pop(0)  # pop from the front for true BFS order
                if isinstance(current, nodes.AccessNode):
                    if current.data == out_storage_node.data:
                        # it suffices to find the first node
                        # no matter what access (ReadWrite or Read)
                        array_closest_ancestor = current
                        break
                for in_edge in graph.in_edges(current):
                    if in_edge.src not in enqueued:
                        queue.append(in_edge.src)
                        enqueued.add(in_edge.src)

            if self.debug and array_closest_ancestor:
                print(
                    f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
                )
            elif self.debug:
                print("ReduceExpansion::No closest ancestor found")

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

            # remove WCRs from all edges where possible if there is no
            # prior occurrence
            if array_closest_ancestor is None:
                nstate.out_edges(outer_exit)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
        else:

            # remove WCR from outer exit
            nstate.out_edges(outer_exit)[0].data.wcr = None

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)

        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        # replace inner map with new reduction node
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)

        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            if self.debug:
                print(
                    "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
                )
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deduced from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return
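
Both revisions consult a reduction_type_identity table before falling back to dtypes.max_value / dtypes.min_value. A minimal sketch of what such a mapping could contain follows; these entries are assumptions for illustration, not the class's actual table.

    import dace.dtypes as dtypes

    # Hypothetical identity table keyed by detected reduction type.
    reduction_type_identity = {
        dtypes.ReductionType.Sum: 0,      # a + 0 == a
        dtypes.ReductionType.Product: 1,  # a * 1 == a
        # Min/Max identities depend on the dtype and are derived with
        # dtypes.max_value / dtypes.min_value in the except branch above.
    }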
Example #5
    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]

        map_entry = graph.nodes()[self.subgraph[MPITransformMap._map_entry]]

        # Avoiding import loops
        from dace.transformation.dataflow.strip_mining import StripMining
        from dace.transformation.dataflow.local_storage import LocalStorage

        rangeexpr = str(map_entry.map.range.num_elements())

        stripmine_subgraph = {
            StripMining._map_entry: self.subgraph[MPITransformMap._map_entry]
        }
        sdfg_id = sdfg.sdfg_id
        stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph,
                                self.expr_index)
        stripmine.dim_idx = -1
        stripmine.new_dim_prefix = "mpi"
        stripmine.tile_size = "(" + rangeexpr + "/__dace_comm_size)"
        stripmine.divides_evenly = True
        stripmine.apply(sdfg)

        # Find all in-edges that lead to candidate[MPITransformMap._map_entry]
        edges = [
            e for e in graph.in_edges(map_entry)
            if isinstance(e.src, nodes.EntryNode)
        ]

        outer_map = edges[0].src

        # Add MPI schedule attribute to outer map
        outer_map.map._schedule = dtypes.ScheduleType.MPI

        # Now create a transient for each array
        for e in edges:
            in_local_storage_subgraph = {
                LocalStorage.node_a: graph.node_id(outer_map),
                LocalStorage.node_b: self.subgraph[MPITransformMap._map_entry]
            }
            sdfg_id = sdfg.sdfg_id
            in_local_storage = LocalStorage(sdfg_id, self.state_id,
                                            in_local_storage_subgraph,
                                            self.expr_index)
            in_local_storage.array = e.data.data
            in_local_storage.apply(sdfg)

        # Transform OutLocalStorage for each output of the MPI map
        in_map_exit = graph.exit_node(map_entry)
        out_map_exit = graph.exit_node(outer_map)

        for e in graph.out_edges(out_map_exit):
            name = e.data.data
            outlocalstorage_subgraph = {
                LocalStorage.node_a: graph.node_id(in_map_exit),
                LocalStorage.node_b: graph.node_id(out_map_exit)
            }
            sdfg_id = sdfg.sdfg_id
            outlocalstorage = LocalStorage(sdfg_id, self.state_id,
                                           outlocalstorage_subgraph,
                                           self.expr_index)
            outlocalstorage.array = name
            outlocalstorage.apply(sdfg)
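
Usage note: a minimal sketch of applying the MPI transformation. The scale program is illustrative, and running the resulting SDFG additionally requires an MPI-enabled build, which is an assumption here.

    import dace
    from dace.transformation.dataflow import MPITransformMap

    @dace.program
    def scale(A: dace.float64[1024], B: dace.float64[1024]):
        B[:] = A * 2.0

    sdfg = scale.to_sdfg()
    # Strip-mines the map by the communicator size, schedules the new
    # outer map as ScheduleType.MPI, and inserts local-storage transients.
    sdfg.apply_transformations(MPITransformMap)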