Example #1
class GPUTransformLocalStorage(pattern_matching.Transformation):
    """Implements the GPUTransformLocalStorage transformation.

        Similar to GPUTransformMap, but takes into account multiple maps
        leading from the same data node, creating local storage for each range.

        @see: GPUTransformMap
    """

    _arrays_removed = 0
    _maps_transformed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    nested_seq = Property(
        desc="Makes nested code semantically-equivalent to single-core code,"
        "transforming nested maps and memory into sequential and "
        "local memory respectively.",
        dtype=bool,
        default=True,
    )

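    # Pattern placeholder nodes: this transformation matches either a
    # single map entry or a single reduce node (see expressions() below).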
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce("lambda: None", None)

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(GPUTransformLocalStorage._map_entry),
            nxutil.node_path_graph(GPUTransformLocalStorage._reduce),
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            candidate_map = map_entry.map

            # Disallow GPUTransform on nested maps in strict mode
            if strict:
                if graph.scope_dict()[map_entry] is not None:
                    return False

            # Map schedules that are disallowed to transform to GPUs
            if candidate_map.schedule in (dtypes.ScheduleType.MPI,
                                          dtypes.ScheduleType.GPU_Device,
                                          dtypes.ScheduleType.GPU_ThreadBlock,
                                          dtypes.ScheduleType.Sequential):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if current_node.map.schedule in (
                        dtypes.ScheduleType.GPU_Device,
                        dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode)
                        and node.desc(sdfg).storage not in
                        (dtypes.StorageType.Default,
                         dtypes.StorageType.Register)):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_nodes(map_entry)[0]
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformLocalStorage._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if reduce.schedule in (dtypes.ScheduleType.MPI,
                                   dtypes.ScheduleType.GPU_Device,
                                   dtypes.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if current_node.map.schedule in (
                        dtypes.ScheduleType.GPU_Device,
                        dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformLocalStorage._reduce in candidate:
            return str(
                graph.nodes()[candidate[GPUTransformLocalStorage._reduce]])
        else:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            return str(map_entry)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule
        node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._maps_transformed += 1
        # If nested graph is designated as sequential, transform schedules and
        # storage from Default to Sequential/Register
        if self.nested_seq and self.expr_index == 0:
            for node in graph.scope_subgraph(cnode).nodes():
                if isinstance(node, nodes.AccessNode):
                    arr = node.desc(sdfg)
                    if arr.storage == dtypes.StorageType.Default:
                        arr.storage = dtypes.StorageType.Register
                elif isinstance(node, nodes.MapEntry):
                    if node.map.schedule == dtypes.ScheduleType.Default:
                        node.map.schedule = dtypes.ScheduleType.Sequential

        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add((data_node, e.data))
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add((data_node, e.data))

        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._arrays_removed += len(
                in_arrays_to_clone) + len(out_arrays_to_clone)

        # Second, create a GPU clone of each array
        # TODO: Overapproximate union of memlets
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node, memlet in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                except (TypeError, ValueError):
                    continue
            if cloned_name in sdfg.arrays:
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except (TypeError, ValueError):
                        full_shape.append(size)
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        materialize_func=array.materialize_func,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)

            in_cloned_arraynodes[array_node.data] = cloned_node
        for array_node, memlet in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                except (TypeError, ValueError):
                    continue
            if cloned_name in sdfg.arrays:
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except (TypeError, ValueError):
                        full_shape.append(size)
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        materialize_func=array.materialize_func,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)
            cloned_node.setzero = True

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
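                        # Rebase each range to start at zero and record the
                        # per-dimension offset; dimensions that collapse to a
                        # single element ("lost" dims) are dropped from the
                        # clone's view.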
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # Rebase a single index against its offset
                                newsubset[ind] = r - offset[ind]
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   newmemlet)

                    for e in graph.bfs_edges(edge.dst, reverse=False):
                        parent, _, _child, _, memlet = e
                        if parent != edge.dst and not in_scope(
                                graph, parent, edge.dst):
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[-1].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.ExitNode, forward=True):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

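                    # The copy edge keeps the original array's subset and
                    # records the clone-side subset in other_subset, which
                    # together define the host-to-device copy mapping.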
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   edge.data)
                    graph.remove_edge(edge)

        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # Rebase a single index against its offset
                                newsubset[ind] = r - offset[ind]
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   newmemlet)

                    end_node = graph.scope_dict()[edge.src]
                    for e in graph.bfs_edges(edge.src, reverse=True):
                        parent, _, _child, _, memlet = e
                        if parent == end_node:
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[0].src, nodes.CodeNode):
                            if in_path(path, e, nodes.EntryNode,
                                       forward=False):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   edge.data)
                    graph.remove_edge(edge)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if edge.data.data is not None and edge.data.data in cloned_arrays:
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        return True
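
A minimal usage sketch for the transformation above (hypothetical program; sdfg.apply_transformations follows current dace releases and may differ in the dace version this example targets):

import dace

@dace.program
def saxpy(A: dace.float64[1024], B: dace.float64[1024]):
    for i in dace.map[0:1024]:
        B[i] = 2.0 * A[i] + B[i]

sdfg = saxpy.to_sdfg()
# Where the pattern (a top-level map or a reduce node) matches, the map's
# schedule becomes GPU_Device and "gpu_"-prefixed transient clones with
# copy edges are inserted around the scope.
sdfg.apply_transformations(GPUTransformLocalStorage)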
Example #2
def _build_dataflow_graph_recurse(sdfg, state, primitives, modules, superEntry,
                                  super_exit):
    # Array of pairs (exit node, memlet)
    exit_nodes = []
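    # (Propagated to the caller so outer recursion levels can continue
    # routing these memlets onward from this scope's exit node.)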

    if len(primitives) == 0:
        # Inject empty tasklets into empty states
        primitives = [astnodes._EmptyTaskletNode("Empty Tasklet", None)]

    for prim in primitives:
        label = prim.name

        # Expand node to get entry and exit points
        if isinstance(prim, astnodes._MapNode):
            if len(prim.children) == 0:
                raise ValueError("Map node expected to have children")
            mapNode = nd.Map(label,
                             prim.params,
                             prim.range,
                             is_async=prim.is_async)
            # Add connectors for inputs that exist as array nodes
            entry = nd.MapEntry(
                mapNode,
                _get_input_symbols(prim.inputs, prim.range.free_symbols))
            exit = nd.MapExit(mapNode)
        elif isinstance(prim, astnodes._ConsumeNode):
            if len(prim.children) == 0:
                raise ValueError("Consume node expected to have children")
            consumeNode = nd.Consume(label, (prim.params[1], prim.num_pes),
                                     prim.condition)
            entry = nd.ConsumeEntry(consumeNode)
            exit = nd.ConsumeExit(consumeNode)
        elif isinstance(prim, astnodes._ReduceNode):
            rednode = nd.Reduce(prim.ast, prim.axes, prim.identity)
            state.add_node(rednode)
            entry = rednode
            exit = rednode
        elif isinstance(prim, astnodes._TaskletNode):
            if isinstance(prim, astnodes._EmptyTaskletNode):
                tasklet = nd.EmptyTasklet(prim.name)
            else:
                # Remove memlets from tasklet AST
                if prim.language == types.Language.Python:
                    clean_code = MemletRemover().visit(prim.ast)
                    clean_code = ModuleInliner(modules).visit(clean_code)
                else:  # Use external code from tasklet definition
                    if prim.extcode is None:
                        raise SyntaxError("Cannot define an intrinsic "
                                          "tasklet without an implementation")
                    clean_code = prim.extcode
                tasklet = nd.Tasklet(
                    prim.name,
                    set(prim.inputs.keys()),
                    set(prim.outputs.keys()),
                    code=clean_code,
                    language=prim.language,
                    code_global=prim.gcode)  # TODO: location=prim.location

            # Need to add the tasklet in case we're in an empty state, where no
            # edge will be drawn to it
            state.add_node(tasklet)
            entry = tasklet
            exit = tasklet

        elif isinstance(prim, astnodes._NestedSDFGNode):
            prim.sdfg.parent = state
            prim.sdfg._parent_sdfg = sdfg
            prim.sdfg.update_sdfg_list([])
            nsdfg = nd.NestedSDFG(prim.name, prim.sdfg,
                                  set(prim.inputs.keys()),
                                  set(prim.outputs.keys()))
            state.add_node(nsdfg)
            entry = nsdfg
            exit = nsdfg

        elif isinstance(prim, astnodes._ProgramNode):
            return
        elif isinstance(prim, astnodes._ControlFlowNode):
            continue
        else:
            raise TypeError("Node type not implemented: " +
                            str(prim.__class__))

        # Add incoming edges
        for varname, memlet in prim.inputs.items():
            arr = memlet.dataname
            if (prim.parent is not None
                    and memlet.dataname in prim.parent.transients.keys()):
                node = input_node_for_array(state, memlet.dataname)

                # Add incoming edge into transient as well
                # FIXME: A bit hacked?
                if arr in prim.parent.inputs:
                    astmem = prim.parent.inputs[arr]
                    _add_astmemlet_edge(sdfg, state, superEntry, None, node,
                                        None, astmem)

                    # Remove local name from incoming edge to parent
                    prim.parent.inputs[arr].local_name = None
            elif superEntry:
                node = superEntry
            else:
                node = input_node_for_array(state, memlet.dataname)

            # Destination connector inference
            # Connected to a tasklet or a nested SDFG
            dst_conn = (memlet.local_name
                        if isinstance(entry, nd.CodeNode) else None)
            # Connected to a scope as part of its range
            if str(varname).startswith('__DACEIN_'):
                dst_conn = str(varname)[9:]
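                # ('__DACEIN_' is 9 characters long, so e.g. '__DACEIN_N'
                # yields the dynamic-range connector 'N')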
            # Handle special case of consume input stream
            if (isinstance(entry, nd.ConsumeEntry)
                    and memlet.data == prim.stream):
                dst_conn = 'IN_stream'

            # If a memlet that covers this input already exists, skip
            # generating this one; otherwise replace memlet with ours
            skip_incoming_edge = False
            remove_edge = None
            for e in state.edges_between(node, entry):
                if e.data.data != memlet.dataname or dst_conn != e.dst_conn:
                    continue
                if e.data.subset.covers(memlet.subset):
                    skip_incoming_edge = True
                    break
                elif memlet.subset.covers(e.data.subset):
                    remove_edge = e
                    break
                else:
                    print('WARNING: Performing bounding-box union on',
                          memlet.subset, 'and', e.data.subset, '(in)')
                    e.data.subset = sbs.bounding_box_union(
                        e.data.subset, memlet.subset)
                    e.data.num_accesses += memlet.num_accesses
                    skip_incoming_edge = True
                    break

            if remove_edge is not None:
                state.remove_edge(remove_edge)

            if not skip_incoming_edge:
                _add_astmemlet_edge(sdfg, state, node, None, entry, dst_conn,
                                    memlet)

        # If there are no inputs, generate a dummy edge
        if superEntry and len(prim.inputs) == 0:
            state.add_edge(superEntry, None, entry, None, EmptyMemlet())

        if len(prim.children) > 0:
            # Recurse
            inner_outputs = _build_dataflow_graph_recurse(
                sdfg, state, prim.children, modules, entry, exit)
            # Infer output node for each memlet
            for i, (out_src, mem) in enumerate(inner_outputs):
                # If there is no such array in this primitive's outputs,
                # it's an external array (e.g., a map in a map). In this case,
                # connect to the exit node
                if mem.dataname in prim.outputs:
                    inner_outputs[i] = (out_src, prim.outputs[mem.dataname])
                else:
                    inner_outputs[i] = (out_src, mem)
        else:
            inner_outputs = [(exit, mem) for mem in prim.outputs.values()]

        # Add outgoing edges
        for out_src, astmem in inner_outputs:

            data = astmem.data
            dataname = astmem.dataname

            # If WCR is not None, it needs to be handled in the code. Check
            # for this afterwards, as we only expect it for one distinct case
            wcr_was_handled = astmem.wcr is None

            # TODO: This is convoluted. We should find a more readable
            # way of connecting the outgoing edges.

            if super_exit is None:

                # Assert that we're in a top-level node
                if ((not isinstance(prim.parent, astnodes._ProgramNode)) and
                    (not isinstance(prim.parent, astnodes._ControlFlowNode))):
                    raise RuntimeError("Expected to be at the top node")

                # Looks hacky
                src_conn = (astmem.local_name if isinstance(
                    out_src, (nd.Tasklet, nd.NestedSDFG)) else None)

                # Here we just need to connect memlets directly to their
                # respective data nodes
                out_tgt = output_node_for_array(state, astmem.dataname)

                # If a memlet that covers this output already exists, skip
                # generating this one; otherwise replace memlet with ours
                skip_outgoing_edge = False
                remove_edge = None
                for e in state.edges_between(out_src, out_tgt):
                    if e.data.data != astmem.dataname or src_conn != e.src_conn:
                        continue
                    if e.data.subset.covers(astmem.subset):
                        skip_outgoing_edge = True
                        break
                    elif astmem.subset.covers(e.data.subset):
                        remove_edge = e
                        break
                    else:
                        print('WARNING: Performing bounding-box union on',
                              astmem.subset, 'and', e.data.subset, '(out)')
                        e.data.subset = sbs.bounding_box_union(
                            e.data.subset, astmem.subset)
                        e.data.num_accesses += astmem.num_accesses
                        skip_outgoing_edge = True
                        break

                if skip_outgoing_edge:
                    continue
                if remove_edge is not None:
                    state.remove_edge(remove_edge)

                _add_astmemlet_edge(sdfg,
                                    state,
                                    out_src,
                                    src_conn,
                                    out_tgt,
                                    None,
                                    astmem,
                                    wcr=astmem.wcr,
                                    wcr_identity=astmem.wcr_identity)
                wcr_was_handled = wcr_was_handled or astmem.wcr is not None

                # If the program defines another output, connect it too.
                # This refers to the case where we have streams, which
                # must define an input and output, and sometimes this output
                # is defined in pdp.outputs
                if (isinstance(out_tgt, nd.AccessNode)
                        and isinstance(out_tgt.desc(sdfg), dt.Stream)):
                    try:
                        stream_memlet = next(
                            v for k, v in prim.parent.outputs.items()
                            if k == out_tgt.data)
                        stream_output = output_node_for_array(
                            state, stream_memlet.dataname)
                        _add_astmemlet_edge(sdfg, state, out_tgt, None,
                                            stream_output, None, stream_memlet)
                    except StopIteration:  # Stream output not found, skip
                        pass

            else:  # We're in a nest

                if isinstance(prim, astnodes._ScopeNode):
                    # We're a map or a consume node, that needs to connect our
                    # exit to either an array or to the super_exit
                    if data.transient and dataname in prim.parent.transients:
                        # Connect the exit directly
                        out_tgt = output_node_for_array(state, data.dataname)
                        _add_astmemlet_edge(sdfg, state, out_src, None,
                                            out_tgt, None, astmem)
                    else:
                        # This is either a transient defined in an outer scope,
                        # or an I/O array, so redirect through the exit node
                        _add_astmemlet_edge(sdfg, state, out_src, None,
                                            super_exit, None, astmem)
                        # Instruct outer recursion layer to continue the route
                        exit_nodes.append((super_exit, astmem))
                elif isinstance(
                        prim,
                    (astnodes._TaskletNode, astnodes._NestedSDFGNode)):
                    # We're a tasklet, and need to connect either to the exit
                    # if the array is I/O or is defined in a scope further out,
                    # or directly to the transient if it's defined locally
                    if dataname in prim.parent.transients:
                        # This is a local transient variable, so connect to it
                        # directly
                        out_tgt = output_node_for_array(state, data.dataname)
                        _add_astmemlet_edge(sdfg, state, out_src,
                                            astmem.local_name, out_tgt, None,
                                            astmem)
                    else:
                        # This is an I/O array, or an outer level transient, so
                        # redirect through the exit node
                        _add_astmemlet_edge(sdfg,
                                            state,
                                            out_src,
                                            astmem.local_name,
                                            super_exit,
                                            None,
                                            astmem,
                                            wcr=astmem.wcr,
                                            wcr_identity=astmem.wcr_identity)
                        exit_nodes.append((super_exit, astmem))
                        if astmem.wcr is not None:
                            wcr_was_handled = True  # Sanity check
                else:
                    raise TypeError("Unexpected node type: {}".format(
                        type(out_src).__name__))

            if not wcr_was_handled and not isinstance(prim,
                                                      astnodes._ScopeNode):
                raise RuntimeError("Detected unhandled WCR for primitive '{}' "
                                   "of type {}. WCR is only expected for "
                                   "tasklets in a map/consume scope.".format(
                                       prim.name,
                                       type(prim).__name__))

    return exit_nodes
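
The covers/bounding-box-union check above is the entire deduplication rule for parallel memlets between two nodes. A standalone sketch of the same subset operations (Range.covers and subsets.bounding_box_union are the helpers the code uses):

from dace import subsets as sbs

existing = sbs.Range.from_string('0:10, 2:4')
incoming = sbs.Range.from_string('0:5, 2:4')

if existing.covers(incoming):
    pass  # the existing edge already covers the new access: skip it
elif incoming.covers(existing):
    pass  # the new access subsumes the old edge: replace its memlet
else:
    # Neither contains the other: widen the edge to the bounding box
    union = sbs.bounding_box_union(existing, incoming)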
Example #3
File: mapreduce.py Project: orausch/dace
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.
    """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _reduce = nodes.Reduce('lambda: None', None)
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(MapReduceFusion._tasklet,
                                   MapReduceFusion._tmap_exit,
                                   MapReduceFusion._in_array,
                                   MapReduceFusion._reduce,
                                   MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]]
        tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]]

        # Make sure that the array is only accessed by the map and the reduce
        if any([
                src != tmap_exit
                for src, _, _, _, memlet in graph.in_edges(in_array)
        ]):
            return False
        if any([
                dest != reduce_node
                for _, _, dest, _, memlet in graph.out_edges(in_array)
        ]):
            return False

        tmem = next(e for e in graph.edges_between(tasklet, tmap_exit)
                    if e.data.data == in_array.data).data

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict and (len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
        ]) > 1 or in_array.data in sdfg.shared_transients()):
            return False

        # If memlet already has WCR and it is different from reduce node,
        # do not match
        if tmem.wcr is not None and tmem.wcr != reduce_node.wcr:
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        reduce = candidate[MapReduceFusion._reduce]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
        out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]]

        # Nodes to remove: the intermediate array and the reduce node
        nodes_to_remove = [in_array, reduce_node]

        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        # Find which indices should be removed from new memlet
        input_edge = graph.in_edges(reduce_node)[0]
        axes = reduce_node.axes or list(
            range(input_edge.data.subset.dims()))
        array_edge = graph.out_edges(reduce_node)[0]

        # Delete relevant edges and nodes
        graph.remove_nodes_from(nodes_to_remove)

        # Filter out reduced dimensions from subset
        filtered_subset = [
            dim for i, dim in enumerate(memlet_edge.data.subset)
            if i not in axes
        ]
        if len(filtered_subset) == 0:  # Output is a scalar
            filtered_subset = [0]

        # Modify edge from tasklet to map exit
        memlet_edge.data.data = out_array.data
        memlet_edge.data.wcr = reduce_node.wcr
        memlet_edge.data.wcr_identity = reduce_node.identity
        memlet_edge.data.subset = type(
            memlet_edge.data.subset)(filtered_subset)

        # Add edge from map exit to output array
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst,
            array_edge.dst_conn,
            Memlet(array_edge.data.data, array_edge.data.num_accesses,
                   array_edge.data.subset, array_edge.data.veclen,
                   reduce_node.wcr, reduce_node.identity))
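
A minimal sketch of a program this transformation targets (hypothetical; dace.reduce, dace.define_local, and apply_transformations follow current dace releases):

import dace

N = dace.symbol('N')

@dace.program
def scaled_rowsum(A: dace.float64[N, N], out: dace.float64[N]):
    tmp = dace.define_local([N, N], dace.float64)
    for i, j in dace.map[0:N, 0:N]:
        tmp[i, j] = 2.0 * A[i, j]
    dace.reduce(lambda a, b: a + b, tmp, out, axis=1, identity=0)

sdfg = scaled_rowsum.to_sdfg()
# Fuses the doubling map with the following sum-reduction, removing tmp
# and writing to out directly through a WCR (sum) memlet.
sdfg.apply_transformations(MapReduceFusion)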
Example #4
File: mapreduce.py Project: cpenny42/dace
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.
    """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_in_tasklet = nodes.Tasklet('_')
    _rmap_in_cr = nodes.MapExit(nodes.Map("", [], []))
    _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_out_exit = nodes.MapExit(nodes.Map("", [], []))
    _out_array = nodes.AccessNode('_')
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        return [
            # Map, then reduce of all axes
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._rmap_in_entry,
                MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr,
                MapReduceFusion._out_array),
            # Map, then partial reduction of axes
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._rmap_out_entry,
                MapReduceFusion._rmap_in_entry,
                MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr,
                MapReduceFusion._rmap_out_exit, MapReduceFusion._out_array),
            # Map, then reduce node
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._reduce,
                MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        if expr_index == 0:  # Reduce without outer map
            rmap_entry = graph.nodes()[candidate[
                MapReduceFusion._rmap_in_entry]]
            # rmap_in_entry = rmap_entry
        elif expr_index == 1:  # Reduce with outer map
            rmap_entry = graph.nodes()[candidate[
                MapReduceFusion._rmap_out_entry]]
            # rmap_in_entry = graph.nodes()[candidate[
            #     MapReduceFusion._rmap_in_entry]]
        else:  # Reduce node
            rmap_entry = graph.nodes()[candidate[MapReduceFusion._reduce]]

        # Make sure that the array is only accessed by the map and the reduce
        if any([
                src != tmap_exit
                for src, _, _, _, memlet in graph.in_edges(in_array)
        ]):
            return False
        if any([
                dest != rmap_entry
                for _, _, dest, _, memlet in graph.out_edges(in_array)
        ]):
            return False

        # Make sure that there is a reduction in the second map
        if expr_index < 2:
            rmap_cr = graph.nodes()[candidate[MapReduceFusion._rmap_in_cr]]
            reduce_edge = graph.in_edges(rmap_cr)[0]
            if reduce_edge.data.wcr is None:
                return False

        # Make sure that the transient is not accessed by other states
        # if garr.get_unique_name() in cgen_state.sdfg.shared_transients():
        #     return False

        # reduce_inarr = reduce.in_array
        # reduce_outarr = reduce.out_array
        # reduce_inslice = reduce.inslice
        # reduce_outslice = reduce.outslice

        # insize = cgen_state.var_sizes[reduce_inarr]
        # outsize = cgen_state.var_sizes[reduce_outarr]

        # Currently only supports full-range arrays
        # TODO(later): Support fusion of partial reductions and refactor slice/subarray handling
        #if not nxutil.fullrange(reduce_inslice, insize) or \
        #   not nxutil.fullrange(reduce_outslice, outsize):
        #    return False

        # Verify accesses from tasklet through MapExit
        #already_found = False
        #for _src, _, _dest, _, memlet in graph.in_edges(map_exit):
        #    if isinstance(memlet.subset, subsets.Indices):
        #        # Make sure that only one value is reduced at a time
        #        if memlet.data == in_array.desc:
        #            if already_found:
        #                return False
        #            already_found = True

        ## Find axes after reduction
        #indims = len(reduce.inslice)
        #axis_after_reduce = [None] * indims
        #ctr = 0
        #for i in range(indims):
        #    if reduce.axes is not None and i in reduce.axes:
        #        axis_after_reduce[i] = None
        #    else:
        #        axis_after_reduce[i] = ctr
        #        ctr += 1

        ## Match map ranges with reduce ranges
        #curaxis = 0
        #for dim, var in enumerate(memlet.subset):
        #    # Make sure that indices are direct symbols
        #    #if not isinstance(symbolic.pystr_to_symbolic(var), sympy.Symbol):
        #    #    return False
        #    perm = None
        #    for i, mapvar in enumerate(map_exit.map.params):
        #        if symbolic.pystr_to_symbolic(mapvar) == var:
        #            perm = i
        #            break
        #    if perm is None:  # If symbol is not found in map range
        #        return False

        #    # Make sure that map ranges match output slice after reduction
        #    map_range = map_exit.map.range[perm]
        #    if map_range[0] != 0:
        #        return False  # Disallow start from middle
        #    if map_range[2] is not None and map_range[2] != 1:
        #        return False  # Disallow skip
        #    if reduce.axes is not None and dim not in reduce.axes:
        #        if map_range[1] != symbolic.pystr_to_symbolic(
        #                reduce.outslice[axis_after_reduce[dim]][1]):
        #            return False  # Range check (output axis)
        #    else:
        #        if map_range[1] != symbolic.pystr_to_symbolic(reduce.inslice[dim][1]):
        #            return False  # Range check (reduction axis)

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        if len(candidate) == 5:  # Expression 2
            reduce = candidate[MapReduceFusion._reduce]
        else:
            reduce = candidate[MapReduceFusion._rmap_in_cr]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    @staticmethod
    def find_memlet_map_permutation(memlet: Memlet, map: nodes.Map):
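        # Example: for a memlet subset (i, j) and map params ['j', 'i'],
        # this returns [1, 0]: memlet dimension k corresponds to map
        # parameter perm[k], and unmatched dimensions remain None.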
        perm = [None] * len(memlet.subset)
        indices = set()
        for i, dim in enumerate(memlet.subset):
            for j, mapdim in enumerate(map.params):
                if symbolic.pystr_to_symbolic(
                        mapdim) == dim and j not in indices:
                    perm[i] = j
                    indices.add(j)
                    break
        return perm

    @staticmethod
    def find_permutation(tasklet_map: nodes.Map, red_outer_map: nodes.Map,
                         red_inner_map: nodes.Map, tmem: Memlet):
        """ Find permutation between tasklet-exit memlet and tasklet map. """
        result = [], []

        assert len(tasklet_map.range) == len(red_inner_map.range) + len(
            red_outer_map.range)

        # Match map ranges with reduce ranges
        unavailable_ranges_out = set()
        unavailable_ranges_in = set()
        for i, tmap_rng in enumerate(tasklet_map.range):
            found = False
            for j, rng in enumerate(red_outer_map.range):
                if tmap_rng == rng and j not in unavailable_ranges_out:
                    result[0].append(i)
                    unavailable_ranges_out.add(j)
                    found = True
                    break
            if found: continue
            for j, rng in enumerate(red_inner_map.range):
                if tmap_rng == rng and j not in unavailable_ranges_in:
                    result[1].append(i)
                    unavailable_ranges_in.add(j)
                    found = True
                    break
            if not found: break

        # Ensure all map variables matched with reduce variables
        assert len(result[0]) + len(result[1]) == len(tasklet_map.range)

        # Returns ([outer map indices], [inner (CR) map indices])
        return result

    @staticmethod
    def find_permutation_reduce(tasklet_map: nodes.Map,
                                reduce_node: nodes.Reduce, graph: SDFGState,
                                tmem: Memlet):

        in_memlet = graph.in_edges(reduce_node)[0].data
        out_memlet = graph.out_edges(reduce_node)[0].data
        assert len(tasklet_map.range) == in_memlet.subset.dims()

        # Find permutation between tasklet-exit memlet and tasklet map
        tmem_perm = MapReduceFusion.find_memlet_map_permutation(
            tmem, tasklet_map)
        mapred_perm = []

        # Match map ranges with reduce ranges
        unavailable_ranges = set()
        for i, tmap_rng in enumerate(tasklet_map.range):
            found = False

            for j, in_rng in enumerate(in_memlet.subset):
                if tmap_rng == in_rng and j not in unavailable_ranges:
                    mapred_perm.append(i)
                    unavailable_ranges.add(j)
                    found = True
                    break
            if not found: break

        # Ensure all map variables matched with reduce variables
        assert len(tmem_perm) == len(tmem.subset)
        assert len(mapred_perm) == len(in_memlet.subset)

        # Prepare result from the two permutations and the reduction axes
        result = []
        for i in range(len(mapred_perm)):
            if reduce_node.axes is None or i in reduce_node.axes:
                continue
            result.append(mapred_perm[tmem_perm[i]])

        return result

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        expr_index = self.expr_index
        graph = sdfg.nodes()[self.state_id]
        tasklet = gnode(MapReduceFusion._tasklet)
        tmap_exit = gnode(MapReduceFusion._tmap_exit)
        in_array = gnode(MapReduceFusion._in_array)
        if expr_index == 0:  # Reduce without outer map
            rmap_entry = gnode(MapReduceFusion._rmap_in_entry)
        elif expr_index == 1:  # Reduce with outer map
            rmap_out_entry = gnode(MapReduceFusion._rmap_out_entry)
            rmap_out_exit = gnode(MapReduceFusion._rmap_out_exit)
            rmap_in_entry = gnode(MapReduceFusion._rmap_in_entry)
            rmap_tasklet = gnode(MapReduceFusion._rmap_in_tasklet)

        if expr_index == 2:
            rmap_cr = gnode(MapReduceFusion._reduce)
        else:
            rmap_cr = gnode(MapReduceFusion._rmap_in_cr)
        out_array = gnode(MapReduceFusion._out_array)

        # Set nodes to remove according to the expression index
        nodes_to_remove = [in_array]
        if expr_index == 0:
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
        elif expr_index == 1:
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_entry))
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_exit))
        else:
            nodes_to_remove.append(gnode(MapReduceFusion._reduce))

        # If no other edges lead to mapexit, remove it. Otherwise, keep
        # it and remove reduction incoming/outgoing edges
        if expr_index != 2 and len(graph.in_edges(tmap_exit)) == 1:
            nodes_to_remove.append(tmap_exit)

        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        if expr_index == 0:  # Reduce without outer map
            # Index order does not matter, merge as-is
            pass
        elif expr_index == 1:  # Reduce with outer map
            tmap = tmap_exit.map
            perm_outer, perm_inner = MapReduceFusion.find_permutation(
                tmap, rmap_out_entry.map, rmap_in_entry.map, memlet_edge.data)

            # Split tasklet map into tmap_out -> tmap_in (according to
            # reduction)
            omap = nodes.Map(
                tmap.label + '_nonreduce',
                [p for i, p in enumerate(tmap.params) if i in perm_outer],
                [r for i, r in enumerate(tmap.range) if i in perm_outer],
                tmap.schedule, tmap.unroll, tmap.is_async)
            tmap.params = [
                p for i, p in enumerate(tmap.params) if i in perm_inner
            ]
            tmap.range = [
                r for i, r in enumerate(tmap.range) if i in perm_inner
            ]
            omap_entry = nodes.MapEntry(omap)
            omap_exit = rmap_out_exit
            rmap_out_exit.map = omap

            # Reconnect graph to new map
            tmap_entry = graph.entry_node(tmap_exit)
            tmap_in_edges = list(graph.in_edges(tmap_entry))
            # Redirect all incoming edges to the new outer map entry
            nxutil.change_edge_dest(graph, tmap_entry, omap_entry)
            for e in tmap_in_edges:
                graph.add_edge(omap_entry, e.src_conn, tmap_entry, e.dst_conn,
                               copy.copy(e.data))
        elif expr_index == 2:  # Reduce node
            # Find correspondence between map indices and array outputs
            tmap = tmap_exit.map
            perm = MapReduceFusion.find_permutation_reduce(
                tmap, rmap_cr, graph, memlet_edge.data)

            output_subset = [tmap.params[d] for d in perm]
            if len(output_subset) == 0:  # Output is a scalar
                output_subset = [0]

            array_edge = graph.out_edges(rmap_cr)[0]

            # Delete relevant edges and nodes
            graph.remove_edge(memlet_edge)
            graph.remove_nodes_from(nodes_to_remove)

            # Add new edges and nodes
            #   From tasklet to map exit
            graph.add_edge(
                memlet_edge.src, memlet_edge.src_conn, memlet_edge.dst,
                memlet_edge.dst_conn,
                Memlet(out_array.data, memlet_edge.data.num_accesses,
                       subsets.Indices(output_subset), memlet_edge.data.veclen,
                       rmap_cr.wcr, rmap_cr.identity))

            #   From map exit to output array
            graph.add_edge(
                memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:],
                array_edge.dst, array_edge.dst_conn,
                Memlet(array_edge.data.data, array_edge.data.num_accesses,
                       array_edge.data.subset, array_edge.data.veclen,
                       rmap_cr.wcr, rmap_cr.identity))

            return

        # Remove tmp array node prior to the others, so that a new one
        # can be created in its stead (see below)
        graph.remove_node(nodes_to_remove[0])
        nodes_to_remove = nodes_to_remove[1:]

        # Create tasklet -> tmp -> tasklet connection
        tmp = graph.add_array(
            'tmp',
            memlet_edge.data.subset.bounding_box_size(),
            sdfg.arrays[memlet_edge.data.data].dtype,
            transient=True)
        tasklet_tmp_memlet = copy.deepcopy(memlet_edge.data)
        tasklet_tmp_memlet.data = tmp.data
        tasklet_tmp_memlet.subset = ShapeProperty.to_string(tmp.shape)

        # Modify memlet to point to output array
        memlet_edge.data.data = out_array.data

        # Recover reduction axes from CR reduce subset
        reduce_cr_subset = graph.in_edges(rmap_tasklet)[0].data.subset
        reduce_axes = []
        for ind, crvar in enumerate(reduce_cr_subset.indices):
            if '__i' in str(crvar):
                reduce_axes.append(ind)

        # Modify memlet access index by filtering out reduction axes
        newindices = [
            ovar for ind, ovar in enumerate(memlet_edge.data.subset.indices)
            if ind not in reduce_axes
        ]
        if len(newindices) == 0:
            newindices = [0]

        memlet_edge.data.subset = subsets.Indices(newindices)

        graph.remove_edge(memlet_edge)

        graph.add_edge(memlet_edge.src, memlet_edge.src_conn, tmp,
                       memlet_edge.dst_conn, tasklet_tmp_memlet)

        red_edges = list(graph.in_edges(rmap_tasklet))
        if len(red_edges) != 1:
            raise RuntimeError('CR edge must be unique')

        tmp_tasklet_memlet = copy.deepcopy(tasklet_tmp_memlet)
        graph.add_edge(tmp, None, rmap_tasklet, red_edges[0].dst_conn,
                       tmp_tasklet_memlet)

        for e in graph.edges_between(rmap_tasklet, rmap_cr):
            e.data.subset = memlet_edge.data.subset

        # Move output edges to point directly to CR node
        if expr_index == 1:
            # Set output memlet between CR node and outer reduction map to
            # contain the same subset as the one pointing to the CR node
            for e in graph.out_edges(rmap_cr):
                e.data.subset = memlet_edge.data.subset

            rmap_out = gnode(MapReduceFusion._rmap_out_exit)
            nxutil.change_edge_src(graph, rmap_out, omap_exit)

        # Remove nodes
        graph.remove_nodes_from(nodes_to_remove)

        # For unrelated outputs, connect original output to rmap_out
        if expr_index == 1 and tmap_exit not in nodes_to_remove:
            other_out_edges = list(graph.out_edges(tmap_exit))
            for e in other_out_edges:
                graph.remove_edge(e)
                graph.add_edge(e.src, e.src_conn, omap_exit, None, e.data)
                graph.add_edge(omap_exit, None, e.dst, e.dst_conn,
                               copy.copy(e.data))

    def modifies_graph(self):
        return True
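
A hedged usage sketch for the transformation above. It assumes a DaCe installation where SDFG.apply_transformations and the NumPy frontend exist, and that MapReduceFusion is importable from this module; whether this older pattern still matches depends on the DaCe version, so treat the program below as illustrative only.

import dace
import numpy as np

N = dace.symbol('N')

@dace.program
def mapreduce(A: dace.float64[N, N], out: dace.float64[N]):
    tmp = A * 2.0                 # elementwise map
    out[:] = np.sum(tmp, axis=1)  # reduction along the last axis

sdfg = mapreduce.to_sdfg()
# apply_transformations returns the number of successful applications
applied = sdfg.apply_transformations(MapReduceFusion)
print('MapReduceFusion applied {} time(s)'.format(applied))
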
Example #5
class ReduceExpansion(pm.Transformation):
    """ Implements the reduce-expansion transformation.

        Reduce-expansion replaces a reduce node with nested maps and edges with
        WCR.
    """

    _reduce = nodes.Reduce(wcr='lambda x: x', axes=None)

    @staticmethod
    def expressions():
        return [nxutil.node_path_graph(ReduceExpansion._reduce)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        red_node = graph.nodes()[candidate[ReduceExpansion._reduce]]
        return "{}: {} on {}".format(red_node, red_node.wcr, red_node.axes)

    def apply(self, sdfg):
        """ The method creates two nested maps. The inner map ranges over the
            reduction axes, while the outer map ranges over the rest of the 
            input dimensions. The inner map contains a trivial tasklet, while
            the outgoing edges copy the reduction WCR.
        """
        graph = sdfg.nodes()[self.state_id]
        red_node = graph.nodes()[self.subgraph[ReduceExpansion._reduce]]

        inputs = []
        in_memlets = []
        for src, _, _, _, memlet in graph.in_edges(red_node):
            if src not in inputs:
                inputs.append(src)
                in_memlets.append(memlet)
        if len(inputs) > 1:
            raise NotImplementedError

        outputs = []
        out_memlets = []
        for _, _, dst, _, memlet in graph.out_edges(red_node):
            if dst not in outputs:
                outputs.append(dst)
                out_memlets.append(memlet)
        if len(outputs) > 1:
            raise NotImplementedError

        axes = red_node.axes
        if axes is None:
            axes = tuple(range(in_memlets[0].subset.dims()))

        outer_map_range = {}
        inner_map_range = {}
        for idx, r in enumerate(in_memlets[0].subset):
            rng = subsets.Range.dim_to_string(r)
            if idx in axes:
                inner_map_range["__dim_{}".format(idx)] = rng
            else:
                outer_map_range["__dim_{}".format(idx)] = rng

        if len(outer_map_range) > 0:
            outer_map_entry, outer_map_exit = graph.add_map(
                'reduce_outer', outer_map_range, schedule=red_node.schedule)

        inner_map_entry, inner_map_exit = graph.add_map(
            'reduce_inner',
            inner_map_range,
            schedule=(dtypes.ScheduleType.Default
                      if len(outer_map_range) > 0 else red_node.schedule))

        tasklet = graph.add_tasklet(name='red_tasklet',
                                    inputs={'in_1'},
                                    outputs={'out_1'},
                                    code='out_1 = in_1')

        inner_map_entry.in_connectors = {'IN_1'}
        inner_map_entry.out_connectors = {'OUT_1'}

        outer_in_memlet = dcpy(in_memlets[0])

        if len(outer_map_range) > 0:
            outer_map_entry.in_connectors = {'IN_1'}
            outer_map_entry.out_connectors = {'OUT_1'}
            graph.add_edge(inputs[0], None, outer_map_entry, 'IN_1',
                           outer_in_memlet)
        else:
            graph.add_edge(inputs[0], None, inner_map_entry, 'IN_1',
                           outer_in_memlet)

        med_in_memlet = dcpy(in_memlets[0])
        med_in_range = []
        for idx, r in enumerate(med_in_memlet.subset):
            if idx in axes:
                med_in_range.append(r)
            else:
                med_in_range.append(("__dim_{}".format(idx),
                                     "__dim_{}".format(idx), 1))
        med_in_memlet.subset = subsets.Range(med_in_range)
        med_in_memlet.num_accesses = med_in_memlet.subset.num_elements()

        if len(outer_map_range) > 0:
            graph.add_edge(outer_map_entry, 'OUT_1', inner_map_entry, 'IN_1',
                           med_in_memlet)

        inner_in_memlet = dcpy(med_in_memlet)
        inner_in_idx = [
            "__dim_{}".format(idx)
            for idx in range(len(inner_in_memlet.subset))
        ]
        inner_in_memlet.subset = subsets.Indices(inner_in_idx)
        inner_in_memlet.num_accesses = inner_in_memlet.subset.num_elements()
        graph.add_edge(inner_map_entry, 'OUT_1', tasklet, 'in_1',
                       inner_in_memlet)
        inner_map_exit.in_connectors = {'IN_1'}
        inner_map_exit.out_connectors = {'OUT_1'}

        inner_out_memlet = dcpy(out_memlets[0])
        inner_out_idx = []
        for idx, r in enumerate(inner_in_memlet.subset):
            if idx not in axes:
                inner_out_idx.append(r)
        if len(inner_out_idx) == 0:
            inner_out_idx = [0]

        inner_out_memlet.subset = subsets.Indices(inner_out_idx)
        inner_out_memlet.wcr = red_node.wcr
        inner_out_memlet.num_accesses = inner_out_memlet.subset.num_elements()
        graph.add_edge(tasklet, 'out_1', inner_map_exit, 'IN_1',
                       inner_out_memlet)

        outer_out_memlet = dcpy(out_memlets[0])
        outer_out_range = []
        for idx, r in enumerate(outer_out_memlet.subset):
            if idx not in axes:
                outer_out_range.append(r)
        if len(outer_out_range) == 0:
            outer_out_range = [(0, 0, 1)]

        outer_out_memlet.subset = subsets.Range(outer_out_range)
        outer_out_memlet.wcr = red_node.wcr

        if len(outer_map_range) > 0:
            outer_map_exit.in_connectors = {'IN_1'}
            outer_map_exit.out_connectors = {'OUT_1'}
            med_out_memlet = dcpy(inner_out_memlet)
            med_out_memlet.num_accesses = med_out_memlet.subset.num_elements()
            graph.add_edge(inner_map_exit, 'OUT_1', outer_map_exit, 'IN_1',
                           med_out_memlet)

            graph.add_edge(outer_map_exit, 'OUT_1', outputs[0], None,
                           outer_out_memlet)
        else:
            graph.add_edge(inner_map_exit, 'OUT_1', outputs[0], None,
                           outer_out_memlet)

        graph.remove_edge(graph.in_edges(red_node)[0])
        graph.remove_edge(graph.out_edges(red_node)[0])
        graph.remove_node(red_node)
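
A minimal sketch of driving ReduceExpansion, assuming a DaCe build that provides dace.reduce and SDFG.apply_transformations; the summation program is hypothetical and only serves to produce a Reduce node:

import dace

N = dace.symbol('N')

@dace.program
def summation(A: dace.float64[N], out: dace.float64[1]):
    dace.reduce(lambda a, b: a + b, A, out, identity=0)

sdfg = summation.to_sdfg()
# Replaces the Reduce node with nested maps whose output edges carry the WCR
sdfg.apply_transformations(ReduceExpansion)
sdfg.validate()
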
Example #6
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(
        desc="Copy whole arrays rather than used subset",
        dtype=bool,
        default=False)

    toplevel_trans = Property(
        desc="Make all GPU transients top-level", dtype=bool, default=False)

    register_trans = Property(
        desc="Make all transients inside GPU maps registers",
        dtype=bool,
        default=False)

    sequential_innermaps = Property(
        desc="Make all internal maps Sequential", dtype=bool, default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(GPUTransformMap._map_entry),
            nxutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            candidate_map = map_entry.map

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule in [dtypes.ScheduleType.MPI] +
                    dtypes.GPU_SCHEDULES):
                return False
            if sd.is_devicelevel(sdfg, graph, map_entry):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and node.desc(sdfg).storage !=
                        dtypes.StorageType.Register):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_nodes(map_entry)[0]
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if (reduce.schedule in [dtypes.ScheduleType.MPI] +
                    dtypes.GPU_SCHEDULES):
                return False
            if sd.is_devicelevel(sdfg, graph, reduce):
                return False

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            return str(graph.nodes()[candidate[GPUTransformMap._map_entry]])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            map_entry = graph.nodes()[self.subgraph[
                GPUTransformMap._map_entry]]
            nsdfg_node = helpers.nest_state_subgraph(
                sdfg,
                graph,
                graph.scope_subgraph(map_entry),
                full_data=self.fullcopy)
        else:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            nsdfg_node = helpers.nest_state_subgraph(
                sdfg,
                graph,
                SubgraphView(graph, [cnode]),
                full_data=self.fullcopy)

        # Avoiding import loops
        from dace.transformation.interstate import GPUTransformSDFG
        transformation = GPUTransformSDFG(0, 0, {}, 0)
        transformation.register_trans = self.register_trans
        transformation.sequential_innermaps = self.sequential_innermaps
        transformation.toplevel_trans = self.toplevel_trans

        transformation.apply(nsdfg_node.sdfg)

        # Inline back as necessary
        sdfg.apply_strict_transformations()
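
A sketch of applying this variant with non-default properties. It assumes an existing SDFG object named sdfg, a configured CUDA toolchain, and that apply_transformations accepts an options dictionary (true in recent DaCe releases):

# Each options key corresponds to one of the Property fields defined above
sdfg.apply_transformations(
    GPUTransformMap,
    options={'fullcopy': True,
             'toplevel_trans': True,
             'sequential_innermaps': True})
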
Example #7
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(GPUTransformMap._map_entry),
            nxutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            candidate_map = map_entry.map

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule == types.ScheduleType.MPI
                    or candidate_map.schedule == types.ScheduleType.GPU_Device
                    or candidate_map.schedule
                    == types.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if (current_node.map.schedule == types.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == types.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are allocated
            # on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != types.StorageType.Default
                        and
                        node.desc(sdfg).storage != types.StorageType.Register):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if (reduce.schedule == types.ScheduleType.MPI
                    or reduce.schedule == types.ScheduleType.GPU_Device
                    or reduce.schedule == types.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if (current_node.map.schedule == types.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == types.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            return str(map_entry)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule
        node_schedprop._schedule = types.ScheduleType.GPU_Device

        gpu_storage_types = [
            types.StorageType.GPU_Global,
            types.StorageType.GPU_Shared,
            types.StorageType.GPU_Stack  #, types.StorageType.CPU_Pinned
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add(data_node)
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add(data_node)

        # Second, create a GPU clone of each array
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            if array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                cloned_array = sdfg.add_array(
                    'gpu_' + array_node.data,
                    array.shape,
                    array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=types.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    access_order=array.access_order,
                    strides=array.strides,
                    offset=array.offset)
                cloned_arrays[array_node.data] = 'gpu_' + array_node.data
            cloned_node = type(array_node)('gpu_' + array_node.data)

            in_cloned_arraynodes[array_node.data] = cloned_node
        for array_node in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            if array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                cloned_array = sdfg.add_array(
                    'gpu_' + array_node.data,
                    array.shape,
                    array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=types.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    access_order=array.access_order,
                    strides=array.strides,
                    offset=array.offset)
                cloned_arrays[array_node.data] = 'gpu_' + array_node.data
            cloned_node = type(array_node)('gpu_' + array_node.data)

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(node, edge.src_conn, edge.dst,
                                   edge.dst_conn, newmemlet)

                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(edge.src, None, node, None, edge.data)
        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(edge.src, edge.src_conn, node,
                                   edge.dst_conn, newmemlet)
                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(node, None, edge.dst, None, edge.data)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if (edge.data.data is not None
                        and edge.data.data in cloned_arrays):
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        return True
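
Since this variant registers every cloned CPU array under a 'gpu_' prefix, one way to inspect the result is the sketch below (it assumes an existing SDFG object named sdfg with at least one matching map):

sdfg.apply_transformations(GPUTransformMap)
# Cloned GPU transients were added to the SDFG under a 'gpu_' prefix
gpu_clones = [name for name in sdfg.arrays if name.startswith('gpu_')]
print('GPU clones created:', gpu_clones)
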
Example #8
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    _maps_transformed = 0
    _arrays_removed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(GPUTransformMap._map_entry),
            nxutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            candidate_map = map_entry.map

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule == dtypes.ScheduleType.MPI
                    or candidate_map.schedule == dtypes.ScheduleType.GPU_Device
                    or candidate_map.schedule
                    == dtypes.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are allocated
            # on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and node.desc(sdfg).storage !=
                        dtypes.StorageType.Register):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if (reduce.schedule == dtypes.ScheduleType.MPI
                    or reduce.schedule == dtypes.ScheduleType.GPU_Device
                    or reduce.schedule == dtypes.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            return str(map_entry)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule
        node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
        if Config.get_bool("debugprint"):
            GPUTransformMap._maps_transformed += 1

        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack  #, dtypes.StorageType.CPU_Pinned
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        out_streamarrays = {}
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if isinstance(data_node.desc(sdfg), data.Scalar):
                continue
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add(data_node)
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if isinstance(data_node.desc(sdfg), data.Scalar):
                continue
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                # Stream directly connected to an array
                if sd.is_array_stream_view(sdfg, graph, data_node):
                    datadesc = data_node.desc(sdfg)
                    if datadesc.transient is False:
                        raise TypeError('Non-transient stream-array views '
                                        'are unsupported')
                    # Add the stream's parent array node to the clone set
                    parent = graph.out_edges(data_node)[0].dst
                    out_arrays_to_clone.add(parent)
                    out_streamarrays[parent] = data_node

                    # Do not clone stream
                    continue

                out_arrays_to_clone.add(data_node)
        if Config.get_bool("debugprint"):
            GPUTransformMap._arrays_removed += len(in_arrays_to_clone) + len(
                out_arrays_to_clone)

        # Second, create a GPU clone of each array
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            if array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                cloned_array = array.clone()
                cloned_array.storage = dtypes.StorageType.GPU_Global
                cloned_array.transient = True
                sdfg.add_datadesc('gpu_' + array_node.data, cloned_array)
                cloned_arrays[array_node.data] = 'gpu_' + array_node.data
            cloned_node = type(array_node)('gpu_' + array_node.data)

            in_cloned_arraynodes[array_node.data] = cloned_node
        for array_node in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            if array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                cloned_array = array.clone()
                cloned_array.storage = dtypes.StorageType.GPU_Global
                cloned_array.transient = True
                sdfg.add_datadesc('gpu_' + array_node.data, cloned_array)
                cloned_arrays[array_node.data] = 'gpu_' + array_node.data
            cloned_node = type(array_node)('gpu_' + array_node.data)

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(node, edge.src_conn, edge.dst,
                                   edge.dst_conn, newmemlet)

                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(edge.src, None, node, None, edge.data)
        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(edge.src, edge.src_conn, node,
                                   edge.dst_conn, newmemlet)
                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(node, None, edge.dst, None, edge.data)

        # Reconnect stream-arrays
        for array_node, streamnode in out_streamarrays.items():
            # Set stream storage to GPU
            streamnode.desc(sdfg).storage = dtypes.StorageType.GPU_Global

            cloned_node = out_cloned_arraynodes[array_node.data]

            e = graph.out_edges(streamnode)[0]
            graph.remove_edge(e)
            newmemlet = copy.copy(e.data)
            newmemlet.data = cloned_node.data
            # stream -> cloned array
            graph.add_edge(e.src, e.src_conn, cloned_node, e.dst_conn,
                           newmemlet)
            # cloned array -> array
            graph.add_nedge(cloned_node, array_node, e.data)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if (edge.data.data is not None
                        and edge.data.data in cloned_arrays):
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        return True

    @staticmethod
    def print_debuginfo():
        print("Automatically cloned {} arrays for the GPU.".format(
            GPUTransformMap._arrays_removed))
        print("Automatically changed {} maps for the GPU.".format(
            GPUTransformMap._maps_transformed))
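
The counters above only advance while the debugprint configuration entry is enabled; a sketch of toggling it and reading them back (assumes an existing SDFG object named sdfg and the dace.Config API already used by apply() above):

from dace.config import Config

Config.set('debugprint', value=True)  # read via Config.get_bool in apply()
sdfg.apply_transformations(GPUTransformMap)
GPUTransformMap.print_debuginfo()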