Пример #1
0
 def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                   state_id: int, node: nodes.Node,
                   function_stream: prettycode.CodeIOStream,
                   callsite_stream: prettycode.CodeIOStream):
     """Generate code for a single tasklet node.

     The work is delegated in three steps:
         (1) a copy is dispatched for every incoming edge (in -> tasklet),
         (2) an output definition is dispatched for every outgoing edge
             (tasklet -> out),
         (3) the tasklet body itself is unparsed.

     :param sdfg: SDFG forwarded to the dispatcher and the unparser.
     :param dfg: Dataflow (sub)graph used to look up the edges of ``node``.
     :param state_id: State index forwarded to the dispatcher/unparser.
     :param node: Node to generate; must be a ``nodes.Tasklet``.
     :param function_stream: Code stream forwarded unchanged.
     :param callsite_stream: Code stream forwarded unchanged.
     :raises RuntimeError: If ``node`` is not a tasklet.
     """
     # Anything but a tasklet should never reach this generator
     if not isinstance(node, nodes.Tasklet):
         raise RuntimeError(
             "Only tasklets are handled here, not {}. This should have been filtered by the predicate"
             .format(type(node)))

     # (1) in -> tasklet: resolve the data node feeding each input edge and
     #     let the dispatcher emit the copy
     for in_edge in dfg.in_edges(node):
         source = find_input_arraynode(dfg, in_edge)
         self.dispatcher.dispatch_copy(source, node, in_edge, sdfg, dfg,
                                       state_id, function_stream,
                                       callsite_stream)

     # (2) tasklet -> out: resolve the data node behind each output edge and
     #     let the dispatcher emit the output definition
     for out_edge in dfg.out_edges(node):
         target = find_output_arraynode(dfg, out_edge)
         self.dispatcher.dispatch_output_definition(
             node, target, out_edge, sdfg, dfg, state_id, function_stream,
             callsite_stream)

     # (3) the tasklet body
     self.unparse_tasklet(sdfg, dfg, state_id, node, function_stream,
                          callsite_stream)
Пример #2
0
    def apply(self, sdfg):
        """Schedule the matched map/reduce node on the GPU via local copies.

        Sets the schedule of the matched node to ``GPU_Device``, creates
        transient ``GPU_Global`` clones (named ``gpu_<array>...``) for the
        arrays on the node's incoming/outgoing edges whose storage is not
        already one of the GPU storage types, reroutes those edges through
        the clones and rewrites the memlets inside the scope so that they
        address the clones.

        :param sdfg: The SDFG containing the matched state; it is modified
                     in place (new arrays, nodes and edges are added).
        """
        graph = sdfg.nodes()[self.state_id]
        # Expression 0 matched a map entry; any other expression matched a
        # reduce node, which is then treated as its own (single) exit node.
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule (written directly to the private `_schedule` field)
        node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
        # Statistics counter, only maintained when "debugprint" is enabled
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._maps_transformed += 1
        # If nested graph is designated as sequential, transform schedules and
        # storage from Default to Sequential/Register
        if self.nested_seq and self.expr_index == 0:
            for node in graph.scope_subgraph(cnode).nodes():
                if isinstance(node, nodes.AccessNode):
                    arr = node.desc(sdfg)
                    if arr.storage == dtypes.StorageType.Default:
                        arr.storage = dtypes.StorageType.Register
                elif isinstance(node, nodes.MapEntry):
                    if node.map.schedule == dtypes.ScheduleType.Default:
                        node.map.schedule = dtypes.ScheduleType.Sequential

        # Storage types for which no clone is created
        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        # Both sets hold (data node, memlet) pairs, so one array may be listed
        # several times if it is reached through different edges/memlets.
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add((data_node, e.data))
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add((data_node, e.data))

        # Statistics counter, only maintained when "debugprint" is enabled
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._arrays_removed += len(
                in_arrays_to_clone) + len(out_arrays_to_clone)

        # Second, create a GPU clone of each array
        # TODO: Overapproximate union of memlets
        # cloned_arrays: original array name -> clone name (only filled when a
        # clone is newly created below).
        # in_/out_cloned_arraynodes: original array name -> clone access node.
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node, memlet in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            # For every dimension whose over-approximated bounding-box size is
            # exactly 1, append the start-index expression to the clone name.
            # Letters, digits and "_" are kept; + - * / are spelled p m t d;
            # every other character is dropped.
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                # NOTE(review): bare except. Presumably meant to skip sizes
                # that cannot be converted to int (symbolic), but it also
                # hides any other error raised in the try body - confirm.
                except:
                    continue
            # NOTE(review): `cloned_array` is assigned in the two branches
            # below but never read in this method; in the second branch the
            # value is the clone *name*, not a descriptor.
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                # Shape of the clone: the memlet's over-approximated bounding
                # box, with concrete ints where the size converts to int.
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    # NOTE(review): bare except; keeps the unconverted size
                    # on any failure of int(size).
                    except:
                        full_shape.append(size)
                # Dimensions of (integer) size 1 are squeezed out of the clone
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                # If every dimension was squeezed, keep the last one
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    # Scalars are cloned as one-element arrays
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        materialize_func=array.materialize_func,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            # The clone node has the same node class as the original
            cloned_node = type(array_node)(cloned_name)

            # Keyed by original array name: a later (node, memlet) pair for
            # the same array replaces the node stored by an earlier one.
            in_cloned_arraynodes[array_node.data] = cloned_node
        # Same procedure for the output side (mirrors the loop above; the
        # only difference is that output clone nodes get `setzero = True`).
        for array_node, memlet in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            # Name suffix from the start indices of size-1 dimensions, using
            # the same character mapping as on the input side.
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                # NOTE(review): bare except, see the input-side loop.
                except:
                    continue
            # NOTE(review): `cloned_array` is never read (see input side).
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    # NOTE(review): bare except, see the input-side loop.
                    except:
                        full_shape.append(size)
                # Squeeze out dimensions of (integer) size 1
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                # If every dimension was squeezed, keep the last one
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        materialize_func=array.materialize_func,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)
            # Output clones are flagged with `setzero`
            cloned_node.setzero = True

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        # Input side: original -> clone -> scope entry.
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    # Memlet for the new clone -> entry edge
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        # Rebase the subset so that it starts at zero.
                        # offset[ind] remembers the original start of each
                        # dimension; dimensions whose rebased begin equals
                        # the rebased end are recorded as "lost" and dropped.
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # NOTE(review): newsubset[ind] is still None
                                # here, so this in-place subtraction looks
                                # like it would raise TypeError if this
                                # branch is ever reached - confirm.
                                newsubset[ind] -= r[0]
                        # If every dimension was lost, keep the last one so
                        # that the subset does not become empty.
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   newmemlet)

                    # Walk the edges found by a forward BFS from the scope
                    # entry and retarget memlets of the original array to the
                    # clone (semantics of `in_scope`/`in_path` are defined
                    # elsewhere and assumed from their names).
                    for e in graph.bfs_edges(edge.dst, reverse=False):
                        parent, _, _child, _, memlet = e
                        # Stop once the traversal leaves the scope
                        if parent != edge.dst and not in_scope(
                                graph, parent, edge.dst):
                            break
                        # Only memlets of the array being cloned are touched
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[-1].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.ExitNode, forward=True):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            # Shift by the recorded offsets and drop the lost
                            # dimensions, matching the clone's index space.
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    # Single index becomes a one-element range
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    # With `fullcopy`, the copy-in edge covers the range
                    # derived from the clone's descriptor.
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    # Re-route the original edge so that it ends at the clone
                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   edge.data)
                    graph.remove_edge(edge)

        # Output side: scope exit -> clone -> original (mirrors the above).
        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    # Memlet for the new exit -> clone edge
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        # Rebase the subset to start at zero and record lost
                        # dimensions, exactly as on the input side.
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # NOTE(review): newsubset[ind] is still None
                                # here (see the input side) - confirm.
                                newsubset[ind] -= r[0]
                        # Keep the last dimension if all of them were lost
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   newmemlet)

                    # Walk backwards from the scope exit until the node that
                    # `scope_dict()` reports for the exit node is reached.
                    end_node = graph.scope_dict()[edge.src]
                    for e in graph.bfs_edges(edge.src, reverse=True):
                        parent, _, _child, _, memlet = e
                        if parent == end_node:
                            break
                        # Only memlets of the array being cloned are touched
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        # NOTE(review): this checks path[0].dst whereas the
                        # input side checks path[-1].dst; confirm whether
                        # path[0].src was intended.
                        if not isinstance(path[0].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.EntryNode,
                                       forward=False):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            # Shift by the recorded offsets and drop the lost
                            # dimensions, matching the clone's index space.
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    # Single index becomes a one-element range
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    # The clone -> original edge carries no write-conflict
                    # resolution (the deep-copied `newmemlet` is unaffected).
                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    # Re-route the original edge so that it starts at the clone
                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   edge.data)
                    graph.remove_edge(edge)

        # Fourth, replace memlet arrays as necessary
        # (map case only: rename remaining memlets inside the scope that
        # still refer to an array for which a clone was created above)
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if edge.data.data is not None and edge.data.data in cloned_arrays:
                    edge.data.data = cloned_arrays[edge.data.data]
Пример #3
0
    def apply(self, sdfg):
        """Schedule the matched map on the FPGA using FPGA-side array copies.

        The map schedule is set to ``FPGA_Device``. Every array on the map's
        incoming/outgoing edges whose storage is not one of the FPGA storage
        types gets a transient ``FPGA_Global`` clone named ``fpga_<array>``;
        the boundary edges are rerouted through the clones and the memlets
        inside the map scope are renamed to refer to them.

        :param sdfg: The SDFG containing the matched state; modified in place.
        """
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[FPGATransformMap._map_entry]]
        map_entry.map._schedule = dtypes.ScheduleType.FPGA_Device

        # Exit nodes that belong to the map
        exit_nodes = graph.exit_nodes(map_entry)

        # Arrays stored in one of these are left untouched
        fpga_storage_types = [
            dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local,
            dtypes.StorageType.CPU_Pinned
        ]

        #######################################################
        # Add FPGA copies of CPU arrays (i.e., not already on FPGA)

        # Step 1: collect the data nodes that need a clone
        all_out_edges = [
            out_edge for enode in exit_nodes
            for out_edge in graph.out_edges(enode)
        ]
        in_arrays_to_clone = {
            candidate
            for candidate in (sd.find_input_arraynode(graph, e)
                              for e in graph.in_edges(map_entry))
            if candidate.desc(sdfg).storage not in fpga_storage_types
        }
        out_arrays_to_clone = {
            candidate
            for candidate in (sd.find_output_arraynode(graph, e)
                              for e in all_out_edges)
            if candidate.desc(sdfg).storage not in fpga_storage_types
        }

        # Step 2: create the FPGA clones. Inputs are processed before
        # outputs; an array is only added once, but every processed data node
        # gets a fresh access node registered under its original name.
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for to_clone, registry in ((in_arrays_to_clone,
                                    in_cloned_arraynodes),
                                   (out_arrays_to_clone,
                                    out_cloned_arraynodes)):
            for array_node in to_clone:
                array = array_node.desc(sdfg)
                fpga_name = 'fpga_' + array_node.data
                if (array_node.data not in cloned_arrays
                        and fpga_name not in sdfg.arrays):
                    sdfg.add_array(fpga_name,
                                   dtype=array.dtype,
                                   shape=array.shape,
                                   materialize_func=array.materialize_func,
                                   transient=True,
                                   storage=dtypes.StorageType.FPGA_Global,
                                   allow_conflicts=array.allow_conflicts,
                                   access_order=array.access_order,
                                   strides=array.strides,
                                   offset=array.offset)
                    cloned_arrays[array_node.data] = fpga_name
                registry[array_node.data] = nodes.AccessNode(fpga_name)

        # Step 3: splice the clones between the originals and the map
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for array_name, clone in in_cloned_arraynodes.items():
            graph.add_node(clone)
            for edge in graph.in_edges(map_entry):
                if edge.data.data != array_name:
                    continue
                # original -> clone keeps the original memlet ...
                graph.remove_edge(edge)
                graph.add_edge(edge.src, None, clone, None, edge.data)
                # ... while clone -> map entry gets a renamed shallow copy
                device_memlet = copy.copy(edge.data)
                device_memlet.data = clone.data
                graph.add_edge(clone, edge.src_conn, edge.dst, edge.dst_conn,
                               device_memlet)
        for array_name, clone in out_cloned_arraynodes.items():
            graph.add_node(clone)
            for edge in all_out_edges:
                if edge.data.data != array_name:
                    continue
                # clone -> original keeps the original memlet ...
                graph.remove_edge(edge)
                graph.add_edge(clone, None, edge.dst, None, edge.data)
                # ... while map exit -> clone gets a renamed shallow copy
                device_memlet = copy.copy(edge.data)
                device_memlet.data = clone.data
                graph.add_edge(edge.src, edge.src_conn, clone, edge.dst_conn,
                               device_memlet)

        # Step 4: rename memlets inside the scope that refer to cloned arrays
        for edge in graph.scope_subgraph(map_entry).edges():
            referenced = edge.data.data
            if referenced is not None and referenced in cloned_arrays:
                edge.data.data = cloned_arrays[referenced]
Пример #4
0
    def generate_node(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                      node: nodes.Node, function_stream: CodeIOStream,
                      callsite_stream: CodeIOStream):
        """Generate the code for ``node`` inside its own ``{ ... }`` block.

        Order of emission: header, empty definitions for outputs that feed
        other tasklets, opening brace, input copies, output definitions,
        the tasklet body (only if ``node`` is a tasklet), write-backs and the
        closing brace. Outputs that go to a stream get neither a definition
        nor a write-back; they are only recorded in
        ``self.stream_associations``.

        :param sdfg: SDFG forwarded to the dispatcher and helpers.
        :param dfg: State whose edges around ``node`` are inspected.
        :param state_id: State index forwarded to the dispatcher and helpers.
        :param node: The node to generate code for.
        :param function_stream: Stream that receives the header; forwarded.
        :param callsite_stream: Stream that receives the generated block.
        """
        self.add_header(function_stream)

        # Start with a clean connector -> (stream name, dtype) mapping
        self.stream_associations = dict()

        # Outputs consumed by another tasklet get an empty definition that is
        # emitted before the block is opened
        for out_edge in dfg.out_edges(node):
            if not isinstance(out_edge.dst, dace.nodes.Tasklet):
                continue
            self.create_empty_definition(
                node.out_connectors[out_edge.src_conn],
                out_edge,
                callsite_stream,
                is_code_code=True)

        callsite_stream.write('{')

        # Input registers: the copy source is the producing tasklet itself
        # for code->code edges (which makes the dispatcher pick a different
        # copy), otherwise the array/stream node behind the edge
        for in_edge in dfg.in_edges(node):
            if isinstance(in_edge.src, nodes.Tasklet):
                source = in_edge.src
            else:
                source = find_input_arraynode(dfg, in_edge)
            self.dispatcher.dispatch_copy(source, node, in_edge, sdfg, dfg,
                                          state_id, function_stream,
                                          callsite_stream)

        # (edge, node) pairs that still need a write-back after the body
        pending_writebacks = []

        # Output registers
        for out_edge in dfg.out_edges(node):
            if isinstance(out_edge.dst, nodes.Tasklet):
                # Output into another tasklet: defined against the consumer,
                # written back with this node as the target
                target = out_edge.dst
                writeback_target = node
            else:
                target = find_output_arraynode(dfg, out_edge)
                target_desc = target.desc(sdfg)
                if isinstance(target_desc, dace.data.Stream):
                    # Streams need neither an output register nor a
                    # write-back; only remember which stream the connector
                    # refers to
                    self.stream_associations[out_edge.src_conn] = (
                        target.data, target_desc.dtype)
                    continue
                writeback_target = target

            self.dispatcher.dispatch_output_definition(
                node, target, out_edge, sdfg, dfg, state_id, function_stream,
                callsite_stream)
            pending_writebacks.append((out_edge, writeback_target))

        # Tasklet body
        if isinstance(node, nodes.Tasklet):
            self.unparse_tasklet(sdfg, dfg, state_id, node, function_stream,
                                 callsite_stream)

        # Flush the output registers back to memory
        for wb_edge, wb_node in pending_writebacks:
            self.write_back(sdfg, dfg, state_id, node, wb_node, wb_edge,
                            function_stream, callsite_stream)

        callsite_stream.write('}')
Пример #5
0
    def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream,
                          callsite_stream):
        """Generate a tasklet as a braced block with its memlet handling.

        Emits, in order: the opening brace, copies for all connected inputs,
        copies that preallocate the connected outputs, the unparsed tasklet
        body, the outgoing-memlet processing of the CPU code generator, the
        "no dependence" post-processing for qualifying output arrays, and
        the closing brace.

        :raises SyntaxError: If two incoming edges use the same connector.
        :raises NotImplementedError: If an edge connects the tasklet
                                     directly to another code node.
        """
        # TODO: this is copy-pasta from the CPU-codegen, necessary to inject
        # pragmas at the output memlets! Should consolidate.

        callsite_stream.write('{\n', sdfg, state_id, node)

        state_dfg = sdfg.nodes()[state_id]

        self._dispatcher.defined_vars.enter_scope(node)

        # Connector names that have already been defined
        seen_connectors = set()

        for in_edge in dfg.in_edges(node):
            conn = in_edge.dst_conn
            if not conn:  # None or ""
                continue

            # The same input connector must not appear twice
            if conn in seen_connectors:
                raise SyntaxError('Duplicates found in memlets')

            # Special case: code->code
            if isinstance(in_edge.src, dace.sdfg.nodes.CodeNode):
                raise NotImplementedError(
                    "Tasklet to tasklet memlets not implemented")

            source = find_input_arraynode(state_dfg, in_edge)
            self._dispatcher.dispatch_copy(source, node, in_edge, sdfg,
                                           state_dfg, state_id,
                                           function_stream, callsite_stream)

            # Make the connector known to the C++ unparser scope as well
            self._cpu_codegen._locals.define(conn, -1,
                                             self._cpu_codegen._ldepth + 1)
            seen_connectors.add(conn)

        callsite_stream.write('\n', sdfg, state_id, node)

        # Outgoing edges preallocate the output local variables
        for out_edge in dfg.out_edges(node):
            conn = out_edge.src_conn
            if not conn:
                continue

            # Connectors that were already defined are silently skipped
            if conn in seen_connectors:
                continue

            # Special case: code->code
            if isinstance(out_edge.dst, dace.sdfg.nodes.CodeNode):
                raise NotImplementedError(
                    "Tasklet to tasklet memlets not implemented")

            target = find_output_arraynode(state_dfg, out_edge)
            self._dispatcher.dispatch_copy(node, target, out_edge, sdfg,
                                           state_dfg, state_id,
                                           function_stream, callsite_stream)

            # Make the connector known to the C++ unparser scope as well
            self._cpu_codegen._locals.define(conn, -1,
                                             self._cpu_codegen._ldepth + 1)
            seen_connectors.add(conn)

        callsite_stream.write("\n////////////////////\n", sdfg, state_id, node)

        cpp.unparse_tasklet(sdfg, state_id, dfg, node, function_stream,
                            callsite_stream, self._cpu_codegen._locals,
                            self._cpu_codegen._ldepth,
                            self._cpu_codegen._toplevel_schedule, self)

        callsite_stream.write("////////////////////\n\n", sdfg, state_id, node)

        # Let the CPU code generator handle the outgoing memlets
        self._cpu_codegen.process_out_memlets(sdfg,
                                              state_id,
                                              node,
                                              state_dfg,
                                              self._dispatcher,
                                              callsite_stream,
                                              True,
                                              function_stream,
                                              codegen=self)

        # Outputs without write-conflict resolution that are arrays in
        # FPGA_Local or FPGA_Registers storage get the "no dependence"
        # post-processing
        for out_edge in state_dfg.out_edges(node):
            datadesc = sdfg.arrays[out_edge.data.data]
            if not isinstance(datadesc, dace.data.Array):
                continue
            in_local_storage = (
                datadesc.storage == dace.dtypes.StorageType.FPGA_Local
                or datadesc.storage == dace.dtypes.StorageType.FPGA_Registers)
            if in_local_storage and out_edge.data.wcr is None:
                self.generate_no_dependence_post(out_edge.src_conn,
                                                 callsite_stream, sdfg,
                                                 state_id, node)

        callsite_stream.write('}\n', sdfg, state_id, node)

        self._dispatcher.defined_vars.exit_scope(node)
Пример #6
0
    def apply(self, sdfg):
        """Move the matched map scope (or reduce node) onto the GPU.

        The matched node's schedule becomes ``GPU_Device``. Every array that
        feeds the node, or is written by its exit node(s), and whose storage
        is not one of the GPU storage types receives a transient
        ``GPU_Global`` twin named ``'gpu_' + <array name>``. A fresh access
        node for the twin is spliced between the original access node and the
        matched node. For map matches (``expr_index == 0``) the memlets inside
        the map scope are finally renamed to point at the twins.

        When ``self.fullcopy`` is set, the memlet on the original-array side
        of each splice is widened to the full range of the twin's descriptor.

        The SDFG and the matched state are modified in place.
        """
        state = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            # Map match: schedule lives on the map object, exits are the
            # map's exit nodes
            target = state.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            scheduled = target.map
            exits = state.exit_nodes(target)
        else:
            # Reduce match: the node is its own entry and exit
            target = state.nodes()[self.subgraph[GPUTransformMap._reduce]]
            scheduled = target
            exits = [target]

        # Change schedule
        scheduled._schedule = types.ScheduleType.GPU_Device

        # Storage types that count as "already on the GPU"
        # (CPU_Pinned is not in this list)
        on_gpu = (
            types.StorageType.GPU_Global,
            types.StorageType.GPU_Shared,
            types.StorageType.GPU_Stack,
        )

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # Step 1: find the access nodes whose arrays must be cloned
        outgoing = [
            out_edge for exit_node in exits
            for out_edge in state.out_edges(exit_node)
        ]
        sources = (sd.find_input_arraynode(state, in_edge)
                   for in_edge in state.in_edges(target))
        inputs_off_gpu = {
            access
            for access in sources
            if access.desc(sdfg).storage not in on_gpu
        }
        sinks = (sd.find_output_arraynode(state, out_edge)
                 for out_edge in outgoing)
        outputs_off_gpu = {
            access
            for access in sinks
            if access.desc(sdfg).storage not in on_gpu
        }

        # Step 2: create a GPU twin of each array.
        # Maps original array name -> name of its GPU twin.
        gpu_names = {}

        def make_gpu_twin(access):
            # The twin's descriptor is registered only once per array name,
            # but every call returns a brand-new access node for it.
            desc = access.desc(sdfg)
            if access.data not in gpu_names:
                sdfg.add_array(
                    'gpu_' + access.data,
                    desc.shape,
                    desc.dtype,
                    materialize_func=desc.materialize_func,
                    transient=True,
                    storage=types.StorageType.GPU_Global,
                    allow_conflicts=desc.allow_conflicts,
                    access_order=desc.access_order,
                    strides=desc.strides,
                    offset=desc.offset)
                gpu_names[access.data] = 'gpu_' + access.data
            return type(access)('gpu_' + access.data)

        input_twins = {
            access.data: make_gpu_twin(access)
            for access in inputs_off_gpu
        }
        output_twins = {
            access.data: make_gpu_twin(access)
            for access in outputs_off_gpu
        }

        # Step 3: connect the twins to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for name, twin in input_twins.items():
            state.add_node(twin)
            for old in state.in_edges(target):
                if old.data.data != name:
                    continue
                state.remove_edge(old)

                # twin -> matched node, carrying a renamed copy of the memlet
                to_target = copy.copy(old.data)
                to_target.data = twin.data
                state.add_edge(twin, old.src_conn, old.dst, old.dst_conn,
                               to_target)

                # original source -> twin, reusing the original memlet object
                if self.fullcopy:
                    old.data.subset = sbs.Range.from_array(twin.desc(sdfg))
                old.data.other_subset = old.data.subset
                state.add_edge(old.src, None, twin, None, old.data)

        for name, twin in output_twins.items():
            state.add_node(twin)
            for old in outgoing:
                if old.data.data != name:
                    continue
                state.remove_edge(old)

                # exit node -> twin, carrying a renamed copy of the memlet
                from_target = copy.copy(old.data)
                from_target.data = twin.data
                state.add_edge(old.src, old.src_conn, twin, old.dst_conn,
                               from_target)

                # twin -> original destination; this copy-back edge carries
                # no write-conflict resolution
                old.data.wcr = None
                if self.fullcopy:
                    old.data.subset = sbs.Range.from_array(twin.desc(sdfg))
                old.data.other_subset = old.data.subset
                state.add_edge(twin, None, old.dst, None, old.data)

        # Step 4: inside a map scope, point memlets at the GPU twins
        if self.expr_index == 0:
            for inner in state.scope_subgraph(target).edges():
                referenced = inner.data.data
                if referenced is None or referenced not in gpu_names:
                    continue
                inner.data.data = gpu_names[referenced]
Пример #7
0
    def apply(self, sdfg):
        """Move the matched map scope (or reduce node) onto the GPU.

        The matched node's schedule becomes ``GPU_Device``. Every non-scalar
        array that feeds the node, or is written by its exit node(s), and
        whose storage is not one of the GPU storage types receives a
        transient ``GPU_Global`` twin named ``'gpu_' + <array name>``. A fresh
        access node for the twin is spliced between the original access node
        and the matched node.

        An output that is a stream viewing an array is not cloned itself:
        the array behind it is cloned, the stream's storage is set to
        ``GPU_Global`` and the stream is rerouted through the array's twin.

        For map matches (``expr_index == 0``) the memlets inside the map
        scope are finally renamed to point at the twins. When ``self.fullcopy``
        is set, the memlet on the original-array side of each splice is
        widened to the full range of the twin's descriptor. With the
        ``debugprint`` configuration flag enabled, the class-level counters
        ``_maps_transformed`` and ``_arrays_removed`` are updated.

        The SDFG and the matched state are modified in place.

        Raises:
            TypeError: if a stream-array view output is not transient.
        """
        state = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            # Map match: schedule lives on the map object, exits are the
            # map's exit nodes
            target = state.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            scheduled = target.map
            exits = state.exit_nodes(target)
        else:
            # Reduce match: the node is its own entry and exit
            target = state.nodes()[self.subgraph[GPUTransformMap._reduce]]
            scheduled = target
            exits = [target]

        # Change schedule
        scheduled._schedule = dtypes.ScheduleType.GPU_Device
        if Config.get_bool("debugprint"):
            GPUTransformMap._maps_transformed += 1

        # Storage types that count as "already on the GPU"
        # (CPU_Pinned is not in this list)
        on_gpu = (
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack,
        )

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # Step 1: find the access nodes whose arrays must be cloned
        outgoing = [
            out_edge for exit_node in exits
            for out_edge in state.out_edges(exit_node)
        ]
        inputs_off_gpu = set()
        outputs_off_gpu = set()
        # Maps the access node of a viewed array -> the stream node viewing it
        stream_of_array = {}

        for in_edge in state.in_edges(target):
            access = sd.find_input_arraynode(state, in_edge)
            if isinstance(access.desc(sdfg), data.Scalar):
                continue
            if access.desc(sdfg).storage in on_gpu:
                continue
            inputs_off_gpu.add(access)

        for out_edge in outgoing:
            access = sd.find_output_arraynode(state, out_edge)
            if isinstance(access.desc(sdfg), data.Scalar):
                continue
            if access.desc(sdfg).storage in on_gpu:
                continue

            if not sd.is_array_stream_view(sdfg, state, access):
                outputs_off_gpu.add(access)
                continue

            # Stream directly connected to an array
            if access.desc(sdfg).transient is False:
                raise TypeError(
                    'Non-transient stream-array view are unsupported')
            # Clone the array behind the stream instead of the stream itself
            viewed = state.out_edges(access)[0].dst
            outputs_off_gpu.add(viewed)
            stream_of_array[viewed] = access

        if Config.get_bool("debugprint"):
            GPUTransformMap._arrays_removed += (len(inputs_off_gpu) +
                                                len(outputs_off_gpu))

        # Step 2: create a GPU twin of each array.
        # Maps original array name -> name of its GPU twin.
        gpu_names = {}

        def make_gpu_twin(access):
            # The twin's descriptor is registered only once per array name,
            # but every call returns a brand-new access node for it.
            desc = access.desc(sdfg)
            if access.data not in gpu_names:
                twin_desc = desc.clone()
                twin_desc.storage = dtypes.StorageType.GPU_Global
                twin_desc.transient = True
                sdfg.add_datadesc('gpu_' + access.data, twin_desc)
                gpu_names[access.data] = 'gpu_' + access.data
            return type(access)('gpu_' + access.data)

        input_twins = {
            access.data: make_gpu_twin(access)
            for access in inputs_off_gpu
        }
        output_twins = {
            access.data: make_gpu_twin(access)
            for access in outputs_off_gpu
        }

        # Step 3: connect the twins to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for name, twin in input_twins.items():
            state.add_node(twin)
            for old in state.in_edges(target):
                if old.data.data != name:
                    continue
                state.remove_edge(old)

                # twin -> matched node, carrying a renamed copy of the memlet
                to_target = copy.copy(old.data)
                to_target.data = twin.data
                state.add_edge(twin, old.src_conn, old.dst, old.dst_conn,
                               to_target)

                # original source -> twin, reusing the original memlet object
                if self.fullcopy:
                    old.data.subset = sbs.Range.from_array(twin.desc(sdfg))
                old.data.other_subset = old.data.subset
                state.add_edge(old.src, None, twin, None, old.data)

        for name, twin in output_twins.items():
            state.add_node(twin)
            for old in outgoing:
                if old.data.data != name:
                    continue
                state.remove_edge(old)

                # exit node -> twin, carrying a renamed copy of the memlet
                from_target = copy.copy(old.data)
                from_target.data = twin.data
                state.add_edge(old.src, old.src_conn, twin, old.dst_conn,
                               from_target)

                # twin -> original destination; this copy-back edge carries
                # no write-conflict resolution
                old.data.wcr = None
                if self.fullcopy:
                    old.data.subset = sbs.Range.from_array(twin.desc(sdfg))
                old.data.other_subset = old.data.subset
                state.add_edge(twin, None, old.dst, None, old.data)

        # Reconnect stream-arrays
        for viewed, stream in stream_of_array.items():
            # Set stream storage to GPU
            stream.desc(sdfg).storage = dtypes.StorageType.GPU_Global

            twin = output_twins[viewed.data]

            view_edge = state.out_edges(stream)[0]
            state.remove_edge(view_edge)
            to_twin = copy.copy(view_edge.data)
            to_twin.data = twin.data
            # stream -> cloned array
            state.add_edge(view_edge.src, view_edge.src_conn, twin,
                           view_edge.dst_conn, to_twin)
            # cloned array -> array
            state.add_nedge(twin, viewed, view_edge.data)

        # Step 4: inside a map scope, point memlets at the GPU twins
        if self.expr_index == 0:
            for inner in state.scope_subgraph(target).edges():
                referenced = inner.data.data
                if referenced is None or referenced not in gpu_names:
                    continue
                inner.data.data = gpu_names[referenced]