Example #1
0
 def on_node_begin(self,
                   sdfg,
                   state,
                   node,
                   outer_stream,
                   inner_stream=None,
                   global_stream=None):
     """Record a 'begin' GPU event for an instrumented host-side code node."""
     # Device-level and non-code nodes are not instrumented here.
     if not isinstance(node, nodes.CodeNode):
         return
     if is_devicelevel_gpu(sdfg, state, node):
         return
     # Only run for host nodes
     # TODO(later): Implement "clock64"-based GPU counters
     if node.instrument != dtypes.InstrumentationType.GPU_Events:
         return
     try:
         gpu_id = node.location['gpu']
     except KeyError:
         # Node is not pinned to a specific GPU; nothing to record.
         return
     state_id = sdfg.node_id(state)
     event_id = 'b' + self._idstr(sdfg, state, node)
     cuda_stream = node._cuda_stream[gpu_id]
     if self.debug:
         outer_stream.write(f"\n// node begin: {node}: {gpu_id}\n", sdfg,
                            state_id, node)
     outer_stream.write(self._record_event(event_id, cuda_stream, gpu_id),
                        sdfg, state_id, node)
Example #2
0
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """Check whether the matched subgraph (a map or a reduce node,
        depending on ``expr_index``) may be transformed to run on a GPU."""
        if expr_index == 1:
            reduce_node = graph.nodes()[candidate[GPUTransformMap._reduce]]
            # Disallow GPU transformation if already in device-level code
            return not sd.is_devicelevel_gpu(sdfg, graph, reduce_node)
        if expr_index != 0:
            return None

        map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]

        # Map schedules that are disallowed to transform to GPUs
        disallowed = [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES
        if map_entry.map.schedule in disallowed:
            return False
        if sd.is_devicelevel_gpu(sdfg, graph, map_entry):
            return False

        # Dynamic map ranges cannot become kernels
        if sd.has_dynamic_map_inputs(graph, map_entry):
            return False

        # Ensure that map does not include internal arrays that are
        # allocated on non-default space
        allowed_storage = (dtypes.StorageType.Default,
                           dtypes.StorageType.Register)
        for inner in graph.scope_subgraph(map_entry).nodes():
            if (isinstance(inner, nodes.AccessNode)
                    and inner.desc(sdfg).storage not in allowed_storage):
                return False

        # If one of the outputs is a stream, do not match
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            sink = graph.memlet_path(edge)[-1].dst
            if (isinstance(sink, nodes.AccessNode)
                    and isinstance(sdfg.arrays[sink.data], data.Stream)):
                return False

        return True
Example #3
0
 def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream,
                   global_stream):
     """Record a 'begin' GPU event for an instrumented host-side code node."""
     # Skip non-code nodes and anything running in device-level code.
     if not isinstance(node, nodes.CodeNode):
         return
     if is_devicelevel_gpu(sdfg, state, node):
         return
     # Only run for host nodes
     # TODO(later): Implement "clock64"-based GPU counters
     if node.instrument != dtypes.InstrumentationType.GPU_Events:
         return
     event_id = 'b' + self._idstr(sdfg, state, node)
     outer_stream.write(self._record_event(event_id, node._cuda_stream),
                        sdfg, sdfg.node_id(state), node)
Example #4
0
def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream):
    """Emit stream-synchronization calls before a host node that consumes
    data produced on GPU streams.

    For every incoming edge whose source carries a ``_cuda_stream``
    attribute, writes a ``<backend>StreamSynchronize`` call into
    ``callsite_stream``. Skipped entirely when the node itself has a
    stream assigned or runs in device-level code.
    """
    state_dfg = sdfg.nodes()[state_id]
    if hasattr(node, "_cuda_stream") or is_devicelevel_gpu(
            sdfg, state_dfg, node):
        return
    # Use the configured GPU backend prefix ("cuda" or "hip") instead of
    # hard-coding "cuda", consistent with the rest of the code generator.
    backend = Config.get('compiler', 'cuda', 'backend')
    for e in state_dfg.in_edges(node):
        if hasattr(e.src, "_cuda_stream"):
            cudastream = "dace::cuda::__streams[%d]" % e.src._cuda_stream
            callsite_stream.write(
                "%sStreamSynchronize(%s);" % (backend, cudastream),
                sdfg,
                state_id,
                [e.src, e.dst],
            )
Example #5
0
def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream):
    """Write backend stream-synchronization calls for every incoming edge
    of ``node`` whose source has a GPU stream assigned.

    Does nothing when the node itself already has a ``_cuda_stream`` or
    is located in device-level code.
    """
    state_dfg = sdfg.nodes()[state_id]
    if hasattr(node, "_cuda_stream"):
        return
    if is_devicelevel_gpu(sdfg, state_dfg, node):
        return
    # Backend prefix is "cuda" or "hip", depending on configuration.
    backend = Config.get('compiler', 'cuda', 'backend')
    for edge in state_dfg.in_edges(node):
        if not hasattr(edge.src, "_cuda_stream"):
            continue
        stream_expr = "dace::cuda::__streams[%d]" % edge.src._cuda_stream
        callsite_stream.write(
            "%sStreamSynchronize(%s);" % (backend, stream_expr),
            sdfg,
            state_id,
            [edge.src, edge.dst],
        )
Example #6
0
 def on_node_end(self, sdfg, state, node, outer_stream, inner_stream,
                 global_stream):
     """Record the 'end' GPU event and emit a timing report for an
     instrumented host-side tasklet."""
     # Ignore non-tasklets and device-level code.
     if not isinstance(node, nodes.Tasklet):
         return
     if is_devicelevel_gpu(sdfg, state, node):
         return
     # Only run for host nodes
     # TODO(later): Implement "clock64"-based GPU counters
     if node.instrument != dtypes.InstrumentationType.GPU_Events:
         return
     state_id = sdfg.node_id(state)
     event_id = 'e' + self._idstr(sdfg, state, node)
     # Fall back to stream -1 when the node has no assigned GPU stream.
     cuda_stream = getattr(node, '_cuda_stream', -1)
     outer_stream.write(self._record_event(event_id, cuda_stream), sdfg,
                        state_id, node)
     label = '%s %s' % (type(node).__name__, node.label)
     outer_stream.write(self._report(label, sdfg, state, node), sdfg,
                        state_id, node)
Example #7
0
    def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.AccessNode,
                    outer_stream: CodeIOStream, inner_stream: CodeIOStream,
                    global_stream: CodeIOStream):
        """Emit host-side code that serializes the data behind an access node.

        Writes a ``__state->serializer->save(...)`` call into
        ``inner_stream`` with the data pointer, size, name, a UUID built
        from SDFG/state/node ids, and runtime shape/strides. GPU-global
        data is first copied to the host via a generated pre/postamble.
        Does nothing in device-level (GPU/FPGA) code.
        """
        from dace.codegen.dispatcher import DefinedType  # Avoid import loop

        if is_devicelevel_gpu(sdfg, state, node) or is_devicelevel_fpga(
                sdfg, state, node):
            # Only run on host code
            return

        desc = node.desc(sdfg)

        # Obtain a pointer for arrays and scalars
        # (scalars are defined as values, so take their address to get a
        # pointer the serializer can read from)
        ptrname = cpp.ptr(node.data, desc, sdfg, self.codegen)
        defined_type, _ = self.codegen.dispatcher.defined_vars.get(ptrname)
        if defined_type == DefinedType.Scalar:
            ptrname = '&' + ptrname

        # Create UUID
        state_id = sdfg.node_id(state)
        node_id = state.node_id(node)
        uuid = f'{sdfg.sdfg_id}_{state_id}_{node_id}'

        # Get optional pre/postamble for instrumenting device data
        # NOTE: _generate_copy_to_host redirects ptrname to the host copy
        preamble, postamble = '', ''
        if desc.storage == dtypes.StorageType.GPU_Global:
            self._setup_gpu_runtime(sdfg, global_stream)
            preamble, postamble, ptrname = self._generate_copy_to_host(
                node, desc, ptrname)

        # Encode runtime shape and strides
        shape = ', '.join(cpp.sym2cpp(s) for s in desc.shape)
        strides = ', '.join(cpp.sym2cpp(s) for s in desc.strides)

        # Write code
        inner_stream.write(preamble, sdfg, state_id, node_id)
        inner_stream.write(
            f'__state->serializer->save({ptrname}, {cpp.sym2cpp(desc.total_size - desc.start_offset)}, '
            f'"{node.data}", "{uuid}", {shape}, {strides});\n', sdfg, state_id,
            node_id)
        inner_stream.write(postamble, sdfg, state_id, node_id)
Example #8
0
def unparse_tasklet(sdfg, state_id, dfg, node, function_stream,
                    callsite_stream, locals, ldepth, toplevel_schedule,
                    codegen):
    """Generate C++ code for a tasklet's body into ``callsite_stream``.

    C++ tasklets are emitted verbatim (with optional GPU stream setup and
    synchronization); Python tasklets are translated statement-by-statement
    via ``DaCeKeywordRemover`` and ``cppunparse``. Returns ``""`` for
    nameless or empty tasklets, otherwise ``None``.
    """

    if node.label is None or node.label == "":
        return ""

    state_dfg = sdfg.nodes()[state_id]

    # Not [], "" or None
    if not node.code:
        return ""

    # If raw C++ code, return the code directly
    if node.language != dtypes.Language.Python:
        # If this code runs on the host and is associated with a GPU stream,
        # set the stream to a local variable.
        max_streams = int(
            Config.get("compiler", "cuda", "max_concurrent_streams"))
        if (max_streams >= 0 and not is_devicelevel_gpu(sdfg, state_dfg, node)
                and hasattr(node, "_cuda_stream")):
            callsite_stream.write(
                'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = dace::cuda::__streams[__dace_current_stream_id];'
                %
                (node._cuda_stream, Config.get('compiler', 'cuda', 'backend')),
                sdfg,
                state_id,
                node,
            )

        if node.language != dtypes.Language.CPP:
            raise ValueError(
                "Only Python or C++ code supported in CPU codegen, got: {}".
                format(node.language))
        callsite_stream.write(
            type(node).__properties__["code"].to_string(node.code), sdfg,
            state_id, node)

        # Host-side tasklets with a GPU stream must wait on producer streams.
        if hasattr(node, "_cuda_stream") and not is_devicelevel_gpu(
                sdfg, state_dfg, node):
            synchronize_streams(sdfg, state_dfg, state_id, node, node,
                                callsite_stream)
        return

    body = node.code.code

    # Map local names to memlets (for WCR detection)
    # memlets: connector name -> (memlet, is_non-conflicting, wcr, conn type)
    memlets = {}
    for edge in state_dfg.all_edges(node):
        u, uconn, v, vconn, memlet = edge
        if u == node:
            # Output connector: record write-conflict status and WCR.
            memlet_nc = not is_write_conflicted(
                dfg, edge, sdfg_schedule=toplevel_schedule)
            memlet_wcr = memlet.wcr
            if uconn in u.out_connectors:
                conntype = u.out_connectors[uconn]
            else:
                conntype = None

            memlets[uconn] = (memlet, memlet_nc, memlet_wcr, conntype)
        elif v == node:
            # Input connector: no conflict/WCR tracking needed for reads.
            if vconn in v.in_connectors:
                conntype = v.in_connectors[vconn]
            else:
                conntype = None

            memlets[vconn] = (memlet, False, None, conntype)

    callsite_stream.write("// Tasklet code (%s)\n" % node.label, sdfg,
                          state_id, node)
    # Translate each Python statement: rewrite struct initializers, strip
    # DaCe-specific constructs, then unparse to C++.
    for stmt in body:
        stmt = copy.deepcopy(stmt)
        rk = StructInitializer(sdfg).visit(stmt)
        if isinstance(stmt, ast.Expr):
            rk = DaCeKeywordRemover(sdfg, memlets, sdfg.constants,
                                    codegen).visit_TopLevelExpr(stmt)
        else:
            rk = DaCeKeywordRemover(sdfg, memlets, sdfg.constants,
                                    codegen).visit(stmt)

        if rk is not None:
            # Unparse to C++ and add 'auto' declarations if locals not declared
            result = StringIO()
            cppunparse.CPPUnparser(rk, ldepth + 1, locals, result)
            callsite_stream.write(result.getvalue(), sdfg, state_id, node)