def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream=None, global_stream=None):
    if (not isinstance(node, nodes.CodeNode)
            or is_devicelevel_gpu(sdfg, state, node)):
        return  # Only run for host nodes
    # TODO(later): Implement "clock64"-based GPU counters
    if node.instrument == dtypes.InstrumentationType.GPU_Events:
        try:
            gpu_id = node.location['gpu']
        except KeyError:
            return
        state_id = sdfg.node_id(state)
        idstr = 'b' + self._idstr(sdfg, state, node)
        stream = node._cuda_stream[gpu_id]
        if self.debug:
            outer_stream.write(f"\n// node begin: {node}: {gpu_id}\n", sdfg, state_id, node)
        outer_stream.write(self._record_event(idstr, stream, gpu_id), sdfg, state_id, node)

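# Usage sketch (illustrative, not part of the codegen source). Assumptions:
# a recent DaCe release exposing `apply_gpu_transformations` and the
# `InstrumentationType` enum at package level. Setting GPU_Events on nodes
# is what makes hooks like on_node_begin above emit event-recording code;
# code nodes are handled by the node hooks shown here, map scopes by
# companion scope hooks not shown in this excerpt.
import dace
from dace.sdfg import nodes

@dace.program
def axpy(A: dace.float64[1000], B: dace.float64[1000]):
    for i in dace.map[0:1000]:
        B[i] += 2.0 * A[i]

sdfg = axpy.to_sdfg()
sdfg.apply_gpu_transformations()
for state in sdfg.nodes():
    for n in state.nodes():
        if isinstance(n, (nodes.EntryNode, nodes.CodeNode)):
            n.instrument = dace.InstrumentationType.GPU_Events
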
def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
    if expr_index == 0:
        map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
        candidate_map = map_entry.map

        # Disallow map schedules that cannot be transformed to GPU schedules
        if candidate_map.schedule in ([dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES):
            return False
        if sd.is_devicelevel_gpu(sdfg, graph, map_entry):
            return False

        # Dynamic map ranges cannot become kernels
        if sd.has_dynamic_map_inputs(graph, map_entry):
            return False

        # Ensure that the map does not include internal arrays that are
        # allocated in a non-default storage space
        subgraph = graph.scope_subgraph(map_entry)
        for node in subgraph.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).storage != dtypes.StorageType.Default
                    and node.desc(sdfg).storage != dtypes.StorageType.Register):
                return False

        # If one of the outputs is a stream, do not match
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            dst = graph.memlet_path(edge)[-1].dst
            if (isinstance(dst, nodes.AccessNode)
                    and isinstance(sdfg.arrays[dst.data], data.Stream)):
                return False

        return True
    elif expr_index == 1:
        reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

        # Disallow GPU transformation if already in device-level code
        if sd.is_devicelevel_gpu(sdfg, graph, reduce):
            return False

        return True

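# Usage sketch (illustrative; assumes `sdfg` is built as in the earlier
# sketch, before apply_gpu_transformations). The matcher above is invoked
# once per candidate subgraph: expression 0 matches a map entry,
# expression 1 a reduce node. apply_transformations returns the number of
# successful applications.
from dace.transformation.dataflow import GPUTransformMap

applied = sdfg.apply_transformations(GPUTransformMap)
print(f'GPUTransformMap applied {applied} time(s)')
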
def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream, global_stream):
    if (not isinstance(node, nodes.CodeNode)
            or is_devicelevel_gpu(sdfg, state, node)):
        return  # Only run for host nodes
    # TODO(later): Implement "clock64"-based GPU counters
    if node.instrument == dtypes.InstrumentationType.GPU_Events:
        state_id = sdfg.node_id(state)
        idstr = 'b' + self._idstr(sdfg, state, node)
        outer_stream.write(self._record_event(idstr, node._cuda_stream), sdfg, state_id, node)

def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream):
    state_dfg = sdfg.nodes()[state_id]
    if hasattr(node, "_cuda_stream") or is_devicelevel_gpu(sdfg, state_dfg, node):
        return
    for e in state_dfg.in_edges(node):
        if hasattr(e.src, "_cuda_stream"):
            cudastream = "dace::cuda::__streams[%d]" % e.src._cuda_stream
            callsite_stream.write(
                "cudaStreamSynchronize(%s);" % cudastream,
                sdfg,
                state_id,
                [e.src, e.dst],
            )

def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream):
    state_dfg = sdfg.nodes()[state_id]
    if hasattr(node, "_cuda_stream") or is_devicelevel_gpu(sdfg, state_dfg, node):
        return
    backend = Config.get('compiler', 'cuda', 'backend')
    for e in state_dfg.in_edges(node):
        if hasattr(e.src, "_cuda_stream"):
            cudastream = "dace::cuda::__streams[%d]" % e.src._cuda_stream
            callsite_stream.write(
                "%sStreamSynchronize(%s);" % (backend, cudastream),
                sdfg,
                state_id,
                [e.src, e.dst],
            )

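# Illustration (pure string formatting, mirroring the write above): the
# compiler.cuda.backend option selects the runtime-call prefix, so the same
# template serves both the CUDA and HIP backends.
for backend in ('cuda', 'hip'):
    print("%sStreamSynchronize(%s);" % (backend, "dace::cuda::__streams[0]"))
# -> cudaStreamSynchronize(dace::cuda::__streams[0]);
# -> hipStreamSynchronize(dace::cuda::__streams[0]);
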
def on_node_end(self, sdfg, state, node, outer_stream, inner_stream, global_stream):
    if (not isinstance(node, nodes.Tasklet)
            or is_devicelevel_gpu(sdfg, state, node)):
        return  # Only run for host nodes
    # TODO(later): Implement "clock64"-based GPU counters
    if node.instrument == dtypes.InstrumentationType.GPU_Events:
        state_id = sdfg.node_id(state)
        idstr = 'e' + self._idstr(sdfg, state, node)
        stream = getattr(node, '_cuda_stream', -1)
        outer_stream.write(self._record_event(idstr, stream), sdfg, state_id, node)
        outer_stream.write(
            self._report('%s %s' % (type(node).__name__, node.label), sdfg, state, node),
            sdfg, state_id, node)

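# Usage sketch (illustrative; continues the earlier axpy sketch). After
# execution, each 'b'/'e' event pair recorded by on_node_begin/on_node_end
# surfaces as an elapsed-time entry in the instrumentation report.
# Assumption: `get_latest_report` is the report accessor in this DaCe version.
import numpy as np

A = np.random.rand(1000)
B = np.random.rand(1000)
compiled = sdfg.compile()
compiled(A=A, B=B)
print(sdfg.get_latest_report())
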
def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.AccessNode,
                outer_stream: CodeIOStream, inner_stream: CodeIOStream,
                global_stream: CodeIOStream):
    from dace.codegen.dispatcher import DefinedType  # Avoid import loop

    if is_devicelevel_gpu(sdfg, state, node) or is_devicelevel_fpga(sdfg, state, node):
        # Only run on host code
        return

    desc = node.desc(sdfg)

    # Obtain a pointer for arrays and scalars
    ptrname = cpp.ptr(node.data, desc, sdfg, self.codegen)
    defined_type, _ = self.codegen.dispatcher.defined_vars.get(ptrname)
    if defined_type == DefinedType.Scalar:
        ptrname = '&' + ptrname

    # Create UUID
    state_id = sdfg.node_id(state)
    node_id = state.node_id(node)
    uuid = f'{sdfg.sdfg_id}_{state_id}_{node_id}'

    # Get optional pre/postamble for instrumenting device data
    preamble, postamble = '', ''
    if desc.storage == dtypes.StorageType.GPU_Global:
        self._setup_gpu_runtime(sdfg, global_stream)
        preamble, postamble, ptrname = self._generate_copy_to_host(node, desc, ptrname)

    # Encode runtime shape and strides
    shape = ', '.join(cpp.sym2cpp(s) for s in desc.shape)
    strides = ', '.join(cpp.sym2cpp(s) for s in desc.strides)

    # Write code
    inner_stream.write(preamble, sdfg, state_id, node_id)
    inner_stream.write(
        f'__state->serializer->save({ptrname}, {cpp.sym2cpp(desc.total_size - desc.start_offset)}, '
        f'"{node.data}", "{uuid}", {shape}, {strides});\n', sdfg, state_id, node_id)
    inner_stream.write(postamble, sdfg, state_id, node_id)

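# Usage sketch (assumption: this DaCe version exposes data instrumentation
# via a `DataInstrumentationType` enum). Marking an access node with Save
# is what triggers the hook above to emit the serializer->save(...) call,
# keyed by the node's UUID, for that array.
from dace.sdfg import nodes

for state in sdfg.nodes():
    for n in state.nodes():
        if isinstance(n, nodes.AccessNode) and n.data == 'B':
            n.instrument = dace.DataInstrumentationType.Save
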
def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream,
                    locals, ldepth, toplevel_schedule, codegen):
    if node.label is None or node.label == "":
        return ""

    state_dfg = sdfg.nodes()[state_id]

    # Not [], "" or None
    if not node.code:
        return ""

    # If raw C++ code, return the code directly
    if node.language != dtypes.Language.Python:
        # If this code runs on the host and is associated with a GPU stream,
        # set the stream to a local variable.
        max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams"))
        if (max_streams >= 0 and not is_devicelevel_gpu(sdfg, state_dfg, node)
                and hasattr(node, "_cuda_stream")):
            callsite_stream.write(
                'int __dace_current_stream_id = %d;\n'
                '%sStream_t __dace_current_stream = '
                'dace::cuda::__streams[__dace_current_stream_id];' %
                (node._cuda_stream, Config.get('compiler', 'cuda', 'backend')),
                sdfg,
                state_id,
                node,
            )

        if node.language != dtypes.Language.CPP:
            raise ValueError(
                "Only Python or C++ code supported in CPU codegen, got: {}".format(
                    node.language))
        callsite_stream.write(
            type(node).__properties__["code"].to_string(node.code), sdfg, state_id, node)

        if hasattr(node, "_cuda_stream") and not is_devicelevel_gpu(sdfg, state_dfg, node):
            synchronize_streams(sdfg, state_dfg, state_id, node, node, callsite_stream)
        return

    body = node.code.code

    # Map local names to memlets (for WCR detection)
    memlets = {}
    for edge in state_dfg.all_edges(node):
        u, uconn, v, vconn, memlet = edge
        if u == node:
            memlet_nc = not is_write_conflicted(dfg, edge, sdfg_schedule=toplevel_schedule)
            memlet_wcr = memlet.wcr
            if uconn in u.out_connectors:
                conntype = u.out_connectors[uconn]
            else:
                conntype = None
            memlets[uconn] = (memlet, memlet_nc, memlet_wcr, conntype)
        elif v == node:
            if vconn in v.in_connectors:
                conntype = v.in_connectors[vconn]
            else:
                conntype = None
            memlets[vconn] = (memlet, False, None, conntype)

    callsite_stream.write("// Tasklet code (%s)\n" % node.label, sdfg, state_id, node)
    for stmt in body:
        stmt = copy.deepcopy(stmt)
        # StructInitializer rewrites the statement AST in place; its return
        # value is superseded by the DaCeKeywordRemover pass below.
        rk = StructInitializer(sdfg).visit(stmt)
        if isinstance(stmt, ast.Expr):
            rk = DaCeKeywordRemover(sdfg, memlets, sdfg.constants, codegen).visit_TopLevelExpr(stmt)
        else:
            rk = DaCeKeywordRemover(sdfg, memlets, sdfg.constants, codegen).visit(stmt)

        if rk is not None:
            # Unparse to C++ and add 'auto' declarations if locals not declared
            result = StringIO()
            cppunparse.CPPUnparser(rk, ldepth + 1, locals, result)
            callsite_stream.write(result.getvalue(), sdfg, state_id, node)

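# Usage sketch (illustrative; `state` is any SDFGState, e.g. sdfg.add_state()).
# The two tasklet flavors this function handles: Python tasklets go through
# the AST-rewriting path above, while CPP tasklets are emitted verbatim,
# preceded by the optional GPU-stream preamble.
t_py = state.add_tasklet('t_py', {'a'}, {'b'}, 'b = a + 1')
t_cpp = state.add_tasklet('t_cpp', {'a'}, {'b'}, 'b = a + 1;',
                          language=dace.Language.CPP)
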