def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.AccessNode, outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream): from dace.codegen.dispatcher import DefinedType # Avoid import loop if is_devicelevel_gpu(sdfg, state, node) or is_devicelevel_fpga( sdfg, state, node): # Only run on host code return desc = node.desc(sdfg) # Obtain a pointer for arrays and scalars ptrname = cpp.ptr(node.data, desc, sdfg, self.codegen) defined_type, _ = self.codegen.dispatcher.defined_vars.get(ptrname) if defined_type == DefinedType.Scalar: ptrname = '&' + ptrname # Create UUID state_id = sdfg.node_id(state) node_id = state.node_id(node) uuid = f'{sdfg.sdfg_id}_{state_id}_{node_id}' # Get optional pre/postamble for instrumenting device data preamble, postamble = '', '' if desc.storage == dtypes.StorageType.GPU_Global: self._setup_gpu_runtime(sdfg, global_stream) preamble, postamble, ptrname = self._generate_copy_to_host( node, desc, ptrname) # Encode runtime shape and strides shape = ', '.join(cpp.sym2cpp(s) for s in desc.shape) strides = ', '.join(cpp.sym2cpp(s) for s in desc.strides) # Write code inner_stream.write(preamble, sdfg, state_id, node_id) inner_stream.write( f'__state->serializer->save({ptrname}, {cpp.sym2cpp(desc.total_size - desc.start_offset)}, ' f'"{node.data}", "{uuid}", {shape}, {strides});\n', sdfg, state_id, node_id) inner_stream.write(postamble, sdfg, state_id, node_id)
def apply(self, graph: SDFGState, sdfg: SDFG):
    """Tile the matched map by applying StripMining once per dimension.

    Each map dimension is strip-mined with its corresponding entry from
    ``self.tile_sizes`` (the last entry is reused for extra dimensions), then
    consecutive outer maps are merged with MapCollapse so a single tiled outer
    map remains. Dimensions whose tile size equals the full range are skipped;
    tile size 1 (with stride 1) removes the inner map entirely unless
    ``self.tile_trivial`` is set.

    :param graph: The state containing the matched map.
    :param sdfg: The SDFG containing ``graph``.
    :return: The entry node of the last collapsed outer map (or None if no
             collapse took place).
    """
    tile_strides = self.tile_sizes
    # Strides default to the tile sizes unless explicitly provided with
    # matching dimensionality
    if self.strides is not None and len(self.strides) == len(tile_strides):
        tile_strides = self.strides

    # Retrieve map entry and exit nodes.
    map_entry = self.map_entry

    # Avoid import loops
    from dace.transformation.dataflow.map_collapse import MapCollapse
    from dace.transformation.dataflow.strip_mining import StripMining

    stripmine_subgraph = {
        StripMining.map_entry: self.subgraph[MapTiling.map_entry]
    }

    sdfg_id = sdfg.sdfg_id
    last_map_entry = None
    removed_maps = 0  # tracks trivially-removed inner maps to shift dim_idx

    original_schedule = map_entry.schedule

    for dim_idx in range(len(map_entry.map.params)):
        # Reuse the last tile size/stride for dimensions beyond the list
        if dim_idx >= len(self.tile_sizes):
            tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[-1])
            tile_stride = symbolic.pystr_to_symbolic(tile_strides[-1])
        else:
            tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[dim_idx])
            tile_stride = symbolic.pystr_to_symbolic(tile_strides[dim_idx])

        # handle offsets (same last-entry reuse rule as tile sizes)
        if self.tile_offset and dim_idx >= len(self.tile_offset):
            offset = self.tile_offset[-1]
        elif self.tile_offset:
            offset = self.tile_offset[dim_idx]
        else:
            offset = 0

        # Account for inner maps removed by earlier trivial tiles
        dim_idx -= removed_maps
        # If tile size is trivial, skip strip-mining map dimension
        if tile_size == map_entry.map.range.size()[dim_idx]:
            continue

        stripmine = StripMining(sdfg, sdfg_id, self.state_id, stripmine_subgraph,
                                self.expr_index)

        # Special case: Tile size of 1 should be omitted from inner map
        if tile_size == 1 and tile_stride == 1 and not self.tile_trivial:
            stripmine.dim_idx = dim_idx
            stripmine.new_dim_prefix = ''
            stripmine.tile_size = str(tile_size)
            stripmine.tile_stride = str(tile_stride)
            stripmine.divides_evenly = True
            stripmine.tile_offset = str(offset)
            stripmine.apply(graph, sdfg)
            removed_maps += 1
        else:
            stripmine.dim_idx = dim_idx
            stripmine.new_dim_prefix = self.prefix
            stripmine.tile_size = str(tile_size)
            stripmine.tile_stride = str(tile_stride)
            stripmine.divides_evenly = self.divides_evenly
            stripmine.tile_offset = str(offset)
            stripmine.apply(graph, sdfg)

        # apply to the new map the schedule of the original one
        map_entry.schedule = original_schedule

        # Collapse the newly created outer map with the previous one so a
        # single tiled outer map remains
        if last_map_entry:
            new_map_entry = graph.in_edges(map_entry)[0].src
            mapcollapse_subgraph = {
                MapCollapse.outer_map_entry: graph.node_id(last_map_entry),
                MapCollapse.inner_map_entry: graph.node_id(new_map_entry)
            }
            mapcollapse = MapCollapse(sdfg, sdfg_id, self.state_id,
                                      mapcollapse_subgraph, 0)
            mapcollapse.apply(graph, sdfg)
        last_map_entry = graph.in_edges(map_entry)[0].src
    return last_map_entry
def apply(self, graph: SDFGState, sdfg: SDFG):
    """Distribute the matched map over MPI ranks.

    Strip-mines the map into an outer "mpi" map of size
    ``range / __dace_comm_size`` (scheduled as ``ScheduleType.MPI``) and an
    inner map, then inserts local (transient) storage between the two levels
    for every input and output array so each rank operates on its own chunk.

    :param graph: The state containing the matched map.
    :param sdfg: The SDFG containing ``graph``.
    """
    map_entry = self.map_entry

    # Avoiding import loops
    from dace.transformation.dataflow.strip_mining import StripMining
    from dace.transformation.dataflow.local_storage import InLocalStorage, OutLocalStorage, LocalStorage

    rangeexpr = str(map_entry.map.range.num_elements())

    stripmine_subgraph = {
        StripMining.map_entry: self.subgraph[MPITransformMap.map_entry]
    }
    sdfg_id = sdfg.sdfg_id
    stripmine = StripMining(sdfg, sdfg_id, self.state_id, stripmine_subgraph,
                            self.expr_index)
    stripmine.dim_idx = -1
    stripmine.new_dim_prefix = "mpi"
    # Each rank receives an equal share of the iteration space
    stripmine.tile_size = "(" + rangeexpr + "/__dace_comm_size)"
    stripmine.divides_evenly = True
    stripmine.apply(graph, sdfg)

    # Find all in-edges that lead to the map entry; strip-mining placed the
    # new outer map directly above it
    edges = [
        e for e in graph.in_edges(map_entry)
        if isinstance(e.src, nodes.EntryNode)
    ]
    outer_map = edges[0].src

    # Add MPI schedule attribute to outer map
    outer_map.map._schedule = dtypes.ScheduleType.MPI

    # Now create a transient for each input array between outer and inner map
    for e in edges:
        in_local_storage_subgraph = {
            LocalStorage.node_a: graph.node_id(outer_map),
            LocalStorage.node_b: self.subgraph[MPITransformMap.map_entry]
        }
        in_local_storage = InLocalStorage(sdfg, sdfg_id, self.state_id,
                                          in_local_storage_subgraph,
                                          self.expr_index)
        in_local_storage.array = e.data.data
        in_local_storage.apply(graph, sdfg)

    # Transform OutLocalStorage for each output of the MPI map
    in_map_exit = graph.exit_node(map_entry)
    out_map_exit = graph.exit_node(outer_map)

    for e in graph.out_edges(out_map_exit):
        name = e.data.data
        outlocalstorage_subgraph = {
            LocalStorage.node_a: graph.node_id(in_map_exit),
            LocalStorage.node_b: graph.node_id(out_map_exit)
        }
        outlocalstorage = OutLocalStorage(sdfg, sdfg_id, self.state_id,
                                          outlocalstorage_subgraph,
                                          self.expr_index)
        outlocalstorage.array = name
        outlocalstorage.apply(graph, sdfg)