def tile_wcrs(graph_or_subgraph: GraphViewType,
              validate_all: bool,
              prefer_partial_parallelism: bool = None) -> None:
    """
    Tiles parallel write-conflict resolution maps in an SDFG, state, or
    subgraphs thereof. Reduces the number of atomic operations by tiling
    and introducing transient arrays to accumulate atomics on.

    :param graph_or_subgraph: The SDFG/state/subgraph to optimize within.
    :param validate_all: If True, runs SDFG validation after every tiling.
    :param prefer_partial_parallelism: If set, prefers extracting non-conflicted
                                       map dimensions over tiling the WCR map
                                       (may not perform well if the parallel
                                       dimensions are small).
    :note: This function operates in-place.
    """
    # Avoid import loops
    from dace.codegen.targets import cpp
    from dace.frontend import operations
    from dace.transformation import dataflow, helpers as xfh

    # Determine on which nodes to run the operation
    graph = graph_or_subgraph
    if isinstance(graph_or_subgraph, gr.SubgraphView):
        graph = graph_or_subgraph.graph
    if isinstance(graph, SDFG):
        for state in graph_or_subgraph.nodes():
            tile_wcrs(state, validate_all)
        return
    if not isinstance(graph, SDFGState):
        raise TypeError('Graph must be a state, an SDFG, or a subgraph of either')
    sdfg = graph.parent

    edges_to_consider: Set[Tuple[gr.MultiConnectorEdge[Memlet], nodes.MapEntry]] = set()
    for edge in graph_or_subgraph.edges():
        if edge.data.wcr is not None:
            if (isinstance(edge.src, (nodes.MapExit, nodes.NestedSDFG))
                    or isinstance(edge.dst, nodes.MapEntry)):
                # Do not consider intermediate edges
                continue
            reason = cpp.is_write_conflicted_with_reason(graph, edge)
            if reason is None or not isinstance(reason, nodes.MapEntry):
                # Do not consider edges that will not generate atomics or
                # atomics we cannot transform
                continue
            if reason not in graph_or_subgraph.nodes():
                # Skip if the conflict exists outside of the nested SDFG
                continue

            # Check if the identity value can be inferred
            redtype = operations.detect_reduction_type(edge.data.wcr)
            dtype = sdfg.arrays[edge.data.data].dtype
            identity = dtypes.reduction_identity(dtype, redtype)
            if identity is None:  # Cannot infer identity value
                continue

            edges_to_consider.add((edge, reason))

    tile_size = config.Config.get('optimizer', 'autotile_size')
    debugprint = config.Config.get_bool('debugprint')
    if prefer_partial_parallelism is None:
        prefer_partial_parallelism = config.Config.get_bool('optimizer', 'autotile_partial_parallelism')

    maps_to_consider: Set[nodes.MapEntry] = set(me for _, me in edges_to_consider)

    transformed: Set[nodes.MapEntry] = set()
    # Heuristic: If the map is only partially conflicted, extract the
    # parallel dimensions instead of tiling
    if prefer_partial_parallelism:
        for mapentry in maps_to_consider:
            # Check the write conflicts of all WCR edges in the map
            conflicts: Set[str] = set()
            for edge, me in edges_to_consider:
                if me is not mapentry:
                    continue
                conflicts |= set(cpp.write_conflicted_map_params(mapentry, edge))

            nonconflicted_dims = set(mapentry.params) - conflicts
            if nonconflicted_dims:
                dims = [i for i, p in enumerate(mapentry.params) if p in nonconflicted_dims]
                # NOTE: The "(x < y) == True" tests here and below are crafted
                # for SymPy to be "definitely True"
                if (dt._prod(s for i, s in enumerate(mapentry.range.size()) if i in dims) < tile_size) == True:
                    # Map has a small range, extracting parallelism may not be
                    # beneficial
                    continue
                xfh.extract_map_dims(sdfg, mapentry, dims)
                transformed.add(mapentry)

    # Tile and accumulate the remaining (not yet transformed) maps
    for edge, mapentry in edges_to_consider:
        if mapentry in transformed:
            continue
        transformed.add(mapentry)

        if all((s < tile_size) == True for s in mapentry.map.range.size()):
            # If smaller than the tile size, do not transform; instead
            # make the map sequential
            if debugprint:
                print(f'Making map "{mapentry}" sequential due to being smaller than tile size')
            mapentry.map.schedule = dtypes.ScheduleType.Sequential
            continue

        # MapTiling -> AccumulateTransient / AccumulateStream
        outer_mapentry = dataflow.MapTiling.apply_to(sdfg, dict(tile_sizes=(tile_size, )), map_entry=mapentry)

        # Transform all outgoing WCR and stream edges
        mapexit = graph.exit_node(mapentry)
        outer_mapexit = graph.exit_node(outer_mapentry)

        # Tuple of (transformation type, options, pattern)
        to_apply: Tuple[Union[dataflow.StreamTransient, dataflow.AccumulateTransient],
                        Dict[str, Any], Dict[str, nodes.Node]] = None
        for e in graph.out_edges(mapexit):
            if isinstance(sdfg.arrays[e.data.data], dt.Stream):
                mpath = graph.memlet_path(e)
                tasklet = mpath[0].src
                if not isinstance(tasklet, nodes.Tasklet) or len(mpath) != 3:
                    # TODO(later): Implement StreamTransient independently of the tasklet
                    continue

                # Make a transient only if there is exactly one WCR/stream edge
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.StreamTransient, {},
                            dict(tasklet=tasklet, map_exit=mapexit, outer_map_exit=outer_mapexit))
            else:
                if (e.data.is_empty() or e.data.wcr is None or e.data.wcr_nonatomic
                        or (e.data.dst_subset is not None and e.data.dst_subset.num_elements() > 0
                            and e.data.dynamic)):
                    continue

                dtype = sdfg.arrays[e.data.data].dtype
                redtype = operations.detect_reduction_type(e.data.wcr)
                identity = dtypes.reduction_identity(dtype, redtype)
                if identity is None:  # Cannot infer identity value
                    continue

                # Make a transient only if there is exactly one WCR/stream edge
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.AccumulateTransient, dict(identity=identity, array=e.data.data),
                            dict(map_exit=mapexit, outer_map_exit=outer_mapexit))

        if to_apply is not None:
            xform, opts, pattern = to_apply
            xform.apply_to(sdfg, options=opts, **pattern)

    if debugprint and len(transformed) > 0:
        print(f'Optimized {len(transformed)} write-conflicted maps')
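

# --- Hedged usage sketch (illustrative, not part of the original module) ---
# A minimal example of how `tile_wcrs` might be invoked, assuming the DaCe
# Python frontend. The function and program names below (`_tile_wcrs_usage_sketch`,
# `_sum_rows`) are made up for illustration. The map in `_sum_rows` is
# write-conflicted along `j` (WCR-sum into `out[i]`) but conflict-free along
# `i`, so it exercises both the partial-parallelism heuristic and the tiling path.
def _tile_wcrs_usage_sketch():
    import dace

    N = dace.symbol('N')

    @dace.program
    def _sum_rows(A: dace.float64[N, N], out: dace.float64[N]):
        for i, j in dace.map[0:N, 0:N]:
            with dace.tasklet:
                a << A[i, j]
                # Sum-WCR write: conflicted in j, identity (0) is inferable
                o >> out(1, lambda x, y: x + y)[i]
                o = a

    sdfg = _sum_rows.to_sdfg()
    tile_wcrs(sdfg, validate_all=True)  # operates in-place
    sdfg.validate()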
def apply(self, sdfg: SDFG) -> None:
    graph: SDFGState = sdfg.nodes()[self.state_id]

    inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[GPUMultiTransformMap._map_entry]]

    number_of_gpus = self.number_of_gpus
    ngpus = Config.get("compiler", "cuda", "max_number_gpus")
    if number_of_gpus is None:
        number_of_gpus = ngpus
    if number_of_gpus > ngpus:
        raise ValueError('Requesting more GPUs than specified in the DaCe config')

    # Avoid import loops
    from dace.transformation.dataflow import (StripMining, InLocalStorage, OutLocalStorage, AccumulateTransient)

    # The user is responsible for manually setting a GPU-compliant
    # implementation on library nodes.
    scope_subgraph = graph.scope_subgraph(inner_map_entry)
    for node in scope_subgraph.nodes():
        if isinstance(node, nodes.LibraryNode):
            warnings.warn('Node %s is a library node, make sure to manually set the '
                          'implementation to a GPU-compliant specialization.' % node)

    # Tile the map into number_of_gpus tiles
    outer_map: nodes.Map = StripMining.apply_to(sdfg,
                                                dict(dim_idx=-1,
                                                     new_dim_prefix=self.new_dim_prefix,
                                                     tile_size=number_of_gpus,
                                                     tiling_type=dtypes.TilingType.NumberOfTiles),
                                                _map_entry=inner_map_entry)

    outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
    inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
    outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

    # Change the map schedules
    inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
    outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

    symbolic_gpu_id = outer_map.params[0]

    # Propagate the free symbols of the inner map range (including the
    # parameter of the outer map) into nested SDFGs
    for node in graph.successors(inner_map_entry):
        if isinstance(node, nodes.NestedSDFG):
            map_syms = inner_map_entry.range.free_symbols
            for sym in map_syms:
                symname = str(sym)
                if symname not in node.symbol_mapping.keys():
                    node.symbol_mapping[symname] = sym
                    node.sdfg.symbols[symname] = graph.symbols_defined_at(node)[symname]

    # Add transient data leading to the inner map
    prefix = self.new_transient_prefix
    for node in graph.predecessors(outer_map_entry):
        # Only AccessNodes are relevant
        if (isinstance(node, nodes.AccessNode)
                and not (self.skip_scalar and isinstance(node.desc(sdfg), Scalar))):
            if self.use_p2p and node.desc(sdfg).storage is dtypes.StorageType.GPU_Global:
                continue

            in_data_node = InLocalStorage.apply_to(sdfg,
                                                   dict(array=node.data, prefix=prefix),
                                                   verify=False,
                                                   save=False,
                                                   node_a=outer_map_entry,
                                                   node_b=inner_map_entry)
            in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
            in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    wcr_data: Dict[str, Any] = {}
    # Add transient data leading to the outer map
    for edge in graph.in_edges(outer_map_exit):
        node = graph.memlet_path(edge)[-1].dst
        if isinstance(node, nodes.AccessNode):
            data_name = node.data
            # Transients with write-conflict resolution need to be
            # collected first, as AccumulateTransient creates a nested SDFG
            if edge.data.wcr is not None:
                dtype = sdfg.arrays[data_name].dtype
                redtype = operations.detect_reduction_type(edge.data.wcr)
                # Custom reductions cannot have an accumulate transient,
                # as the accumulation from the transient to the outer
                # storage is not defined.
                if redtype == dtypes.ReductionType.Custom:
                    warnings.warn('Using custom reductions in a GPUMulti-transformed '
                                  'map only works for small data volumes; for large '
                                  'volumes there is no guarantee.')
                    continue
                identity = dtypes.reduction_identity(dtype, redtype)
                wcr_data[data_name] = identity
            elif (not isinstance(node.desc(sdfg), Scalar) or not self.skip_scalar):
                if self.use_p2p and node.desc(sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                # Transients without write-conflict resolution
                create_array = (prefix + '_' + data_name) not in sdfg.arrays
                out_data_node = OutLocalStorage.apply_to(sdfg,
                                                         dict(array=data_name,
                                                              prefix=prefix,
                                                              create_array=create_array),
                                                         verify=False,
                                                         save=False,
                                                         node_a=inner_map_exit,
                                                         node_b=outer_map_exit)
                out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                out_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    # Add transients for write-conflict resolution
    if len(wcr_data) != 0:
        nsdfg = AccumulateTransient.apply_to(sdfg,
                                             options=dict(array_identity_dict=wcr_data, prefix=prefix),
                                             map_exit=inner_map_exit,
                                             outer_map_exit=outer_map_exit)
        nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
        nsdfg.location['gpu'] = symbolic_gpu_id
        for transient_node in graph.successors(nsdfg):
            if isinstance(transient_node, nodes.AccessNode):
                transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                transient_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global
                nsdfg.sdfg.arrays[transient_node.label].location['gpu'] = symbolic_gpu_id
                nsdfg.sdfg.arrays[transient_node.label].storage = dtypes.StorageType.GPU_Global

        infer_types.set_default_schedule_storage_types_and_location(nsdfg.sdfg,
                                                                    dtypes.ScheduleType.GPU_Multidevice,
                                                                    symbolic_gpu_id)

    # Remove the parameter of the outer map from the SDFG symbols,
    # as it was added as a symbol in StripMining.
    if outer_map.params[0] in sdfg.free_symbols:
        sdfg.remove_symbol(outer_map.params[0])
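

# --- Hedged usage sketch (illustrative, not part of the original module) ---
# How `GPUMultiTransformMap` might be applied to an SDFG. The import path and
# the `_axpy` program below are assumptions for illustration (adjust the import
# to the actual module that exports the class); `apply_transformations` is the
# standard DaCe entry point for pattern-matching transformations.
def _gpu_multi_transform_usage_sketch():
    import dace
    from dace.transformation.dataflow import GPUMultiTransformMap  # assumed export path

    N = dace.symbol('N')

    @dace.program
    def _axpy(x: dace.float64[N], y: dace.float64[N]):
        for i in dace.map[0:N]:
            y[i] = 2.0 * x[i] + y[i]

    sdfg = _axpy.to_sdfg()
    # Strip-mines the map into one tile per GPU (see `apply` above): the outer
    # map is scheduled GPU_Multidevice and the inner one GPU_Device.
    applied = sdfg.apply_transformations(GPUMultiTransformMap)
    # `applied` holds the number of times the transformation matched (1 here)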