class GPUTransformLocalStorage(pattern_matching.Transformation):
    """Implements the GPUTransformLocalStorage transformation.

    Similar to GPUTransformMap, but takes multiple maps leading from the
    same data node into account, creating a local storage for each range.

    The transformation matches either a single map entry (expression 0) or
    a single reduce node (expression 1). Applying it sets the matched
    node's schedule to ``GPU_Device`` and routes every input/output array
    that is not already in a GPU storage type through a transient clone
    named ``gpu_<array>[...]`` that is registered with ``GPU_Global``
    storage.

    @see: GPUTransformMap
    """

    # Statistics counters; only updated when the "debugprint" config flag
    # is set (see ``apply``).
    _arrays_removed = 0
    _maps_transformed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    nested_seq = Property(
        desc="Makes nested code semantically-equivalent to single-core code,"
        "transforming nested maps and memory into sequential and "
        "local memory respectively.",
        dtype=bool,
        default=True,
    )

    # Pattern nodes used to build the two match expressions.
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce("lambda: None", None)

    @staticmethod
    def expressions():
        """Return the match patterns: a lone map entry, or a lone reduce."""
        return [
            nxutil.node_path_graph(GPUTransformLocalStorage._map_entry),
            nxutil.node_path_graph(GPUTransformLocalStorage._reduce),
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """Check whether the matched map (0) or reduce (1) may be moved to GPU.

        :param graph: State graph containing the candidate.
        :param candidate: Mapping from pattern nodes to node indices in
                          ``graph.nodes()``.
        :param expr_index: Index into ``expressions()`` that matched.
        :param sdfg: The SDFG that owns ``graph``.
        :param strict: If True, additionally reject maps nested in a scope.
        :return: True if applicable, False otherwise. Falls through (and
                 returns None) for any ``expr_index`` other than 0 or 1.
        """
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            candidate_map = map_entry.map

            # Disallow GPUTransform on nested maps in strict mode
            if strict:
                if graph.scope_dict()[map_entry] is not None:
                    return False

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule == dtypes.ScheduleType.MPI
                    or candidate_map.schedule == dtypes.ScheduleType.GPU_Device
                    or candidate_map.schedule ==
                    dtypes.ScheduleType.GPU_ThreadBlock
                    or candidate_map.schedule ==
                    dtypes.ScheduleType.Sequential):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule ==
                        dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and node.desc(sdfg).storage !=
                        dtypes.StorageType.Register):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_nodes(map_entry)[0]
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformLocalStorage._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if (reduce.schedule == dtypes.ScheduleType.MPI
                    or reduce.schedule == dtypes.ScheduleType.GPU_Device
                    or reduce.schedule == dtypes.ScheduleType.GPU_ThreadBlock):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule ==
                        dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        """Return the string form of the matched reduce or map entry node."""
        if GPUTransformLocalStorage._reduce in candidate:
            return str(
                graph.nodes()[candidate[GPUTransformLocalStorage._reduce]])
        else:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            return str(map_entry)

    @staticmethod
    def _cloned_array_name(array_node, memlet):
        """Build the name of the GPU clone of ``array_node`` for ``memlet``.

        The name is ``"gpu_" + array_node.data``. For every dimension whose
        over-approximated bounding-box size converts to the integer 1, a
        ``"_<index>"`` suffix derived from that dimension's start expression
        is appended, with ``+ - * /`` spelled as ``p m t d`` and any other
        non-identifier character dropped.
        """
        operator_letters = {"+": "p", "-": "m", "*": "t", "/": "d"}
        cloned_name = "gpu_" + array_node.data
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c in operator_letters:
                            suffix.append(operator_letters[c])
                    cloned_name += "_" + "".join(suffix)
            except Exception:
                # Best effort: a size that cannot be converted to int (or a
                # dimension that cannot be indexed) simply adds no suffix.
                # Deliberately not a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate.
                continue
        return cloned_name

    @staticmethod
    def _add_gpu_clone(sdfg, array_node, memlet, cloned_arrays):
        """Register a GPU clone for ``array_node`` and return its access node.

        A new transient with ``GPU_Global`` storage is added to ``sdfg``
        unless the computed clone name already exists in ``sdfg.arrays`` or
        the original array was already cloned (present in
        ``cloned_arrays``). In every case ``cloned_arrays`` is updated to map
        the original array name to the clone name.

        :param sdfg: SDFG to which the clone descriptor is added.
        :param array_node: Access node of the array to clone.
        :param memlet: Memlet whose bounding box determines the clone shape.
        :param cloned_arrays: Dict (original name -> clone name), updated
                              in place.
        :return: A new, not yet inserted node of the same type as
                 ``array_node`` that refers to the clone.
        """
        array = array_node.desc(sdfg)
        cloned_name = GPUTransformLocalStorage._cloned_array_name(
            array_node, memlet)

        if (cloned_name not in sdfg.arrays
                and array_node.data not in cloned_arrays):
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    dim = int(size)
                except Exception:
                    # Size is not convertible to int; keep it as is.
                    dim = size
                full_shape.append(dim)

            # Drop dimensions of constant size 1, but keep at least one.
            actual_dims = [
                idx for idx, r in enumerate(full_shape)
                if not (isinstance(r, int) and r == 1)
            ]
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]

            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(
                    name=cloned_name,
                    dtype=array.dtype,
                    shape=[full_shape[d] for d in actual_dims],
                    veclen=array.veclen,
                    buffer_size=array.buffer_size,
                    storage=dtypes.StorageType.GPU_Global,
                    transient=True,
                    offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )

        cloned_arrays[array_node.data] = cloned_name
        return type(array_node)(cloned_name)

    @staticmethod
    def _rebase_outer_subset(subset):
        """Shift ``subset`` so that every dimension starts at zero.

        Dimensions whose shifted begin equals their shifted end are dropped
        ("lost"); if every dimension would be dropped, the last one is kept.

        :return: Tuple ``(new_subset, offset, lost_dims)`` where
                 ``new_subset`` has the same type as ``subset``, ``offset``
                 holds the original start of each dimension and
                 ``lost_dims`` lists the indices of the dropped dimensions.
        """
        offset = []
        lost_dims = []
        lost_ranges = []
        newsubset = [None] * len(subset)
        for ind, r in enumerate(subset):
            offset.append(r[0])
            if isinstance(subset[ind], tuple):
                begin = subset[ind][0] - r[0]
                end = subset[ind][1] - r[0]
                step = subset[ind][2]
                if begin == end:
                    lost_dims.append(ind)
                    lost_ranges.append((begin, end, step))
                else:
                    newsubset[ind] = (begin, end, step)
            else:
                # NOTE(review): newsubset[ind] is still None here, so this
                # in-place subtraction raises TypeError if it is ever
                # reached. Left unchanged; confirm whether non-tuple
                # dimensions can occur on these edges.
                newsubset[ind] -= r[0]
        if len(lost_dims) == len(subset):
            # All dimensions collapsed: keep the last one.
            lost_dims.pop()
            rebased = type(subset)([lost_ranges[-1]])
        else:
            rebased = type(subset)([r for r in newsubset if r is not None])
        return rebased, offset, lost_dims

    @staticmethod
    def _rebase_inner_subset(subset, subset_type, offset, lost_dims):
        """Re-express an in-scope ``subset`` relative to the cloned array.

        Subtracts ``offset`` from every dimension not listed in
        ``lost_dims`` (non-tuple entries become ``(i, i, 1)`` ranges) and
        builds the result with ``subset_type``.
        """
        newsubset = [None] * len(subset)
        for ind, r in enumerate(subset):
            if ind in lost_dims:
                continue
            if isinstance(subset[ind], tuple):
                begin = r[0] - offset[ind]
                end = r[1] - offset[ind]
                step = r[2]
                newsubset[ind] = (begin, end, step)
            else:
                newsubset[ind] = (
                    r - offset[ind],
                    r - offset[ind],
                    1,
                )
        return subset_type([r for r in newsubset if r is not None])

    def apply(self, sdfg):
        """Apply the transformation to the matched node of ``sdfg``.

        Steps, in order:

        1. Set the matched map/reduce schedule to ``GPU_Device`` and, if
           ``nested_seq`` is set for a map match, turn ``Default`` storage
           and schedules inside the scope into ``Register``/``Sequential``.
        2. Collect input/output arrays whose storage is not a GPU type.
        3. Create a ``GPU_Global`` transient clone for each of them.
        4. Insert the clone nodes between the original arrays and the
           matched node, re-basing the affected memlets.
        5. For a map match, rename remaining memlets inside the scope to
           the cloned arrays.
        """
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule
        node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._maps_transformed += 1
        # If nested graph is designated as sequential, transform schedules and
        # storage from Default to Sequential/Register
        if self.nested_seq and self.expr_index == 0:
            for node in graph.scope_subgraph(cnode).nodes():
                if isinstance(node, nodes.AccessNode):
                    arr = node.desc(sdfg)
                    if arr.storage == dtypes.StorageType.Default:
                        arr.storage = dtypes.StorageType.Register
                elif isinstance(node, nodes.MapEntry):
                    if node.map.schedule == dtypes.ScheduleType.Default:
                        node.map.schedule = dtypes.ScheduleType.Sequential

        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add((data_node, e.data))
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add((data_node, e.data))
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._arrays_removed += len(
                in_arrays_to_clone) + len(out_arrays_to_clone)

        # Second, create a GPU clone of each array
        # TODO: Overapproximate union of memlets
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node, memlet in in_arrays_to_clone:
            in_cloned_arraynodes[array_node.data] = self._add_gpu_clone(
                sdfg, array_node, memlet, cloned_arrays)
        for array_node, memlet in out_arrays_to_clone:
            cloned_node = self._add_gpu_clone(sdfg, array_node, memlet,
                                              cloned_arrays)
            # Output clones are additionally flagged with ``setzero``.
            cloned_node.setzero = True
            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in graph.in_edges(cnode):
                if edge.data.data != array_name:
                    continue
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    (newmemlet.subset, offset,
                     lost_dims) = self._rebase_outer_subset(edge.data.subset)

                graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet)

                # Re-target the memlets inside the scope that still refer
                # to the original array.
                for e in graph.bfs_edges(edge.dst, reverse=False):
                    parent, _, _child, _, memlet = e
                    if parent != edge.dst and not in_scope(
                            graph, parent, edge.dst):
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[-1].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.ExitNode, forward=True):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        memlet.subset = self._rebase_inner_subset(
                            memlet.subset, type(edge.data.subset), offset,
                            lost_dims)
                    memlet.data = node.data

                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                # Original array -> clone, replacing the original edge.
                graph.add_edge(edge.src, edge.src_conn, node, None, edge.data)
                graph.remove_edge(edge)

        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in all_out_edges:
                if edge.data.data != array_name:
                    continue
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    (newmemlet.subset, offset,
                     lost_dims) = self._rebase_outer_subset(edge.data.subset)

                graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet)

                # Walk backwards from the exit until the scope's parent is
                # reached, re-targeting memlets of the original array.
                end_node = graph.scope_dict()[edge.src]
                for e in graph.bfs_edges(edge.src, reverse=True):
                    parent, _, _child, _, memlet = e
                    if parent == end_node:
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[0].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.EntryNode, forward=False):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        memlet.subset = self._rebase_inner_subset(
                            memlet.subset, type(edge.data.subset), offset,
                            lost_dims)
                    memlet.data = node.data

                # The clone -> original copy edge carries no WCR.
                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data)
                graph.remove_edge(edge)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if (edge.data.data is not None
                        and edge.data.data in cloned_arrays):
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        """Report that applying this transformation changes the graph."""
        return True
def _build_dataflow_graph_recurse(sdfg, state, primitives, modules, superEntry,
                                  super_exit):
    """Recursively add the nodes and edges for ``primitives`` to ``state``.

    For every primitive, an entry/exit node pair is created (the same node
    for reduces, tasklets and nested SDFGs), incoming edges are connected,
    children are processed recursively with the new entry/exit as their
    enclosing scope, and finally outgoing edges are connected.

    :param sdfg: SDFG passed through to ``_add_astmemlet_edge`` and used to
                 look up data descriptors.
    :param state: State to which nodes and edges are added.
    :param primitives: AST primitive nodes to process; an empty list is
                       replaced by a single empty tasklet.
    :param modules: Passed to ``ModuleInliner`` when cleaning Python
                    tasklet code.
    :param superEntry: Entry node of the enclosing scope; a falsy value
                       means there is none.
    :param super_exit: Exit node of the enclosing scope, or None at the
                       top level.
    :return: List of ``(exit node, memlet)`` pairs that were routed through
             ``super_exit`` and must be continued by the caller. Note that
             encountering a ``_ProgramNode`` primitive returns None instead.
    :raise ValueError: If a map or consume primitive has no children.
    :raise SyntaxError: If a non-Python tasklet has no external code.
    :raise TypeError: If a primitive or output source has an unsupported
                      type.
    :raise RuntimeError: If a top-level output is found below a
                         non-top-level parent, or a WCR was left unhandled.
    """
    # Array of pairs (exit node, memlet)
    exit_nodes = []

    if len(primitives) == 0:
        # Inject empty tasklets into empty states
        primitives = [astnodes._EmptyTaskletNode("Empty Tasklet", None)]

    for prim in primitives:
        label = prim.name

        # Expand node to get entry and exit points
        if isinstance(prim, astnodes._MapNode):
            if len(prim.children) == 0:
                raise ValueError("Map node expected to have children")
            mapNode = nd.Map(label,
                             prim.params,
                             prim.range,
                             is_async=prim.is_async)
            # Add connectors for inputs that exist as array nodes
            entry = nd.MapEntry(
                mapNode,
                _get_input_symbols(prim.inputs, prim.range.free_symbols))
            exit = nd.MapExit(mapNode)
        elif isinstance(prim, astnodes._ConsumeNode):
            if len(prim.children) == 0:
                raise ValueError("Consume node expected to have children")
            consumeNode = nd.Consume(label, (prim.params[1], prim.num_pes),
                                     prim.condition)
            entry = nd.ConsumeEntry(consumeNode)
            exit = nd.ConsumeExit(consumeNode)
        elif isinstance(prim, astnodes._ReduceNode):
            # A reduce is a single node: it is its own entry and exit
            rednode = nd.Reduce(prim.ast, prim.axes, prim.identity)
            state.add_node(rednode)
            entry = rednode
            exit = rednode
        elif isinstance(prim, astnodes._TaskletNode):
            if isinstance(prim, astnodes._EmptyTaskletNode):
                tasklet = nd.EmptyTasklet(prim.name)
            else:
                # Remove memlets from tasklet AST
                if prim.language == types.Language.Python:
                    clean_code = MemletRemover().visit(prim.ast)
                    clean_code = ModuleInliner(modules).visit(clean_code)
                else:  # Use external code from tasklet definition
                    if prim.extcode is None:
                        raise SyntaxError("Cannot define an intrinsic "
                                          "tasklet without an implementation")
                    clean_code = prim.extcode
                tasklet = nd.Tasklet(
                    prim.name,
                    set(prim.inputs.keys()),
                    set(prim.outputs.keys()),
                    code=clean_code,
                    language=prim.language,
                    code_global=prim.gcode)  # TODO: location=prim.location

            # Need to add the tasklet in case we're in an empty state, where no
            # edge will be drawn to it
            state.add_node(tasklet)
            entry = tasklet
            exit = tasklet

        elif isinstance(prim, astnodes._NestedSDFGNode):
            # Link the nested SDFG to its parent state and SDFG first
            prim.sdfg.parent = state
            prim.sdfg._parent_sdfg = sdfg
            prim.sdfg.update_sdfg_list([])
            nsdfg = nd.NestedSDFG(prim.name, prim.sdfg,
                                  set(prim.inputs.keys()),
                                  set(prim.outputs.keys()))
            state.add_node(nsdfg)
            entry = nsdfg
            exit = nsdfg

        elif isinstance(prim, astnodes._ProgramNode):
            # NOTE(review): this returns None rather than ``exit_nodes``;
            # confirm that no caller iterates over the result in this case.
            return
        elif isinstance(prim, astnodes._ControlFlowNode):
            # Control-flow primitives add nothing to the dataflow graph here
            continue
        else:
            raise TypeError("Node type not implemented: " +
                            str(prim.__class__))

        # Add incoming edges
        for varname, memlet in prim.inputs.items():
            arr = memlet.dataname
            if (prim.parent is not None
                    and memlet.dataname in prim.parent.transients.keys()):
                node = input_node_for_array(state, memlet.dataname)

                # Add incoming edge into transient as well
                # FIXME: A bit hacked?
                if arr in prim.parent.inputs:
                    astmem = prim.parent.inputs[arr]
                    _add_astmemlet_edge(sdfg, state, superEntry, None, node,
                                        None, astmem)

                    # Remove local name from incoming edge to parent
                    prim.parent.inputs[arr].local_name = None
            elif superEntry:
                node = superEntry
            else:
                node = input_node_for_array(state, memlet.dataname)

            # Destination connector inference
            # Connected to a tasklet or a nested SDFG
            dst_conn = (memlet.local_name
                        if isinstance(entry, nd.CodeNode) else None)
            # Connected to a scope as part of its range; the connector name
            # is the variable name without the 9-character '__DACEIN_' prefix
            if str(varname).startswith('__DACEIN_'):
                dst_conn = str(varname)[9:]
            # Handle special case of consume input stream
            if (isinstance(entry, nd.ConsumeEntry)
                    and memlet.data == prim.stream):
                dst_conn = 'IN_stream'

            # If a memlet that covers this input already exists, skip
            # generating this one; otherwise replace memlet with ours
            skip_incoming_edge = False
            remove_edge = None
            for e in state.edges_between(node, entry):
                if e.data.data != memlet.dataname or dst_conn != e.dst_conn:
                    continue
                if e.data.subset.covers(memlet.subset):
                    skip_incoming_edge = True
                    break
                elif memlet.subset.covers(e.data.subset):
                    remove_edge = e
                    break
                else:
                    # Neither subset covers the other: widen the existing
                    # edge in place instead of adding a second one
                    print('WARNING: Performing bounding-box union on',
                          memlet.subset, 'and', e.data.subset, '(in)')
                    e.data.subset = sbs.bounding_box_union(
                        e.data.subset, memlet.subset)
                    e.data.num_accesses += memlet.num_accesses
                    skip_incoming_edge = True
                    break

            if remove_edge is not None:
                state.remove_edge(remove_edge)

            if skip_incoming_edge == False:
                _add_astmemlet_edge(sdfg, state, node, None, entry, dst_conn,
                                    memlet)

        # If there are no inputs, generate a dummy edge
        if superEntry and len(prim.inputs) == 0:
            state.add_edge(superEntry, None, entry, None, EmptyMemlet())

        if len(prim.children) > 0:
            # Recurse
            inner_outputs = _build_dataflow_graph_recurse(
                sdfg, state, prim.children, modules, entry, exit)
            # Infer output node for each memlet
            for i, (out_src, mem) in enumerate(inner_outputs):
                # If there is no such array in this primitive's outputs,
                # it's an external array (e.g., a map in a map). In this case,
                # connect to the exit node
                if mem.dataname in prim.outputs:
                    inner_outputs[i] = (out_src, prim.outputs[mem.dataname])
                else:
                    inner_outputs[i] = (out_src, mem)
        else:
            # Leaf primitive: all of its outputs leave from its own exit node
            inner_outputs = [(exit, mem) for mem in prim.outputs.values()]

        # Add outgoing edges
        for out_src, astmem in inner_outputs:

            data = astmem.data
            dataname = astmem.dataname

            # If WCR is not none, it needs to be handled in the code. Check for
            # this after, as we only expect it for one distinct case
            wcr_was_handled = astmem.wcr is None

            # TODO: This is convoluted. We should find a more readable
            # way of connecting the outgoing edges.

            if super_exit is None:

                # Assert that we're in a top-level node
                if ((not isinstance(prim.parent, astnodes._ProgramNode)) and
                    (not isinstance(prim.parent, astnodes._ControlFlowNode))):
                    raise RuntimeError("Expected to be at the top node")

                # Looks hacky
                src_conn = (astmem.local_name if isinstance(
                    out_src, (nd.Tasklet, nd.NestedSDFG)) else None)

                # Here we just need to connect memlets directly to their
                # respective data nodes
                out_tgt = output_node_for_array(state, astmem.dataname)

                # If a memlet that covers this output already exists, skip
                # generating this one; otherwise replace memlet with ours
                skip_outgoing_edge = False
                remove_edge = None
                for e in state.edges_between(out_src, out_tgt):
                    if e.data.data != astmem.dataname or src_conn != e.src_conn:
                        continue
                    if e.data.subset.covers(astmem.subset):
                        skip_outgoing_edge = True
                        break
                    elif astmem.subset.covers(e.data.subset):
                        remove_edge = e
                        break
                    else:
                        # Neither subset covers the other: widen the
                        # existing edge in place
                        print('WARNING: Performing bounding-box union on',
                              astmem.subset, 'and', e.data.subset, '(out)')
                        e.data.subset = sbs.bounding_box_union(
                            e.data.subset, astmem.subset)
                        e.data.num_accesses += astmem.num_accesses
                        skip_outgoing_edge = True
                        break

                # Skipping here also bypasses the WCR check at the end of
                # this loop iteration
                if skip_outgoing_edge == True:
                    continue

                if remove_edge is not None:
                    state.remove_edge(remove_edge)

                _add_astmemlet_edge(sdfg,
                                    state,
                                    out_src,
                                    src_conn,
                                    out_tgt,
                                    None,
                                    astmem,
                                    wcr=astmem.wcr,
                                    wcr_identity=astmem.wcr_identity)
                wcr_was_handled = (True if astmem.wcr is not None else
                                   wcr_was_handled)

                # If the program defines another output, connect it too.
                # This refers to the case where we have streams, which
                # must define an input and output, and sometimes this output
                # is defined in pdp.outputs
                if (isinstance(out_tgt, nd.AccessNode)
                        and isinstance(out_tgt.desc(sdfg), dt.Stream)):
                    try:
                        stream_memlet = next(
                            v for k, v in prim.parent.outputs.items()
                            if k == out_tgt.data)
                        stream_output = output_node_for_array(
                            state, stream_memlet.dataname)
                        _add_astmemlet_edge(sdfg, state, out_tgt, None,
                                            stream_output, None, stream_memlet)
                    except StopIteration:  # Stream output not found, skip
                        pass

            else:  # We're in a nest
                if isinstance(prim, astnodes._ScopeNode):
                    # We're a map or a consume node, that needs to connect our
                    # exit to either an array or to the super_exit
                    if data.transient and dataname in prim.parent.transients:
                        # Connect the exit directly
                        out_tgt = output_node_for_array(state, data.dataname)
                        _add_astmemlet_edge(sdfg, state, out_src, None,
                                            out_tgt, None, astmem)
                    else:
                        # This is either a transient defined in an outer scope,
                        # or an I/O array, so redirect through the exit node
                        _add_astmemlet_edge(sdfg, state, out_src, None,
                                            super_exit, None, astmem)
                        # Instruct outer recursion layer to continue the route
                        exit_nodes.append((super_exit, astmem))
                elif isinstance(
                        prim,
                    (astnodes._TaskletNode, astnodes._NestedSDFGNode)):
                    # We're a tasklet, and need to connect either to the exit
                    # if the array is I/O or is defined in a scope further out,
                    # or directly to the transient if it's defined locally
                    if dataname in prim.parent.transients:
                        # This is a local transient variable, so connect to it
                        # directly
                        out_tgt = output_node_for_array(state, data.dataname)
                        _add_astmemlet_edge(sdfg, state, out_src,
                                            astmem.local_name, out_tgt, None,
                                            astmem)
                    else:
                        # This is an I/O array, or an outer level transient, so
                        # redirect through the exit node
                        _add_astmemlet_edge(sdfg,
                                            state,
                                            out_src,
                                            astmem.local_name,
                                            super_exit,
                                            None,
                                            astmem,
                                            wcr=astmem.wcr,
                                            wcr_identity=astmem.wcr_identity)
                        exit_nodes.append((super_exit, astmem))
                        if astmem.wcr is not None:
                            wcr_was_handled = True
                else:
                    # Sanity check
                    raise TypeError("Unexpected node type: {}".format(
                        type(out_src).__name__))

            # Only scope primitives may leave a WCR unhandled at this point
            if not wcr_was_handled and not isinstance(prim,
                                                      astnodes._ScopeNode):
                raise RuntimeError("Detected unhandled WCR for primitive '{}' "
                                   "of type {}. WCR is only expected for "
                                   "tasklets in a map/consume scope.".format(
                                       prim.name,
                                       type(prim).__name__))

    return exit_nodes
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.

        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.

        The matched pattern is the path
        tasklet -> map exit -> intermediate array -> reduce -> output array.
        Applying the transformation removes the intermediate array and the
        reduce node, and makes the tasklet write to the output array through
        the map exit using the reduce node's write-conflict resolution.
    """

    # Pattern nodes of the matched path.
    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')

    _reduce = nodes.Reduce('lambda: None', None)
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        """Return the single match pattern (see class docstring)."""
        return [
            nxutil.node_path_graph(MapReduceFusion._tasklet,
                                   MapReduceFusion._tmap_exit,
                                   MapReduceFusion._in_array,
                                   MapReduceFusion._reduce,
                                   MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """Check whether the matched map and reduce can be fused.

        :param graph: State graph containing the candidate.
        :param candidate: Mapping from pattern nodes to node indices in
                          ``graph.nodes()``.
        :param expr_index: Index of the matched expression (unused; there
                           is only one).
        :param sdfg: The SDFG that owns ``graph``.
        :param strict: If True, also require that the intermediate array
                       has a single access node in this state and is not a
                       shared transient.
        :return: True if the transformation can be applied, else False.
        """
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]]
        tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]]

        # Make sure that the array is only accessed by the map and the reduce
        if any(src != tmap_exit
               for src, _, _, _, _ in graph.in_edges(in_array)):
            return False
        if any(dest != reduce_node
               for _, _, dest, _, _ in graph.out_edges(in_array)):
            return False

        # Find the tasklet's memlet that writes the intermediate array. If
        # the matched tasklet has no such edge it cannot be fused, so report
        # "no match" instead of letting StopIteration escape.
        tasklet_edge = next((e
                             for e in graph.edges_between(tasklet, tmap_exit)
                             if e.data.data == in_array.data), None)
        if tasklet_edge is None:
            return False
        tmem = tasklet_edge.data

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict:
            accesses = [
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
            ]
            if (len(accesses) > 1
                    or in_array.data in sdfg.shared_transients()):
                return False

        # If memlet already has WCR and it is different from reduce node,
        # do not match
        if tmem.wcr is not None and tmem.wcr != reduce_node.wcr:
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """Return "tasklet -> map exit -> reduce" for the matched nodes.

        The candidate values are indices into ``graph.nodes()``; they are
        resolved to the nodes themselves so the string names the nodes
        rather than their positions.
        """
        matched = [
            graph.nodes()[candidate[pattern_node]]
            for pattern_node in (MapReduceFusion._tasklet,
                                 MapReduceFusion._tmap_exit,
                                 MapReduceFusion._reduce)
        ]
        return ' -> '.join(str(node) for node in matched)

    def apply(self, sdfg):
        """Fuse the matched reduction into the preceding map.

        :param sdfg: The SDFG to modify; the state is selected through
                     ``self.state_id`` and the nodes through
                     ``self.subgraph``.
        :raise RuntimeError: If the map exit has no incoming edge that
                             carries the intermediate array.
        """
        graph = sdfg.nodes()[self.state_id]
        tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
        out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]]

        # The intermediate array and the reduce node are fused away
        nodes_to_remove = [in_array, reduce_node]

        # First edge into the map exit that writes the intermediate array
        memlet_edge = next((edge for edge in graph.in_edges(tmap_exit)
                            if edge.data.data == in_array.data), None)
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        # Find which indices should be removed from new memlet. When the
        # reduce node specifies no axes, all dimensions of its input are
        # reduced; ``range`` needs the number of dimensions, not the subset
        # object itself.
        input_edge = graph.in_edges(reduce_node)[0]
        axes = reduce_node.axes or list(range(len(input_edge.data.subset)))
        # Must be fetched before the reduce node (and its edges) is removed
        array_edge = graph.out_edges(reduce_node)[0]

        # Delete relevant edges and nodes
        graph.remove_nodes_from(nodes_to_remove)

        # Filter out reduced dimensions from subset
        filtered_subset = [
            dim for i, dim in enumerate(memlet_edge.data.subset)
            if i not in axes
        ]
        if not filtered_subset:  # Output is a scalar
            filtered_subset = [0]

        # Modify edge from tasklet to map exit
        memlet_edge.data.data = out_array.data
        memlet_edge.data.wcr = reduce_node.wcr
        memlet_edge.data.wcr_identity = reduce_node.identity
        memlet_edge.data.subset = type(
            memlet_edge.data.subset)(filtered_subset)

        # Add edge from map exit to output array; the 'OUT_' connector name
        # is derived from the incoming connector by replacing its first
        # three characters
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst,
            array_edge.dst_conn,
            Memlet(array_edge.data.data, array_edge.data.num_accesses,
                   array_edge.data.subset, array_edge.data.veclen,
                   reduce_node.wcr, reduce_node.identity))
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.

        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.

        Three patterns are matched (see `expressions`): a reduction
        implemented as a single map, a reduction implemented as two nested
        maps, and a reduce node.
    """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_in_tasklet = nodes.Tasklet('_')
    _rmap_in_cr = nodes.MapExit(nodes.Map("", [], []))
    _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_out_exit = nodes.MapExit(nodes.Map("", [], []))
    _out_array = nodes.AccessNode('_')
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        """ Returns the pattern graphs; the list index is the `expr_index`
            used throughout this class. """
        return [
            # Map, then reduce of all axes
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._rmap_in_entry,
                MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr,
                MapReduceFusion._out_array),
            # Map, then partial reduction of axes
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._rmap_out_entry,
                MapReduceFusion._rmap_in_entry,
                MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr,
                MapReduceFusion._rmap_out_exit, MapReduceFusion._out_array),
            # Map, then reduce node
            nxutil.node_path_graph(
                MapReduceFusion._tasklet, MapReduceFusion._tmap_exit,
                MapReduceFusion._in_array, MapReduceFusion._reduce,
                MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Checks whether the matched subgraph can be fused.

            :param graph: State in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              `graph`.
            :param expr_index: Index of the matched expression.
            :param sdfg: The SDFG that contains `graph` (unused here).
            :param strict: Unused by this check.
            :return: True if the transformation can be applied.
        """
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        if expr_index == 0:  # Reduce without outer map
            rmap_entry = graph.nodes()[candidate[
                MapReduceFusion._rmap_in_entry]]
        elif expr_index == 1:  # Reduce with outer map
            rmap_entry = graph.nodes()[candidate[
                MapReduceFusion._rmap_out_entry]]
        else:  # Reduce node
            rmap_entry = graph.nodes()[candidate[MapReduceFusion._reduce]]

        # Make sure that the array is only accessed by the map and the reduce
        if any(src != tmap_exit
               for src, _, _, _, _ in graph.in_edges(in_array)):
            return False
        if any(dest != rmap_entry
               for _, _, dest, _, _ in graph.out_edges(in_array)):
            return False

        # Make sure that there is a reduction in the second map
        if expr_index < 2:
            rmap_cr = graph.nodes()[candidate[MapReduceFusion._rmap_in_cr]]
            reduce_edge = graph.in_edges(rmap_cr)[0]
            if reduce_edge.data.wcr is None:
                return False

        # TODO(later): The following checks are not implemented:
        #   * the intermediate transient is not accessed by other states;
        #   * only full-range arrays are reduced (support fusion of partial
        #     reductions and refactor slice/subarray handling);
        #   * only one value is reduced at a time through the map exit;
        #   * the tasklet map ranges match the reduction input/output ranges
        #     on every axis (start at zero, unit stride, same extent).

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns a short human-readable description of a match. """
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        # Only the reduce-node pattern consists of exactly five nodes
        if len(candidate) == 5:  # Expression 2
            reduce = candidate[MapReduceFusion._reduce]
        else:
            reduce = candidate[MapReduceFusion._rmap_in_cr]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    @staticmethod
    def find_memlet_map_permutation(memlet: Memlet, map: nodes.Map):
        """ Matches each memlet subset dimension with a map parameter.

            :param memlet: Memlet whose subset dimensions are matched.
            :param map: Map whose parameters are searched.
            :return: A list with one entry per subset dimension, holding the
                     index of the first not-yet-used map parameter that
                     equals the dimension, or None if there is none.
        """
        perm = [None] * len(memlet.subset)
        indices = set()
        for i, dim in enumerate(memlet.subset):
            for j, mapdim in enumerate(map.params):
                if symbolic.pystr_to_symbolic(
                        mapdim) == dim and j not in indices:
                    perm[i] = j
                    indices.add(j)
                    break
        return perm

    @staticmethod
    def find_permutation(tasklet_map: nodes.Map, red_outer_map: nodes.Map,
                         red_inner_map: nodes.Map, tmem: Memlet):
        """ Find permutation between tasklet-exit memlet and tasklet map.

            Every range of the tasklet map is assigned to an equal, unused
            range of the outer reduction map or, failing that, of the inner
            reduction map.

            :param tasklet_map: Map surrounding the producing tasklet.
            :param red_outer_map: Outer (non-reduced) reduction map.
            :param red_inner_map: Inner (reduced) reduction map.
            :param tmem: Tasklet-exit memlet (unused).
            :return: A 2-tuple
                     ([outer map indices], [inner (CR) map indices]) of
                     tasklet-map dimension indices.
        """
        result = [], []
        assert len(tasklet_map.range) == len(red_inner_map.range) + len(
            red_outer_map.range)

        # Match map ranges with reduce ranges
        unavailable_ranges_out = set()
        unavailable_ranges_in = set()
        for i, tmap_rng in enumerate(tasklet_map.range):
            found = False
            for j, rng in enumerate(red_outer_map.range):
                if tmap_rng == rng and j not in unavailable_ranges_out:
                    result[0].append(i)
                    unavailable_ranges_out.add(j)
                    found = True
                    break
            if found:
                continue
            for j, rng in enumerate(red_inner_map.range):
                if tmap_rng == rng and j not in unavailable_ranges_in:
                    result[1].append(i)
                    unavailable_ranges_in.add(j)
                    found = True
                    break
            if not found:
                break

        # Ensure all map variables matched with reduce variables
        assert len(result[0]) + len(result[1]) == len(tasklet_map.range)

        # Returns ([outer map indices], [inner (CR) map indices])
        return result

    @staticmethod
    def find_permutation_reduce(tasklet_map: nodes.Map,
                                reduce_node: nodes.Reduce, graph: SDFGState,
                                tmem: Memlet):
        """ Finds which tasklet-map dimensions index the reduction output.

            :param tasklet_map: Map surrounding the producing tasklet.
            :param reduce_node: The reduce node being fused.
            :param graph: State containing `reduce_node`.
            :param tmem: Tasklet-exit memlet.
            :return: List of tasklet-map dimension indices, one for every
                     dimension that is not a reduction axis. Empty if the
                     reduce node reduces all axes (`axes` is None).
        """
        in_memlet = graph.in_edges(reduce_node)[0].data
        out_memlet = graph.out_edges(reduce_node)[0].data
        assert len(tasklet_map.range) == in_memlet.subset.dims()

        # Find permutation between tasklet-exit memlet and tasklet map
        tmem_perm = MapReduceFusion.find_memlet_map_permutation(
            tmem, tasklet_map)
        mapred_perm = []

        # Match map ranges with reduce ranges
        unavailable_ranges = set()
        for i, tmap_rng in enumerate(tasklet_map.range):
            found = False
            for j, in_rng in enumerate(in_memlet.subset):
                if tmap_rng == in_rng and j not in unavailable_ranges:
                    mapred_perm.append(i)
                    unavailable_ranges.add(j)
                    found = True
                    break
            if not found:
                break

        # Ensure all map variables matched with reduce variables
        assert len(tmem_perm) == len(tmem.subset)
        assert len(mapred_perm) == len(in_memlet.subset)

        # Prepare result from the two permutations and the reduction axes
        result = []
        for i in range(len(mapred_perm)):
            if reduce_node.axes is None or i in reduce_node.axes:
                continue
            result.append(mapred_perm[tmem_perm[i]])

        return result

    def apply(self, sdfg):
        """ Applies the fusion on the matched subgraph of `sdfg`.

            :param sdfg: The SDFG to modify in place.
            :raise RuntimeError: If no memlet entering the tasklet map exit
                                 refers to the intermediate array, or if the
                                 reduction tasklet does not have exactly one
                                 incoming edge.
        """

        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        expr_index = self.expr_index
        graph = sdfg.nodes()[self.state_id]

        # Resolve all matched nodes up front, before the graph is modified
        tmap_exit = gnode(MapReduceFusion._tmap_exit)
        in_array = gnode(MapReduceFusion._in_array)
        if expr_index == 1:  # Reduce with outer map
            rmap_out_entry = gnode(MapReduceFusion._rmap_out_entry)
            rmap_out_exit = gnode(MapReduceFusion._rmap_out_exit)
            rmap_in_entry = gnode(MapReduceFusion._rmap_in_entry)
        if expr_index != 2:
            # The reduction tasklet is needed by both map-based patterns
            # (with and without an outer map)
            rmap_tasklet = gnode(MapReduceFusion._rmap_in_tasklet)

        if expr_index == 2:
            rmap_cr = gnode(MapReduceFusion._reduce)
        else:
            rmap_cr = gnode(MapReduceFusion._rmap_in_cr)
        out_array = gnode(MapReduceFusion._out_array)

        # Set nodes to remove according to the expression index
        nodes_to_remove = [in_array]
        if expr_index == 0:
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
        elif expr_index == 1:
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_entry))
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
            nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_exit))
        else:
            nodes_to_remove.append(gnode(MapReduceFusion._reduce))

        # If no other edges lead to mapexit, remove it. Otherwise, keep
        # it and remove reduction incoming/outgoing edges
        if expr_index != 2 and len(graph.in_edges(tmap_exit)) == 1:
            nodes_to_remove.append(tmap_exit)

        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        if expr_index == 0:  # Reduce without outer map
            # Index order does not matter, merge as-is
            pass
        elif expr_index == 1:  # Reduce with outer map
            tmap = tmap_exit.map
            perm_outer, perm_inner = MapReduceFusion.find_permutation(
                tmap, rmap_out_entry.map, rmap_in_entry.map,
                memlet_edge.data)

            # Split tasklet map into tmap_out -> tmap_in (according to
            # reduction)
            omap = nodes.Map(
                tmap.label + '_nonreduce',
                [p for i, p in enumerate(tmap.params) if i in perm_outer],
                [r for i, r in enumerate(tmap.range) if i in perm_outer],
                tmap.schedule, tmap.unroll, tmap.is_async)
            tmap.params = [
                p for i, p in enumerate(tmap.params) if i in perm_inner
            ]
            tmap.range = [
                r for i, r in enumerate(tmap.range) if i in perm_inner
            ]
            omap_entry = nodes.MapEntry(omap)
            # The outer reduction map exit is reused as the new outer exit
            omap_exit = rmap_out_exit
            rmap_out_exit.map = omap

            # Reconnect graph to new map
            tmap_entry = graph.entry_node(tmap_exit)
            tmap_in_edges = list(graph.in_edges(tmap_entry))
            # NOTE(review): the call does not depend on `e`; presumably one
            # call already redirects all edges - confirm before simplifying.
            for e in tmap_in_edges:
                nxutil.change_edge_dest(graph, tmap_entry, omap_entry)
            for e in tmap_in_edges:
                graph.add_edge(omap_entry, e.src_conn, tmap_entry,
                               e.dst_conn, copy.copy(e.data))
        elif expr_index == 2:  # Reduce node
            # Find correspondence between map indices and array outputs
            tmap = tmap_exit.map
            perm = MapReduceFusion.find_permutation_reduce(
                tmap, rmap_cr, graph, memlet_edge.data)

            output_subset = [tmap.params[d] for d in perm]
            if len(output_subset) == 0:  # Output is a scalar
                output_subset = [0]

            # Must be read before the reduce node is removed
            array_edge = graph.out_edges(rmap_cr)[0]

            # Delete relevant edges and nodes
            graph.remove_edge(memlet_edge)
            graph.remove_nodes_from(nodes_to_remove)

            # Add new edges and nodes
            #   From tasklet to map exit
            graph.add_edge(
                memlet_edge.src, memlet_edge.src_conn, memlet_edge.dst,
                memlet_edge.dst_conn,
                Memlet(out_array.data, memlet_edge.data.num_accesses,
                       subsets.Indices(output_subset),
                       memlet_edge.data.veclen, rmap_cr.wcr,
                       rmap_cr.identity))

            #   From map exit to output array
            graph.add_edge(
                memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:],
                array_edge.dst, array_edge.dst_conn,
                Memlet(array_edge.data.data, array_edge.data.num_accesses,
                       array_edge.data.subset, array_edge.data.veclen,
                       rmap_cr.wcr, rmap_cr.identity))

            return

        # Remove tmp array node prior to the others, so that a new one
        # can be created in its stead (see below)
        graph.remove_node(nodes_to_remove[0])
        nodes_to_remove = nodes_to_remove[1:]

        # Create tasklet -> tmp -> tasklet connection
        tmp = graph.add_array(
            'tmp',
            memlet_edge.data.subset.bounding_box_size(),
            sdfg.arrays[memlet_edge.data.data].dtype,
            transient=True)

        tasklet_tmp_memlet = copy.deepcopy(memlet_edge.data)
        tasklet_tmp_memlet.data = tmp.data
        tasklet_tmp_memlet.subset = ShapeProperty.to_string(tmp.shape)

        # Modify memlet to point to output array
        memlet_edge.data.data = out_array.data

        # Recover reduction axes from CR reduce subset: indices whose string
        # form contains '__i' are treated as reduction axes
        reduce_cr_subset = graph.in_edges(rmap_tasklet)[0].data.subset
        reduce_axes = []
        for ind, crvar in enumerate(reduce_cr_subset.indices):
            if '__i' in str(crvar):
                reduce_axes.append(ind)

        # Modify memlet access index by filtering out reduction axes
        newindices = []
        for ind, ovar in enumerate(memlet_edge.data.subset.indices):
            if ind not in reduce_axes:
                newindices.append(ovar)
        if len(newindices) == 0:
            newindices = [0]

        memlet_edge.data.subset = subsets.Indices(newindices)

        graph.remove_edge(memlet_edge)

        graph.add_edge(memlet_edge.src, memlet_edge.src_conn, tmp,
                       memlet_edge.dst_conn, tasklet_tmp_memlet)

        red_edges = list(graph.in_edges(rmap_tasklet))
        if len(red_edges) != 1:
            raise RuntimeError('CR edge must be unique')

        tmp_tasklet_memlet = copy.deepcopy(tasklet_tmp_memlet)
        graph.add_edge(tmp, None, rmap_tasklet, red_edges[0].dst_conn,
                       tmp_tasklet_memlet)

        for e in graph.edges_between(rmap_tasklet, rmap_cr):
            e.data.subset = memlet_edge.data.subset

        # Move output edges to point directly to CR node
        if expr_index == 1:
            # Set output memlet between CR node and outer reduction map to
            # contain the same subset as the one pointing to the CR node
            for e in graph.out_edges(rmap_cr):
                e.data.subset = memlet_edge.data.subset

            # Reuse the node resolved before the graph was modified, so this
            # does not depend on node indices staying valid after removals
            nxutil.change_edge_src(graph, rmap_out_exit, omap_exit)

        # Remove nodes
        graph.remove_nodes_from(nodes_to_remove)

        # For unrelated outputs, connect original output to rmap_out
        if expr_index == 1 and tmap_exit not in nodes_to_remove:
            other_out_edges = list(graph.out_edges(tmap_exit))
            for e in other_out_edges:
                graph.remove_edge(e)
                graph.add_edge(e.src, e.src_conn, omap_exit, None, e.data)
                graph.add_edge(omap_exit, None, e.dst, e.dst_conn,
                               copy.copy(e.data))

    def modifies_graph(self):
        """ Reports that applying this transformation changes the graph. """
        return True
class ReduceExpansion(pm.Transformation):
    """ Implements the reduce-expansion transformation.

        Reduce-expansion swaps a reduce node for nested maps around a
        trivial copy tasklet, whose outgoing memlets carry the node's WCR.
    """

    _reduce = nodes.Reduce(wcr='lambda x: x', axes=None)

    @staticmethod
    def expressions():
        """ The pattern is a single reduce node. """
        pattern = nxutil.node_path_graph(ReduceExpansion._reduce)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Every matched reduce node can be expanded. """
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Describes the matched reduce node, its WCR and its axes. """
        node = graph.nodes()[candidate[ReduceExpansion._reduce]]
        return "{}: {} on {}".format(node, node.wcr, node.axes)

    def apply(self, sdfg):
        """ Replaces the matched reduce node with (up to) two nested maps.

            The inner map iterates over the reduction axes and the outer map
            over the remaining input dimensions; the outer map is only
            created when such dimensions exist. The inner map wraps a
            tasklet that copies its input to its output, and the memlets
            leaving the tasklet and the maps carry the reduction WCR.

            :param sdfg: The SDFG to modify in place.
            :raise NotImplementedError: If the reduce node has more than one
                                        distinct source or destination node.
        """
        state = sdfg.nodes()[self.state_id]
        reduce_node = state.nodes()[self.subgraph[ReduceExpansion._reduce]]

        # Distinct source nodes, with the memlet of their first edge
        sources, src_memlets = [], []
        for edge_src, _, _, _, edge_memlet in state.in_edges(reduce_node):
            if edge_src in sources:
                continue
            sources.append(edge_src)
            src_memlets.append(edge_memlet)
        if len(sources) > 1:
            raise NotImplementedError

        # Distinct destination nodes, with the memlet of their first edge
        sinks, sink_memlets = [], []
        for _, _, edge_dst, _, edge_memlet in state.out_edges(reduce_node):
            if edge_dst in sinks:
                continue
            sinks.append(edge_dst)
            sink_memlets.append(edge_memlet)
        if len(sinks) > 1:
            raise NotImplementedError

        # No axes given means that every input dimension is reduced
        axes = reduce_node.axes
        if axes is None:
            axes = tuple(range(src_memlets[0].subset.dims()))

        def dim_name(dim):
            return "__dim_{}".format(str(dim))

        # Reduced dimensions go to the inner map, all others to the outer
        outer_ranges = {}
        inner_ranges = {}
        for dim, rng in enumerate(src_memlets[0].subset):
            target = inner_ranges if dim in axes else outer_ranges
            target[dim_name(dim)] = subsets.Range.dim_to_string(rng)
        has_outer = len(outer_ranges) > 0

        if has_outer:
            outer_entry, outer_exit = state.add_map(
                'reduce_outer', outer_ranges, schedule=reduce_node.schedule)

        # The reduce node's schedule is given to the outermost created map
        if has_outer:
            inner_schedule = dtypes.ScheduleType.Default
        else:
            inner_schedule = reduce_node.schedule
        inner_entry, inner_exit = state.add_map(
            'reduce_inner', inner_ranges, schedule=inner_schedule)

        copy_tasklet = state.add_tasklet(
            name='red_tasklet',
            inputs={'in_1'},
            outputs={'out_1'},
            code='out_1 = in_1')

        # Input side: source -> [outer entry ->] inner entry -> tasklet
        inner_entry.in_connectors = {'IN_1'}
        inner_entry.out_connectors = {'OUT_1'}

        source_memlet = dcpy(src_memlets[0])
        if has_outer:
            outer_entry.in_connectors = {'IN_1'}
            outer_entry.out_connectors = {'OUT_1'}
            first_entry = outer_entry
        else:
            first_entry = inner_entry
        state.add_edge(sources[0], None, first_entry, 'IN_1', source_memlet)

        # Between the maps, non-reduced dimensions are fixed to the outer
        # map parameter while reduced dimensions keep their full range
        between_memlet = dcpy(src_memlets[0])
        between_range = [
            rng if dim in axes else (dim_name(dim), dim_name(dim), 1)
            for dim, rng in enumerate(between_memlet.subset)
        ]
        between_memlet.subset = subsets.Range(between_range)
        between_memlet.num_accesses = between_memlet.subset.num_elements()
        if has_outer:
            state.add_edge(outer_entry, 'OUT_1', inner_entry, 'IN_1',
                           between_memlet)

        # The tasklet reads a single element, indexed by all map parameters
        element_memlet = dcpy(between_memlet)
        element_memlet.subset = subsets.Indices(
            [dim_name(dim) for dim in range(len(element_memlet.subset))])
        element_memlet.num_accesses = element_memlet.subset.num_elements()
        state.add_edge(inner_entry, 'OUT_1', copy_tasklet, 'in_1',
                       element_memlet)

        # Output side: tasklet -> inner exit [-> outer exit] -> destination
        inner_exit.in_connectors = {'IN_1'}
        inner_exit.out_connectors = {'OUT_1'}

        result_memlet = dcpy(sink_memlets[0])
        result_indices = [
            index for dim, index in enumerate(element_memlet.subset)
            if dim not in axes
        ]
        if not result_indices:
            result_indices = [0]
        result_memlet.subset = subsets.Indices(result_indices)
        result_memlet.wcr = reduce_node.wcr
        result_memlet.num_accesses = result_memlet.subset.num_elements()
        state.add_edge(copy_tasklet, 'out_1', inner_exit, 'IN_1',
                       result_memlet)

        sink_memlet = dcpy(sink_memlets[0])
        sink_range = [
            rng for dim, rng in enumerate(sink_memlet.subset)
            if dim not in axes
        ]
        if not sink_range:
            sink_range = [(0, 0, 1)]
        sink_memlet.subset = subsets.Range(sink_range)
        sink_memlet.wcr = reduce_node.wcr

        if has_outer:
            outer_exit.in_connectors = {'IN_1'}
            outer_exit.out_connectors = {'OUT_1'}
            exit_memlet = dcpy(result_memlet)
            exit_memlet.num_accesses = exit_memlet.subset.num_elements()
            state.add_edge(inner_exit, 'OUT_1', outer_exit, 'IN_1',
                           exit_memlet)
            state.add_edge(outer_exit, 'OUT_1', sinks[0], None, sink_memlet)
        else:
            state.add_edge(inner_exit, 'OUT_1', sinks[0], None, sink_memlet)

        # Finally, detach and drop the original reduce node
        state.remove_edge(state.in_edges(reduce_node)[0])
        state.remove_edge(state.out_edges(reduce_node)[0])
        state.remove_node(reduce_node)
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(
        desc="Copy whole arrays rather than used subset",
        dtype=bool,
        default=False)

    toplevel_trans = Property(
        desc="Make all GPU transients top-level", dtype=bool, default=False)

    register_trans = Property(
        desc="Make all transients inside GPU maps registers",
        dtype=bool,
        default=False)

    sequential_innermaps = Property(
        desc="Make all internal maps Sequential", dtype=bool, default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        """ Two single-node patterns: a map entry (0) and a reduce node (1). """
        map_pattern = nxutil.node_path_graph(GPUTransformMap._map_entry)
        reduce_pattern = nxutil.node_path_graph(GPUTransformMap._reduce)
        return [map_pattern, reduce_pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Checks whether the matched map or reduce node may be moved to
            the GPU.

            :param graph: State in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              `graph`.
            :param expr_index: 0 for a map entry, 1 for a reduce node.
            :param sdfg: The SDFG that contains `graph`.
            :param strict: Unused by this check.
            :return: True or False for expression 0 and 1; None for any
                     other expression index.
        """
        if expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            disallowed = [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES
            if reduce.schedule in disallowed:
                return False
            if sd.is_devicelevel(sdfg, graph, reduce):
                return False

            return True

        if expr_index != 0:
            return None

        entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]

        # Map schedules that are disallowed to transform to GPUs
        disallowed = [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES
        if entry.map.schedule in disallowed:
            return False
        if sd.is_devicelevel(sdfg, graph, entry):
            return False

        # Dynamic map ranges cannot become kernels
        if sd.has_dynamic_map_inputs(graph, entry):
            return False

        # Ensure that map does not include internal arrays that are
        # allocated on non-default space
        for inner in graph.scope_subgraph(entry).nodes():
            if not isinstance(inner, nodes.AccessNode):
                continue
            if inner.desc(sdfg).storage == dtypes.StorageType.Default:
                continue
            if inner.desc(sdfg).storage == dtypes.StorageType.Register:
                continue
            return False

        # If one of the outputs is a stream, do not match
        exit_node = graph.exit_nodes(entry)[0]
        for out_edge in graph.out_edges(exit_node):
            target = graph.memlet_path(out_edge)[-1].dst
            if not isinstance(target, nodes.AccessNode):
                continue
            if isinstance(sdfg.arrays[target.data], data.Stream):
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the string form of the matched node. """
        if GPUTransformMap._reduce in candidate:
            matched = GPUTransformMap._reduce
        else:
            matched = GPUTransformMap._map_entry
        return str(graph.nodes()[candidate[matched]])

    def apply(self, sdfg):
        """ Nests the matched node into its own SDFG and runs
            GPUTransformSDFG on that nested SDFG.

            For a map, the whole map scope is nested; for a reduce node,
            only the node itself. Strict transformations are applied to
            `sdfg` afterwards.

            :param sdfg: The SDFG to modify in place.
        """
        state = sdfg.nodes()[self.state_id]

        # Select the subgraph that is moved into the nested SDFG
        if self.expr_index == 0:
            entry = state.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            to_nest = state.scope_subgraph(entry)
        else:
            reduce_node = state.nodes()[self.subgraph[
                GPUTransformMap._reduce]]
            to_nest = SubgraphView(state, [reduce_node])
        nested = helpers.nest_state_subgraph(
            sdfg, state, to_nest, full_data=self.fullcopy)

        # Avoiding import loops
        from dace.transformation.interstate import GPUTransformSDFG

        # Forward this transformation's options to the SDFG-wide transform
        gpu_transform = GPUTransformSDFG(0, 0, {}, 0)
        gpu_transform.register_trans = self.register_trans
        gpu_transform.sequential_innermaps = self.sequential_innermaps
        gpu_transform.toplevel_trans = self.toplevel_trans
        gpu_transform.apply(nested.sdfg)

        # Inline back as necessary
        sdfg.apply_strict_transformations()
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        """ Two single-node patterns: a map entry (0) and a reduce node (1). """
        return [
            nxutil.node_path_graph(GPUTransformMap._map_entry),
            nxutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def _is_disallowed_schedule(schedule):
        """ Returns True for schedules that may not be transformed to GPU. """
        return (schedule == types.ScheduleType.MPI
                or schedule == types.ScheduleType.GPU_Device
                or schedule == types.ScheduleType.GPU_ThreadBlock)

    @staticmethod
    def _has_gpu_scheduled_scope(sdict, scope_node):
        """ Walks up the scope dictionary starting at `scope_node` and
            returns True if any visited scope has a GPU schedule. """
        current_node = scope_node
        while current_node is not None:
            if (current_node.map.schedule == types.ScheduleType.GPU_Device
                    or current_node.map.schedule ==
                    types.ScheduleType.GPU_ThreadBlock):
                return True
            current_node = sdict[current_node]
        return False

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Checks whether the matched map or reduce node may be moved to
            the GPU.

            :param graph: State in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              `graph`.
            :param expr_index: 0 for a map entry, 1 for a reduce node.
            :param sdfg: The SDFG that contains `graph`.
            :param strict: Unused by this check.
            :return: True or False for expression 0 and 1; None for any
                     other expression index.
        """
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            candidate_map = map_entry.map

            # Map schedules that are disallowed to transform to GPUs
            if GPUTransformMap._is_disallowed_schedule(
                    candidate_map.schedule):
                return False

            # Recursively check parent for GPU schedules
            if GPUTransformMap._has_gpu_scheduled_scope(
                    graph.scope_dict(), map_entry):
                return False

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != types.StorageType.Default
                        and node.desc(sdfg).storage !=
                        types.StorageType.Register):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Map schedules that are disallowed to transform to GPUs
            if GPUTransformMap._is_disallowed_schedule(reduce.schedule):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            if GPUTransformMap._has_gpu_scheduled_scope(sdict, sdict[reduce]):
                return False

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the string form of the matched node. """
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            return str(map_entry)

    @staticmethod
    def _clone_arrays_to_gpu(sdfg, array_nodes, cloned_arrays):
        """ Creates a 'gpu_'-prefixed, transient GPU_Global copy of the array
            behind each access node, and a new access node pointing to it.

            :param sdfg: SDFG in which the cloned arrays are registered.
            :param array_nodes: Access nodes whose arrays should be cloned.
            :param cloned_arrays: Mapping from original to cloned array name;
                                  updated in place. Arrays already present
                                  are not registered a second time.
            :return: Mapping from original array name to the new access node.
        """
        cloned_nodes = {}
        for array_node in array_nodes:
            array = array_node.desc(sdfg)
            gpu_name = 'gpu_' + array_node.data
            if array_node.data not in cloned_arrays:
                sdfg.add_array(gpu_name,
                               array.shape,
                               array.dtype,
                               materialize_func=array.materialize_func,
                               transient=True,
                               storage=types.StorageType.GPU_Global,
                               allow_conflicts=array.allow_conflicts,
                               access_order=array.access_order,
                               strides=array.strides,
                               offset=array.offset)
                cloned_arrays[array_node.data] = gpu_name
            cloned_nodes[array_node.data] = type(array_node)(gpu_name)
        return cloned_nodes

    def apply(self, sdfg):
        """ Sets the matched node's schedule to GPU_Device and routes its
            non-GPU inputs and outputs through cloned GPU arrays.

            :param sdfg: The SDFG to modify in place.
        """
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            # A reduce node acts as both the entry and the exit
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule
        node_schedprop._schedule = types.ScheduleType.GPU_Device

        gpu_storage_types = [
            types.StorageType.GPU_Global,
            types.StorageType.GPU_Shared,
            types.StorageType.GPU_Stack
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(list(graph.out_edges(enode)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add(data_node)
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add(data_node)

        # Second, create a GPU clone of each array. An array that is both
        # read and written shares one clone but gets two access nodes.
        cloned_arrays = {}
        in_cloned_arraynodes = GPUTransformMap._clone_arrays_to_gpu(
            sdfg, in_arrays_to_clone, cloned_arrays)
        out_cloned_arraynodes = GPUTransformMap._clone_arrays_to_gpu(
            sdfg, out_arrays_to_clone, cloned_arrays)

        # Third, connect the cloned arrays to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            # Iterate over a snapshot, since edges into the node are removed
            # and added inside the loop
            for edge in list(graph.in_edges(cnode)):
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(node, edge.src_conn, edge.dst,
                                   edge.dst_conn, newmemlet)

                    # The original memlet now describes the host-to-GPU copy
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(edge.src, None, node, None, edge.data)
        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    graph.remove_edge(edge)
                    newmemlet = copy.copy(edge.data)
                    newmemlet.data = node.data
                    graph.add_edge(edge.src, edge.src_conn, node,
                                   edge.dst_conn, newmemlet)

                    # The original memlet now describes the GPU-to-host copy;
                    # its WCR is cleared (the copied memlet above keeps it)
                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                    graph.add_edge(node, None, edge.dst, None, edge.data)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if (edge.data.data is not None
                        and edge.data.data in cloned_arrays):
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        """ Reports that applying this transformation changes the graph. """
        return True
class GPUTransformMap(pattern_matching.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.

        The transformation matches either a map entry (expression 0) or a
        reduce node (expression 1).
    """

    # Debug counters; only updated when the "debugprint" configuration flag
    # is set, and reported by print_debuginfo(). Note that _arrays_removed
    # counts the arrays that were cloned for the GPU.
    _maps_transformed = 0
    _arrays_removed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    # Pattern nodes used to build the match expressions
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _reduce = nodes.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        """ Returns the pattern graphs of this transformation: a single map
            entry (index 0) and a single reduce node (index 1). """
        return [
            nxutil.node_path_graph(GPUTransformMap._map_entry),
            nxutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def _is_gpu_scope_schedule(schedule):
        """ Returns True if the schedule is GPU_Device or GPU_ThreadBlock. """
        return schedule in (dtypes.ScheduleType.GPU_Device,
                            dtypes.ScheduleType.GPU_ThreadBlock)

    @staticmethod
    def _schedule_disallowed(schedule):
        """ Returns True if a node with the given schedule must not be
            transformed (MPI, GPU_Device or GPU_ThreadBlock). """
        return (schedule == dtypes.ScheduleType.MPI
                or GPUTransformMap._is_gpu_scope_schedule(schedule))

    @staticmethod
    def _inside_gpu_scope(sdict, scope_node):
        """ Walks the scope chain starting at (and including) `scope_node`
            and returns True if any scope on the way has a GPU schedule.

            @param sdict: Scope dictionary mapping each node to its parent
                          scope entry (None at the top level).
            @param scope_node: First scope entry to inspect, or None.
        """
        while scope_node is not None:
            if GPUTransformMap._is_gpu_scope_schedule(
                    scope_node.map.schedule):
                return True
            scope_node = sdict[scope_node]
        return False

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Checks whether the matched map or reduce node can be moved to
            the GPU.

            @param graph: State in which the candidate was matched.
            @param candidate: Mapping from pattern nodes to node indices in
                              `graph`.
            @param expr_index: Index of the matched expression (0 for a map
                               entry, 1 for a reduce node).
            @param sdfg: SDFG used to look up data descriptors.
            @param strict: Not used by this transformation.
            @return: True if the transformation may be applied, False
                     otherwise.
        """
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]

            # Map schedules that are disallowed to transform to GPUs
            if GPUTransformMap._schedule_disallowed(map_entry.map.schedule):
                return False

            # Recursively check parent for GPU schedules
            if GPUTransformMap._inside_gpu_scope(graph.scope_dict(),
                                                 map_entry):
                return False

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if not isinstance(node, nodes.AccessNode):
                    continue
                storage = node.desc(sdfg).storage
                if (storage != dtypes.StorageType.Default
                        and storage != dtypes.StorageType.Register):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Reduce schedules that are disallowed to transform to GPUs
            if GPUTransformMap._schedule_disallowed(reduce.schedule):
                return False

            # Recursively check parent for GPU schedules. The reduce node is
            # not a scope itself, so the walk starts at its parent scope.
            sdict = graph.scope_dict()
            if GPUTransformMap._inside_gpu_scope(sdict, sdict[reduce]):
                return False

            return True

        # Unknown expression index: nothing to match
        return False

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the string representation of the matched reduce node or
            map entry. """
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            return str(map_entry)

    @staticmethod
    def _clone_to_gpu(sdfg, array_nodes, cloned_arrays):
        """ Registers a transient GPU_Global clone ('gpu_<name>') of the data
            of every given access node and creates a new access node for it.

            @param sdfg: SDFG to which the cloned descriptors are added.
            @param array_nodes: Access nodes whose data should be cloned.
            @param cloned_arrays: Mapping from original to cloned data names.
                                  Updated in place; names already present
                                  are not registered a second time.
            @return: Dictionary mapping each original data name to a newly
                     created (not yet inserted) access node of its clone.
        """
        cloned_nodes = {}
        for array_node in array_nodes:
            gpu_name = 'gpu_' + array_node.data
            if array_node.data not in cloned_arrays:
                cloned_array = array_node.desc(sdfg).clone()
                cloned_array.storage = dtypes.StorageType.GPU_Global
                cloned_array.transient = True
                sdfg.add_datadesc(gpu_name, cloned_array)
                cloned_arrays[array_node.data] = gpu_name

            # A separate access node is needed on each side (input/output),
            # even if the descriptor itself was already registered.
            cloned_nodes[array_node.data] = type(array_node)(gpu_name)
        return cloned_nodes

    def apply(self, sdfg):
        """ Applies the transformation on the matched node: sets its schedule
            to GPU_Device, clones the non-GPU arrays it reads and writes into
            transient GPU_Global arrays, and routes the original memlets
            through the clones.

            @param sdfg: The SDFG that contains the matched state.
            @raise TypeError: If an output is a non-transient stream that is
                              directly connected to an array.
        """
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            node_schedprop = cnode.map
            exit_nodes = graph.exit_nodes(cnode)
        else:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            node_schedprop = cnode
            exit_nodes = [cnode]

        # Change schedule (written directly to the underlying attribute)
        node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
        if Config.get_bool("debugprint"):
            GPUTransformMap._maps_transformed += 1

        # Storage types that are considered to already reside on the GPU.
        # dtypes.StorageType.CPU_Pinned is currently not part of this list.
        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
            dtypes.StorageType.GPU_Stack,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        for enode in exit_nodes:
            all_out_edges.extend(graph.out_edges(enode))

        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        out_streamarrays = {}  # Maps array node -> stream node feeding it

        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if isinstance(data_node.desc(sdfg), data.Scalar):
                continue
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add(data_node)

        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if isinstance(data_node.desc(sdfg), data.Scalar):
                continue
            if data_node.desc(sdfg).storage in gpu_storage_types:
                continue

            # Stream directly connected to an array
            if sd.is_array_stream_view(sdfg, graph, data_node):
                datadesc = data_node.desc(sdfg)
                if datadesc.transient is False:
                    raise TypeError('Non-transient stream-array view are '
                                    'unsupported')
                # Add parent node to clone; the stream itself is not cloned
                parent_array_node = graph.out_edges(data_node)[0].dst
                out_arrays_to_clone.add(parent_array_node)
                out_streamarrays[parent_array_node] = data_node
                continue

            out_arrays_to_clone.add(data_node)

        if Config.get_bool("debugprint"):
            GPUTransformMap._arrays_removed += len(in_arrays_to_clone) + len(
                out_arrays_to_clone)

        # Second, create a GPU clone of each array
        cloned_arrays = {}
        in_cloned_arraynodes = GPUTransformMap._clone_to_gpu(
            sdfg, in_arrays_to_clone, cloned_arrays)
        out_cloned_arraynodes = GPUTransformMap._clone_to_gpu(
            sdfg, out_arrays_to_clone, cloned_arrays)

        # Third, connect the cloned arrays to the originals
        # TODO(later): Shift indices and create only the necessary sub-arrays
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in graph.in_edges(cnode):
                if edge.data.data != array_name:
                    continue

                # cloned array -> matched node
                graph.remove_edge(edge)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(node, edge.src_conn, edge.dst, edge.dst_conn,
                               newmemlet)

                # original source -> cloned array (reuses original memlet)
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
                graph.add_edge(edge.src, None, node, None, edge.data)

        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            for edge in all_out_edges:
                if edge.data.data != array_name:
                    continue

                # matched node (or its exit) -> cloned array
                graph.remove_edge(edge)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(edge.src, edge.src_conn, node, edge.dst_conn,
                               newmemlet)

                # cloned array -> original destination (reuses original
                # memlet, with its write-conflict resolution cleared)
                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = edge.data.subset
                graph.add_edge(node, None, edge.dst, None, edge.data)

        # Reconnect stream-arrays
        for array_node, streamnode in out_streamarrays.items():
            # Set stream storage to GPU
            streamnode.desc(sdfg).storage = dtypes.StorageType.GPU_Global

            cloned_node = out_cloned_arraynodes[array_node.data]

            e = graph.out_edges(streamnode)[0]
            graph.remove_edge(e)
            newmemlet = copy.copy(e.data)
            newmemlet.data = cloned_node.data
            # stream -> cloned array
            graph.add_edge(e.src, e.src_conn, cloned_node, e.dst_conn,
                           newmemlet)
            # cloned array -> array
            graph.add_nedge(cloned_node, array_node, e.data)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if (edge.data.data is not None
                        and edge.data.data in cloned_arrays):
                    edge.data.data = cloned_arrays[edge.data.data]

    def modifies_graph(self):
        """ Returns True, since applying this transformation changes the
            graph. """
        return True

    @staticmethod
    def print_debuginfo():
        """ Prints the number of arrays cloned and maps changed for the GPU,
            as accumulated in the class-level debug counters. """
        print("Automatically cloned {} arrays for the GPU.".format(
            GPUTransformMap._arrays_removed))
        print("Automatically changed {} maps for the GPU.".format(
            GPUTransformMap._maps_transformed))