def apply(self, sdfg):
    """
    Apply stencil tiling to the outermost maps of the matched subgraph.

    The method works in three phases:

    1. For every outermost map, infer for each map parameter the outer
       range covered by the map itself and by its direct child maps.
    2. Compare those ranges with the inferred range of one sink map (the
       reference range) to find parameters whose range is identical in
       all maps (invariant dimensions) and, for all other parameters, the
       lower / upper tile offsets.
    3. Strip-mine every map dimension with ``StripMining``, collapse the
       newly created outer maps with ``MapCollapse`` and, if
       ``self.unroll_loops`` is set and all strides are one, unroll the
       inner maps via ``MapExpansion``, ``MapToForLoop`` and
       ``LoopUnroll``.

    Attributes written on ``self``: ``reference_range``, ``tile_sizes``,
    ``tile_offset_lower``, ``tile_offset_upper`` (the latter three are
    reset for every map, so they hold the values of the last map
    processed) and ``_outer_entries`` (list of the outer map entries
    created by strip mining).

    :param sdfg: SDFG to transform; it is modified in place.
    :raises NotImplementedError: if one dimension of a stencil subset is
                                 indexed by more than one symbol, or if
                                 the same data container is indexed by
                                 different symbols in different memlets.
    :raises RuntimeError: if the difference between an inferred range and
                          the reference range cannot be evaluated.
    """
    graph = sdfg.nodes()[self.state_id]
    subgraph = self.subgraph_view(sdfg)
    map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph)

    result = StencilTiling.topology(sdfg, graph, map_entries)
    (children_dict, parent_dict, sink_maps) = result

    # next up, calculate inferred ranges for each map
    # for each map entry, this contains a tuple of dicts:
    # each of those maps from data_name of the array to
    # inferred outer ranges. An inferred outer range is created
    # by taking the union of ranges of inner subsets corresponding
    # to that data and substituting this subset by the min / max of the
    # parametrized map boundaries
    # finally, from these outer ranges we can easily calculate
    # strides and tile sizes required for every map
    inferred_ranges = defaultdict(dict)

    # create array of reverse topologically sorted map entries
    # to iterate over: a map is emitted once all of its children
    # have been emitted, starting from the sink maps
    topo_reversed = []
    visited = set()
    queue = set(sink_maps)
    while queue:
        element = next(e for e in queue if not children_dict[e] - visited)
        topo_reversed.append(element)
        visited.add(element)
        queue.remove(element)
        queue.update(parent_dict[element])

    # main loop
    # first get coverage dicts for each map entry
    # for each map, contains a tuple of two dicts
    # each of those two maps from data name to outer range
    coverage = {}
    for map_entry in map_entries:
        coverage[map_entry] = StencilTiling.coverage_dicts(
            sdfg, graph, map_entry, outer_range=True)

    # we have a mapping from data name to outer range
    # however we want a mapping from map parameters to outer ranges
    # for this we need to find out how all array dimensions map to
    # outer ranges
    variable_mapping = defaultdict(list)
    for map_entry in topo_reversed:
        curr_map = map_entry.map

        # first find out variable mapping
        for e in itertools.chain(
                graph.out_edges(map_entry),
                graph.in_edges(graph.exit_node(map_entry))):
            mapping = []
            for dim in e.data.subset:
                syms = set()
                for d in dim:
                    syms |= symbolic.symlist(d).keys()
                if len(syms) > 1:
                    raise NotImplementedError(
                        "One incoming or outgoing stencil subset is indexed "
                        "by multiple map parameters. "
                        "This is not supported yet.")
                try:
                    mapping.append(syms.pop())
                except KeyError:
                    # just append None if there is no map symbol in it.
                    # we don't care for now.
                    mapping.append(None)

            # The dict is keyed by data name, so the lookup has to use the
            # data name as well (looking up the memlet object itself never
            # matches and would silently skip the consistency check).
            edge_data_name = e.data.data
            if edge_data_name in variable_mapping:
                # the mapping has to be the same everywhere,
                # else we might run into problems
                if variable_mapping[edge_data_name] != mapping:
                    raise NotImplementedError(
                        "Data container '%s' is indexed by different map "
                        "parameters in different memlets. "
                        "This is not supported yet." % edge_data_name)
            else:
                variable_mapping[edge_data_name] = mapping

        # now do mapping data name -> outer range
        # and from that infer mapping variable -> outer range
        local_ranges = {dn: None for dn in coverage[map_entry][1].keys()}
        for data_name, cov in coverage[map_entry][1].items():
            local_ranges[data_name] = subsets.union(
                local_ranges[data_name], cov)
            # now look at proceeding maps
            # and union those subsets -> could be larger with stencil indent
            for child_map in children_dict[map_entry]:
                if data_name in coverage[child_map][0]:
                    local_ranges[data_name] = subsets.union(
                        local_ranges[data_name],
                        coverage[child_map][0][data_name])

        # final assignment: combine local_ranges and variable_mapping
        # together into inferred_ranges
        inferred_ranges[map_entry] = {p: None for p in curr_map.params}
        for data_name, ranges in local_ranges.items():
            # use .get so that unknown data names do not insert an empty
            # mapping that would later fail the consistency check above
            for param, r in zip(variable_mapping.get(data_name, []),
                                ranges):
                # create new range from this subset and assign
                rng = subsets.Range((r, ))
                if param:
                    inferred_ranges[map_entry][param] = subsets.union(
                        inferred_ranges[map_entry][param], rng)

    # get parameters -- should all be the same
    params = next(iter(map_entries)).map.params.copy()
    # define reference range as inferred range of one of the sink maps
    self.reference_range = inferred_ranges[next(iter(sink_maps))]
    if self.debug:
        print("StencilTiling::Reference Range", self.reference_range)

    # next up, search for the ranges that don't change
    invariant_dims = []
    for idx, p in enumerate(params):
        different = False
        if self.reference_range[p] is None:
            invariant_dims.append(idx)
            warnings.warn(
                f"StencilTiling::No Stencil pattern detected for parameter {p}"
            )
            continue
        for m in map_entries:
            if inferred_ranges[m][p] != self.reference_range[p]:
                different = True
                break
        if not different:
            invariant_dims.append(idx)
            warnings.warn(
                f"StencilTiling::No Stencil pattern detected for parameter {p}"
            )

    # during stripmining, we will create new outer map entries
    # for easy access
    self._outer_entries = set()
    # with inferred_ranges constructed, we can begin to strip mine
    for map_entry in map_entries:
        # Retrieve map entry and exit nodes.
        curr_map = map_entry.map
        stripmine_subgraph = {
            StripMining._map_entry: graph.nodes().index(map_entry)
        }

        sdfg_id = sdfg.sdfg_id
        last_map_entry = None
        original_schedule = map_entry.schedule
        self.tile_sizes = []
        self.tile_offset_lower = []
        self.tile_offset_upper = []

        # strip mining each dimension where necessary
        removed_maps = 0
        for dim_idx, param in enumerate(map_entry.map.params):
            # get current_node tile size; if fewer strides than dimensions
            # were given, the last stride is reused
            if dim_idx >= len(self.strides):
                tile_stride = symbolic.pystr_to_symbolic(self.strides[-1])
            else:
                tile_stride = symbolic.pystr_to_symbolic(
                    self.strides[dim_idx])

            trivial = False

            if dim_idx in invariant_dims:
                self.tile_sizes.append(tile_stride)
                self.tile_offset_lower.append(0)
                self.tile_offset_upper.append(0)
            else:
                target_range_current = inferred_ranges[map_entry][param]
                reference_range_current = self.reference_range[param]

                min_diff = symbolic.SymExpr(
                    reference_range_current.min_element()[0]
                    - target_range_current.min_element()[0])
                max_diff = symbolic.SymExpr(
                    target_range_current.max_element()[0]
                    - reference_range_current.max_element()[0])

                try:
                    min_diff = symbolic.evaluate(min_diff, {})
                    max_diff = symbolic.evaluate(max_diff, {})
                except TypeError as err:
                    raise RuntimeError("Symbolic evaluation of map "
                                       "ranges failed. Please check "
                                       "your parameters and match.") from err

                self.tile_sizes.append(tile_stride + max_diff + min_diff)
                self.tile_offset_lower.append(
                    symbolic.pystr_to_symbolic(str(min_diff)))
                self.tile_offset_upper.append(
                    symbolic.pystr_to_symbolic(str(max_diff)))

            # get calculated parameters
            tile_size = self.tile_sizes[-1]

            # from here on dim_idx indexes the (possibly shrunk) map;
            # dim_idx + removed_maps is the original dimension index
            dim_idx -= removed_maps
            # If map or tile sizes are trivial, skip strip-mining map dimension
            # special cases:
            # if tile size is trivial AND we have an invariant dimension, skip
            if tile_size == curr_map.range.size()[dim_idx] and (
                    dim_idx + removed_maps) in invariant_dims:
                continue

            # trivial map: we just continue
            if curr_map.range.size()[dim_idx] in [0, 1]:
                continue

            if tile_size == 1 and tile_stride == 1 and (
                    dim_idx + removed_maps) in invariant_dims:
                trivial = True
                removed_maps += 1

            # indent all map ranges accordingly and then perform
            # strip mining on these. Offset inner maps accordingly afterwards
            range_tuple = (curr_map.range[dim_idx][0]
                           + self.tile_offset_lower[-1],
                           curr_map.range[dim_idx][1]
                           - self.tile_offset_upper[-1],
                           curr_map.range[dim_idx][2])
            curr_map.range[dim_idx] = range_tuple

            stripmine = StripMining(sdfg_id, self.state_id,
                                    stripmine_subgraph, 0)
            stripmine.tiling_type = 'ceilrange'
            stripmine.dim_idx = dim_idx
            stripmine.new_dim_prefix = self.prefix if not trivial else ''
            # use tile_stride for both -- we will extend
            # the inner tiles later
            stripmine.tile_size = str(tile_stride)
            stripmine.tile_stride = str(tile_stride)
            outer_map = stripmine.apply(sdfg)
            outer_map.schedule = original_schedule

            # apply to the new map the schedule of the original one
            map_entry.schedule = self.schedule

            # if tile stride is 1, we can make a nice simplification by just
            # taking the overapproximated inner range as inner range
            # this eliminates the min/max in the range which
            # enables loop unrolling
            if tile_stride == 1:
                map_entry.range[dim_idx] = tuple(
                    symbolic.SymExpr(el._approx_expr) if isinstance(
                        el, symbolic.SymExpr) else el
                    for el in map_entry.range[dim_idx])

            # in map_entry: enlarge tiles by upper and lower offset
            # doing it this way and not via stripmine strides ensures
            # that the max gets changed as well
            old_range = map_entry.range[dim_idx]
            map_entry.range[dim_idx] = (
                (old_range[0] - self.tile_offset_lower[-1]),
                (old_range[1] + self.tile_offset_upper[-1]),
                old_range[2])

            # We have to propagate here for correct outer volume and subset sizes
            _propagate_node(graph, map_entry)
            _propagate_node(graph, graph.exit_node(map_entry))

            # usual tiling pipeline
            if last_map_entry:
                new_map_entry = graph.in_edges(map_entry)[0].src
                mapcollapse_subgraph = {
                    MapCollapse._outer_map_entry:
                    graph.node_id(last_map_entry),
                    MapCollapse._inner_map_entry:
                    graph.node_id(new_map_entry)
                }
                mapcollapse = MapCollapse(sdfg_id, self.state_id,
                                          mapcollapse_subgraph, 0)
                mapcollapse.apply(sdfg)
            last_map_entry = graph.in_edges(map_entry)[0].src

        # add last instance of map entries to _outer_entries
        if last_map_entry:
            self._outer_entries.add(last_map_entry)

        # Map Unroll Feature: only unroll if conditions are met:
        # Only unroll if at least one of the inner map ranges is strictly larger than 1
        # Only unroll if strides all are one
        if self.unroll_loops and all(s == 1 for s in self.strides) and any(
                s not in [0, 1] for s in map_entry.range.size()):
            num_params = len(map_entry.params)
            if num_params > 1:
                expansion_subgraph = {
                    MapExpansion.map_entry: graph.nodes().index(map_entry)
                }
                trafo_expansion = MapExpansion(sdfg.sdfg_id,
                                               sdfg.nodes().index(graph),
                                               expansion_subgraph, 0)
                trafo_expansion.apply(sdfg)

            # collect the chain of nested one-dimensional maps
            inner_entry = map_entry
            expanded_entries = [inner_entry]
            for _ in range(num_params - 1):
                inner_entry = graph.out_edges(inner_entry)[0].dst
                expanded_entries.append(inner_entry)

            # convert and unroll from the innermost map outwards
            for entry in reversed(expanded_entries):
                # MapToForLoop
                for_loop_subgraph = {
                    MapToForLoop._map_entry: graph.nodes().index(entry)
                }
                trafo_for_loop = MapToForLoop(sdfg.sdfg_id,
                                              sdfg.nodes().index(graph),
                                              for_loop_subgraph, 0)
                trafo_for_loop.apply(sdfg)
                nsdfg = trafo_for_loop.nsdfg

                # LoopUnroll
                guard = trafo_for_loop.guard
                end = trafo_for_loop.after_state
                begin = next(e.dst for e in nsdfg.out_edges(guard)
                             if e.dst != end)

                unroll_subgraph = {
                    DetectLoop._loop_guard: nsdfg.nodes().index(guard),
                    DetectLoop._loop_begin: nsdfg.nodes().index(begin),
                    DetectLoop._exit_state: nsdfg.nodes().index(end)
                }
                transformation = LoopUnroll(0, 0, unroll_subgraph, 0)
                transformation.apply(nsdfg)

        elif self.unroll_loops:
            warnings.warn(
                "Did not unroll loops. Either all ranges are equal to "
                "one or range difference is symbolic.")

    self._outer_entries = list(self._outer_entries)
def fuse(self, sdfg, graph, map_entries, do_not_override=None, **kwargs):
    """ Takes the map_entries specified and tries to fuse maps.

        All maps have to be extended into outer and inner map
        (use MapExpansion as a pre-pass).

        Arrays that don't exist outside the subgraph get pushed
        into the map and their data dimension gets cropped.
        Otherwise the original array is taken.

        For every output respective connections are created automatically.

        The fused outer map entry is stored in ``self._global_map_entry``.
        The original map entries and exits are removed from ``graph``.

        :param sdfg: SDFG
        :param graph: State
        :param map_entries: Map Entries (class MapEntry) of the outer maps
                            which we want to fuse
        :param do_not_override: List of data names whose corresponding nodes
                                are fully contained within the subgraph
                                but should not be augmented/transformed
                                nevertheless.
        :param kwargs: Accepted for interface compatibility; not read here.
    """

    # if there are no maps, return immediately
    if len(map_entries) == 0:
        return

    do_not_override = do_not_override or []

    # get maps and map exits
    maps = [map_entry.map for map_entry in map_entries]
    map_exits = [graph.exit_node(map_entry) for map_entry in map_entries]

    # See function documentation for an explanation of these variables
    node_config = SubgraphFusion.get_adjacent_nodes(sdfg, graph,
                                                    map_entries)
    (in_nodes, intermediate_nodes, out_nodes) = node_config

    if self.debug:
        print("SubgraphFusion::In_nodes", in_nodes)
        print("SubgraphFusion::Out_nodes", out_nodes)
        print("SubgraphFusion::Intermediate_nodes", intermediate_nodes)

    # all maps are assumed to have the same params and range in order
    global_map = nodes.Map(label="outer_fused",
                           params=maps[0].params,
                           ndrange=maps[0].range)
    global_map_entry = nodes.MapEntry(global_map)
    global_map_exit = nodes.MapExit(global_map)

    schedule = map_entries[0].schedule
    global_map_entry.schedule = schedule
    graph.add_node(global_map_entry)
    graph.add_node(global_map_exit)

    # next up, for any intermediate node, find whether it only appears
    # in the subgraph or also somewhere else / as an input
    # create new transients for nodes that are in out_nodes and
    # intermediate_nodes simultaneously
    # also check which dimensions of each transient data element correspond
    # to map axes and write this information into a dict.
    node_info = self.prepare_intermediate_nodes(sdfg, graph, in_nodes,
                                                out_nodes,
                                                intermediate_nodes,
                                                map_entries, map_exits,
                                                do_not_override)

    (subgraph_contains_data, transients_created,
     invariant_dimensions) = node_info
    if self.debug:
        print(
            "SubgraphFusion:: {Intermediate_node: subgraph_contains_data} dict"
        )
        print(subgraph_contains_data)

    inconnectors_dict = {}
    # Dict for saving incoming nodes and their assigned connectors
    # Format: {access_node: (edge, in_conn, out_conn)}

    for map_entry, map_exit in zip(map_entries, map_exits):
        # handle inputs
        # TODO: dynamic map range -- this is fairly unrealistic in such a setting
        for edge in graph.in_edges(map_entry):
            src = edge.src
            mmt = graph.memlet_tree(edge)
            out_edges = [child.edge for child in mmt.root().children]

            if src in in_nodes:
                in_conn = None
                out_conn = None
                if src in inconnectors_dict:
                    # no need to augment subset of outer edge.
                    # will do this at the end in one pass.
                    in_conn = inconnectors_dict[src][1]
                    out_conn = inconnectors_dict[src][2]
                else:
                    next_conn = global_map_entry.next_connector()
                    in_conn = 'IN_' + next_conn
                    out_conn = 'OUT_' + next_conn
                    global_map_entry.add_in_connector(in_conn)
                    global_map_entry.add_out_connector(out_conn)

                    inconnectors_dict[src] = (edge, in_conn, out_conn)

                    # reroute in edge via global_map_entry
                    self.copy_edge(graph, edge,
                                   new_dst=global_map_entry,
                                   new_dst_conn=in_conn)

                # map out edges to new map
                for out_edge in out_edges:
                    self.copy_edge(graph, out_edge,
                                   new_src=global_map_entry,
                                   new_src_conn=out_conn)

            else:
                # connect directly
                for out_edge in out_edges:
                    mm = dcpy(out_edge.data)
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=src,
                                   new_src_conn=None,
                                   new_data=mm)

        for edge in graph.out_edges(map_entry):
            # special case: for nodes that have no data connections
            if not edge.src_conn:
                self.copy_edge(graph, edge, new_src=global_map_entry)

        ######################################

        for edge in graph.in_edges(map_exit):
            if not edge.dst_conn:
                # no destination connector, path ends here.
                self.copy_edge(graph, edge, new_dst=global_map_exit)
                continue
            # find corresponding out_edges for current edge, cannot use mmt anymore
            # (connector names are matched by stripping the 'OUT' / 'IN'
            # prefixes)
            out_edges = [
                oedge for oedge in graph.out_edges(map_exit)
                if oedge.src_conn[3:] == edge.dst_conn[2:]
            ]

            # Tuple to store in/out connector port that might be created
            port_created = None

            for out_edge in out_edges:
                dst = out_edge.dst

                if dst in intermediate_nodes & out_nodes:
                    # create connection through global map from
                    # dst to dst_transient that was created
                    dst_transient = transients_created[dst]
                    next_conn = global_map_exit.next_connector()
                    in_conn = 'IN_' + next_conn
                    out_conn = 'OUT_' + next_conn
                    global_map_exit.add_in_connector(in_conn)
                    global_map_exit.add_out_connector(out_conn)

                    # for each transient created, create a union
                    # of outgoing memlets' subsets. this is
                    # a cheap fix to override assignments in invariant
                    # dimensions
                    subset_union = None
                    for oe in graph.out_edges(transients_created[dst]):
                        subset_union = subsets.union(subset_union,
                                                     oe.data.subset)

                    inner_memlet = dcpy(edge.data)
                    for i, _ in enumerate(edge.data.subset):
                        if i in invariant_dimensions[dst.label]:
                            inner_memlet.subset[i] = subset_union[i]

                    inner_memlet.other_subset = dcpy(inner_memlet.subset)

                    graph.add_edge(dst, None, global_map_exit, in_conn,
                                   inner_memlet)
                    mm_outer = propagate_memlet(graph, inner_memlet,
                                                global_map_entry,
                                                union_inner_edges=False)

                    graph.add_edge(global_map_exit, out_conn,
                                   dst_transient, None, mm_outer)

                    # remove edge from dst to dst_transient that was created
                    # in intermediate preparation.
                    for e in graph.out_edges(dst):
                        if e.dst == dst_transient:
                            graph.remove_edge(e)
                            break

                # handle separately: intermediate_nodes and pure out nodes
                # case 1: intermediate_nodes: can just redirect edge
                if dst in intermediate_nodes:
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=edge.src,
                                   new_src_conn=edge.src_conn,
                                   new_data=dcpy(edge.data))

                # case 2: pure out node: connect to outer array node
                if dst in (out_nodes - intermediate_nodes):
                    if edge.dst != global_map_exit:
                        next_conn = global_map_exit.next_connector()
                        in_conn = 'IN_' + next_conn
                        out_conn = 'OUT_' + next_conn
                        global_map_exit.add_in_connector(in_conn)
                        global_map_exit.add_out_connector(out_conn)
                        self.copy_edge(graph,
                                       edge,
                                       new_dst=global_map_exit,
                                       new_dst_conn=in_conn)
                        port_created = (in_conn, out_conn)
                    else:
                        # reuse the port that was created for this edge;
                        # port_created is a plain (in_conn, out_conn) tuple
                        in_conn, out_conn = port_created

                    # map
                    graph.add_edge(global_map_exit, out_conn, dst, None,
                                   dcpy(out_edge.data))

        # maps are now ready to be discarded
        # all connected edges will be finally removed as well
        graph.remove_node(map_entry)
        graph.remove_node(map_exit)

    # create a mapping from data arrays to offsets
    # for later memlet adjustments
    min_offsets = dict()

    # do one pass to augment all transient arrays
    data_intermediate = {node.data for node in intermediate_nodes}
    for data_name in data_intermediate:
        if subgraph_contains_data[data_name]:
            all_nodes = [
                n for n in intermediate_nodes if n.data == data_name
            ]
            in_edges = list(
                chain.from_iterable(graph.in_edges(n) for n in all_nodes))

            # union of all incoming subsets, with the invariant
            # dimensions removed
            in_edges_iter = iter(in_edges)
            in_edge = next(in_edges_iter)
            target_subset = dcpy(in_edge.data.subset)
            target_subset.pop(invariant_dimensions[data_name])
            # executed if there are multiple in_edges
            for in_edge in in_edges_iter:
                target_subset_curr = dcpy(in_edge.data.subset)
                target_subset_curr.pop(invariant_dimensions[data_name])
                target_subset = subsets.union(target_subset,
                                              target_subset_curr)

            min_offsets_cropped = target_subset.min_element_approx()
            # calculate the new transient array size.
            target_subset.offset(min_offsets_cropped, True)

            # re-add invariant dimensions with offset 0 and save to min_offsets
            min_offset = []
            index = 0
            for i in range(len(sdfg.data(data_name).shape)):
                if i in invariant_dimensions[data_name]:
                    min_offset.append(0)
                else:
                    min_offset.append(min_offsets_cropped[index])
                    index += 1

            min_offsets[data_name] = min_offset

            # determine the shape of the new array.
            new_data_shape = []
            index = 0
            for i, sz in enumerate(sdfg.data(data_name).shape):
                if i in invariant_dimensions[data_name]:
                    new_data_shape.append(sz)
                else:
                    new_data_shape.append(target_subset.size()[index])
                    index += 1

            # stride of dimension i is the product of all later extents
            new_data_strides = [
                data._prod(new_data_shape[i + 1:])
                for i in range(len(new_data_shape))
            ]

            new_data_totalsize = data._prod(new_data_shape)
            new_data_offset = [0] * len(new_data_shape)

            # augment.
            transient_to_transform = sdfg.data(data_name)
            transient_to_transform.shape = new_data_shape
            transient_to_transform.strides = new_data_strides
            transient_to_transform.total_size = new_data_totalsize
            transient_to_transform.offset = new_data_offset
            transient_to_transform.lifetime = dtypes.AllocationLifetime.Scope
            transient_to_transform.storage = self.transient_allocation

        else:
            # don't modify data container - array is needed outside
            # of subgraph.

            # hack: set lifetime to State if allocation has only been
            # scope so far to avoid allocation issues
            if sdfg.data(
                    data_name).lifetime == dtypes.AllocationLifetime.Scope:
                sdfg.data(
                    data_name).lifetime = dtypes.AllocationLifetime.State

    # do one pass to adjust the memlets of in-between transients
    for node in intermediate_nodes:
        # all incoming edges to node
        in_edges = graph.in_edges(node)
        # outgoing edges going to another fused part
        out_edges = graph.out_edges(node)

        # memlets of created transient:
        # correct data names
        if node in transients_created:
            transient_in_edges = graph.in_edges(transients_created[node])
            transient_out_edges = graph.out_edges(transients_created[node])
            for edge in chain(transient_in_edges, transient_out_edges):
                for e in graph.memlet_tree(edge):
                    if e.data.data == node.data:
                        e.data.data += '_OUT'

        # memlets of all in between transients:
        # offset memlets if array has been augmented
        if subgraph_contains_data[node.data]:
            # get min_offset
            min_offset = min_offsets[node.data]
            # re-add invariant dimensions with offset 0
            for iedge in in_edges:
                for edge in graph.memlet_tree(iedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)

                # nested SDFG: adjust arrays connected
                if isinstance(iedge.src, nodes.NestedSDFG):
                    nsdfg = iedge.src.sdfg
                    # read the connector from the incoming edge itself
                    # instead of the leaked inner loop variable
                    nested_data_name = iedge.src_conn
                    self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                             nested_data_name)

            for cedge in out_edges:
                for edge in graph.memlet_tree(cedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)

                    # nested SDFG: adjust arrays connected
                    if isinstance(edge.dst, nodes.NestedSDFG):
                        nsdfg = edge.dst.sdfg
                        nested_data_name = edge.dst_conn
                        self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                                 nested_data_name)

            # if in_edges has several entries:
            # put other_subset into out_edges for correctness
            if len(in_edges) > 1:
                for oedge in out_edges:
                    if oedge.dst == global_map_exit and \
                            oedge.data.other_subset is None:
                        oedge.data.other_subset = dcpy(oedge.data.subset)
                        oedge.data.other_subset.offset(min_offset, True)

    # consolidate edges if desired
    if self.consolidate:
        consolidate_edges_scope(graph, global_map_entry)
        consolidate_edges_scope(graph, global_map_exit)

    # propagate edges adjacent to global map entry and exit
    # if desired
    if self.propagate:
        _propagate_node(graph, global_map_entry)
        _propagate_node(graph, global_map_exit)

    # create a hook for outside access to global_map
    self._global_map_entry = global_map_entry
    if self.schedule_innermaps is not None:
        for node in graph.scope_children()[global_map_entry]:
            if isinstance(node, nodes.MapEntry):
                node.map.schedule = self.schedule_innermaps