def evaluate(self, config, cutout, measurements: int, **kwargs) -> float:
    """
    Score one on-the-fly map fusion configuration on a cutout.

    :param config: Indexable pair. ``config[0] == 0`` selects the unfused
                   baseline; otherwise ``config[1]`` holds the node ids (in
                   the cutout's start state) of the maps to fuse.
    :param cutout: JSON form of the cutout SDFG; a fresh candidate SDFG is
                   deserialized from it on every call.
    :param measurements: Forwarded unchanged to ``self.measure``.
    :param kwargs: Accepted for interface compatibility; not used here.
    :return: The result of ``self.measure`` for the baseline or the fused
             candidate, or ``math.inf`` when the configuration cannot be
             evaluated (no map in the state, fewer than two maps selected,
             fusion not applicable, or nothing fused).
    """
    dreport = self._sdfg.get_instrumented_data()
    candidate = dace.SDFG.from_json(cutout)
    state = candidate.start_state

    # Skip states that do not contain any map.
    if not any(isinstance(node, dace.nodes.MapEntry) for node in state):
        return math.inf

    if config[0] == 0:
        # Baseline: measure the candidate without fusing anything.
        return self.measure(candidate, dreport, measurements)

    map_ids = config[1]
    if len(map_ids) < 2:
        return math.inf

    maps_ = list(map(state.node, map_ids))
    subgraph = helpers.subgraph_from_maps(sdfg=candidate, graph=state, map_entries=maps_)

    map_fusion = sg.SubgraphOTFFusion()
    map_fusion.setup_match(subgraph, candidate.sdfg_id, candidate.node_id(state))

    # A configuration whose fusion is not applicable must not be scored
    # with the runtime of the unmodified candidate: that would make it look
    # like a valid (baseline-fast) configuration and waste a measurement.
    if not map_fusion.can_be_applied(state, candidate):
        return math.inf

    fuse_counter = map_fusion.apply(state, candidate)
    if fuse_counter == 0:
        return math.inf

    return self.measure(candidate, dreport, measurements)
def brute_force(self):
    '''
    Enumerate all combinations of two or more entries of
    ``self._map_entries`` and yield the accepted ones with their score.

    A combination is accepted when no condition function is set or when
    ``self._condition_function(self._sdfg, subgraph)`` is truthy. The score
    is ``self._scoring_function(subgraph)`` if a scoring function is set,
    else 0.

    :return: Generator of ``(tuple_of_map_entries, score)`` when
             ``self.mode == 'map_entries'``, otherwise of
             ``(subgraph, score)``.
    '''
    for size in range(2, len(self._map_entries) + 1):
        for combo in itertools.combinations(self._map_entries, size):
            candidate = helpers.subgraph_from_maps(self._sdfg, self._graph, combo)

            # Without a condition function every combination is accepted.
            if self._condition_function:
                accepted = self._condition_function(self._sdfg, candidate)
            else:
                accepted = True
            if not accepted:
                continue

            # The score defaults to 0 when no scoring function is available.
            if self._scoring_function:
                score = self._scoring_function(candidate)
            else:
                score = 0

            if self.mode == 'map_entries':
                yield (tuple(combo), score)
            else:
                yield (candidate, score)
def apply(self, sdfg):
    """
    Fuse the matched subgraph, falling back to stencil tiling if enabled.

    Steps, in order:

    1. If ``self.allow_expansion`` is set, run ``MultiExpansion`` on the
       subgraph when it reports to be applicable.
    2. If ``SubgraphFusion`` is applicable, configure and apply it, then
       stop.
    3. Otherwise, if ``self.allow_tiling == True`` and ``StencilTiling`` is
       applicable, apply the tiling and then ``SubgraphFusion`` on the
       subgraph built from the tiling's outer map entries. This second
       fusion is applied without another applicability check.

    Whenever a fusion is applied, ``self._global_map_entry`` is copied from
    the fusion transformation. If none of the paths applies, a warning is
    emitted.

    :param sdfg: The SDFG the transformations are applied to.
    """

    def configure_and_fuse(fusion):
        # Hand this transformation's settings to SubgraphFusion, run it and
        # remember the resulting global map entry.
        fusion.debug = self.debug
        fusion.transient_allocation = self.transient_allocation
        fusion.schedule_innermaps = self.schedule_innermaps
        fusion.apply(sdfg)
        self._global_map_entry = fusion._global_map_entry

    view = self.subgraph_view(sdfg)
    state = view.graph
    scopes = state.scope_dict()
    outer_maps = helpers.get_outermost_scope_maps(sdfg, state, view, scopes)
    # The value is not used below; next() raises StopIteration if there is
    # no outermost map at all.
    next(iter(outer_maps))

    if self.allow_expansion:
        expansion = MultiExpansion()
        expansion.setup_match(view, self.sdfg_id, self.state_id)
        expansion.permutation_only = not self.expansion_split
        if expansion.can_be_applied(sdfg, view):
            expansion.apply(sdfg)

    # Preferred path: plain subgraph fusion.
    fusion = SubgraphFusion()
    fusion.setup_match(view, self.sdfg_id, self.state_id)
    if fusion.can_be_applied(sdfg, self.subgraph_view(sdfg)):
        configure_and_fuse(fusion)
        return

    # Fallback path: tile the stencils first, then fuse the tiled maps.
    if self.allow_tiling == True:
        tiling = StencilTiling()
        tiling.setup_match(view, self.sdfg_id, self.state_id)
        if tiling.can_be_applied(sdfg, self.subgraph_view(sdfg)):
            tiling.debug = self.debug
            tiling.unroll_loops = self.stencil_unroll_loops
            tiling.strides = self.stencil_strides
            tiling.apply(sdfg)

            # Tiling introduced new outer map entries; fuse over those.
            tiled_view = helpers.subgraph_from_maps(sdfg, state, tiling._outer_entries)
            tiled_fusion = SubgraphFusion()
            tiled_fusion.setup_match(tiled_view, self.sdfg_id, self.state_id)
            configure_and_fuse(tiled_fusion)
            return

    warnings.warn("CompositeFusion::Apply did not perform as expected")
def traverse(self, current: List, forbidden: Set):
    """
    Recursively enumerate sets of maps, growing ``current`` by neighbours.

    For a non-empty ``current`` the corresponding subgraph is built and
    checked with ``self._condition_function`` (if set) and scored with
    ``self._scoring_function`` (if set and the condition holds, else 0).
    Accepted sets are yielded. The search continues from a set when it was
    accepted, when ``self.prune == False``, or when it has exactly one
    element. For an empty ``current`` every key of
    ``self._adjacency_list`` is used as a starting point.

    :param current: Maps selected so far; appended to and popped from in
                    place during the recursion.
    :param forbidden: Maps that must not be added in this branch.
    :return: Generator of ``(tuple(current), score)`` when
             ``self.mode == 'map_entries'``, otherwise of
             ``(subgraph, score)``.
    """
    if len(current) > 0:
        # Subgraph spanned by the maps currently under inspection.
        candidate = helpers.subgraph_from_maps(self._sdfg, self._graph, current, self._scope_children)

        if self._condition_function:
            accepted = self._condition_function(self._sdfg, candidate)
        else:
            accepted = True

        score = 0
        if accepted and self._scoring_function:
            score = self._scoring_function(candidate)

        # Decide where to continue before yielding, unless pruned.
        if accepted or self.prune == False or len(current) == 1:
            reachable = {
                neighbor
                for member in current
                for neighbor in self._adjacency_list[member]
                if neighbor not in current and neighbor not in forbidden
            }
            # Sorted by label for determinism and correctness during pruning.
            frontier = sorted(reachable, key=lambda me: self._labels[me])
        else:
            frontier = []

        if accepted:
            if self.mode == 'map_entries':
                yield (tuple(current), score)
            else:
                yield (candidate, score)
    else:
        # Very first call: every node is a possible starting point.
        frontier = sorted(set(self._adjacency_list.keys()), key=lambda me: self._labels[me])

    # Children already explored at this level are forbidden for their later
    # siblings, so the same set is not produced twice.
    explored_siblings = set()
    for child in frontier:
        current.append(child)
        yield from self.traverse(current, forbidden | explored_siblings)
        current.pop()
        explored_siblings.add(child)
def apply(self, config: Tuple[int, List[int]], label: str, **kwargs) -> None:
    """
    Apply an on-the-fly map fusion configuration to the tuned SDFG.

    :param config: ``config[0] == 0`` means baseline and nothing is done;
                   otherwise ``config[1]`` lists the node ids of the maps to
                   fuse, resolved in the cutout of the addressed state.
    :param label: Dot-separated string with exactly three fields; the first
                  two are parsed as the index into
                  ``self._sdfg.all_sdfgs_recursive()`` and the state id.
                  The third field is not used.
    :param kwargs: Accepted for interface compatibility; not used here.
    """
    if config[0] == 0:
        # Baseline configuration: leave the SDFG untouched.
        return

    nsdfg_text, state_text, _unused_label = label.split(".")
    nsdfg_index = int(nsdfg_text)
    state_id = int(state_text)

    sdfg = list(self._sdfg.all_sdfgs_recursive())[nsdfg_index]
    state = sdfg.node(state_id)

    # The map ids in the configuration refer to nodes of the state's cutout.
    cutout = cutter.cutout_state(state, *state.nodes(), make_copy=False)
    node_by_id = cutout.start_state.node
    maps_ = [node_by_id(map_id) for map_id in config[1]]
    subgraph = helpers.subgraph_from_maps(sdfg=sdfg, graph=state, map_entries=maps_)

    fusion = sg.SubgraphOTFFusion()
    fusion.setup_match(subgraph, sdfg.sdfg_id, state_id)
    if not fusion.can_be_applied(state, sdfg):
        return

    fuse_counter = fusion.apply(state, sdfg)
    print(f"Fusing {fuse_counter} maps")
def apply(self, config: Tuple[int, List[int]], label: str, **kwargs) -> None:
    """
    Apply a composite subgraph fusion configuration to the tuned SDFG.

    The fusion is set up with tiling allowed and inner maps scheduled as
    ``dace.ScheduleType.GPU_Device``; it is applied only if it reports to
    be applicable.

    :param config: ``config[0] == 0`` means baseline and nothing is done;
                   otherwise ``config[1]`` lists the node ids of the maps to
                   fuse, resolved in the cutout of the addressed state.
    :param label: Dot-separated string with exactly three fields; the first
                  two are parsed as the index into
                  ``self._sdfg.all_sdfgs_recursive()`` and the state id.
                  The third field is not used.
    :param kwargs: Accepted for interface compatibility; not used here.
    """
    if config[0] == 0:
        # Baseline configuration: leave the SDFG untouched.
        return

    nsdfg_token, state_token, _ = label.split(".")
    all_sdfgs = list(self._sdfg.all_sdfgs_recursive())
    sdfg = all_sdfgs[int(nsdfg_token)]
    state_id = int(state_token)
    state = sdfg.node(state_id)

    # The map ids in the configuration refer to nodes of the state's cutout.
    cutout = cutter.cutout_state(state, *state.nodes(), make_copy=False)
    resolve = cutout.start_state.node
    maps_ = [resolve(map_id) for map_id in config[1]]
    subgraph = helpers.subgraph_from_maps(sdfg=sdfg, graph=state, map_entries=maps_)

    fusion = sg.CompositeFusion()
    fusion.setup_match(subgraph, sdfg.sdfg_id, state_id)
    fusion.allow_tiling = True
    fusion.schedule_innermaps = dace.ScheduleType.GPU_Device
    if not fusion.can_be_applied(sdfg, subgraph):
        return

    fusion.apply(sdfg)
def evaluate(self, config, cutout, measurements: int, **kwargs) -> float:
    """
    Score one composite subgraph fusion configuration on a cutout.

    The candidate's start state is instrumented with
    ``dace.InstrumentationType.GPU_Events`` before anything else is checked.

    :param config: Indexable pair. ``config[0] == 0`` selects the unfused
                   baseline; otherwise ``config[1]`` holds the node ids (in
                   the cutout's start state) of the maps to fuse.
    :param cutout: JSON form of the cutout SDFG; a fresh candidate SDFG is
                   deserialized from it on every call.
    :param measurements: Forwarded unchanged to ``self.measure``.
    :param kwargs: Accepted for interface compatibility; not used here.
    :return: The result of ``self.measure`` for the baseline or the fused
             candidate, or ``math.inf`` when the state has no map, fewer
             than two maps are selected, or the fusion is not applicable.
    """
    dreport = self._sdfg.get_instrumented_data()
    candidate = dace.SDFG.from_json(cutout)
    candidate.start_state.instrument = dace.InstrumentationType.GPU_Events

    # Skip states that do not contain any map.
    contains_map = any(isinstance(node, dace.nodes.MapEntry) for node in candidate.start_state)
    if not contains_map:
        return math.inf

    if config[0] == 0:
        # Baseline: measure the candidate without fusing anything.
        return self.measure(candidate, dreport, measurements)

    map_ids = config[1]
    if len(map_ids) < 2:
        return math.inf

    state = candidate.start_state
    resolve = state.node
    maps_ = [resolve(map_id) for map_id in map_ids]
    subgraph = helpers.subgraph_from_maps(sdfg=candidate, graph=state, map_entries=maps_)

    fusion = sg.CompositeFusion()
    fusion.setup_match(subgraph, candidate.sdfg_id, candidate.node_id(state))
    fusion.allow_tiling = True
    fusion.schedule_innermaps = dace.ScheduleType.GPU_Device

    if not fusion.can_be_applied(candidate, subgraph):
        return math.inf

    fusion.apply(candidate)
    return self.measure(candidate, dreport, measurements)
def can_be_applied(graph: dace.SDFGState,
                   candidate: Dict[pm.PatternNode, int],
                   expr_index: int,
                   sdfg: dace.SDFG,
                   permissive: bool = False):
    """
    Decide whether the matched map is an element-wise array operation.

    The match is rejected when

    * the map range mentions one of the symbols ``commsize``, ``Px`` or
      ``Py``;
    * a code node in the map's subgraph uses a map parameter as a free
      symbol (such maps cannot be flattened currently; see, for example,
      samples/simple/mandelbrot.py);
    * an input is neither a scalar, nor an array/view;
    * an output is written with write-conflict resolution or is not an
      array/view;
    * any access to an array/view (inputs of shape ``[1]`` excepted) covers
      more than one element, or its minimum element does not contain every
      map parameter.

    :param graph: State containing the matched map.
    :param candidate: Mapping from pattern nodes to node ids in ``graph``.
    :param expr_index: Not used by this check.
    :param sdfg: SDFG whose ``arrays`` describe the accessed data.
    :param permissive: Not used by this check.
    :return: True if none of the rejection criteria applies, else False.
    """
    map_entry = graph.node(candidate[ElementWiseArrayOperation._map_entry])
    map_exit = graph.exit_node(map_entry)
    params = [dace.symbol(p) for p in map_entry.map.params]

    def each_access_hits_one_element_using_all_params(accesses):
        # Every access must address exactly one element, and every map
        # parameter has to show up among the indices of that element.
        for access in accesses:
            if access.num_elements() != 1:
                return False
            missing = set(params)
            for idx in access.min_element():
                missing.discard(idx)
            if missing:
                return False
        return True

    for reserved in ("commsize", "Px", "Py"):
        if reserved in map_entry.map.range.free_symbols:
            return False

    # Map iterators used inside the code of a node prevent flattening.
    for node in subgraph_from_maps(sdfg, graph, [map_entry]):
        if not isinstance(node, dace.nodes.CodeNode):
            continue
        if any(str(p) in node.free_symbols for p in params):
            return False

    # Group the subsets read through the map entry by data descriptor;
    # memlets without data are ignored.
    reads = {}
    for _src, _src_conn, _dst, _dst_conn, memlet in graph.out_edges(map_entry):
        if not memlet.data:
            continue
        reads.setdefault(sdfg.arrays[memlet.data], []).append(memlet.subset)

    for desc, accesses in reads.items():
        if isinstance(desc, dace.data.Scalar):
            continue
        if not isinstance(desc, (dace.data.Array, dace.data.View)):
            return False
        if list(desc.shape) == [1]:
            continue
        if not each_access_hits_one_element_using_all_params(accesses):
            return False

    # Group the subsets written through the map exit by data descriptor.
    writes = {}
    for _src, _src_conn, _dst, _dst_conn, memlet in graph.in_edges(map_exit):
        if memlet.wcr:
            return False
        writes.setdefault(sdfg.arrays[memlet.data], []).append(memlet.subset)

    for desc, accesses in writes.items():
        if not isinstance(desc, (dace.data.Array, dace.data.View)):
            return False
        if not each_access_hits_one_element_using_all_params(accesses):
            return False

    return True
def transfer(sdfg: dace.SDFG, tuner, k: int = 5):
    """
    Transfer the best fusion patterns found by ``tuner`` onto ``sdfg``.

    The tuner is run without applying its results, the top ``k``
    configurations are turned into patterns, and every state of ``sdfg``
    and of its nested SDFGs that has at least two top-level maps is
    searched for these patterns. For a matching pattern the fusion is tried
    on a deep copy of the state's cutout and measured with
    ``optim_utils.subprocess_measure``; the candidate that is strictly
    faster than the baseline and all other candidates is then applied to
    the real state. This is repeated per state until no pattern improves on
    the baseline or the baseline measurement is ``math.inf``.

    :param sdfg: SDFG to transform; must provide instrumented data.
    :param tuner: An ``OnTheFlyMapFusionTuner`` instance.
    :param k: Number of best configurations to extract patterns from.
    :raises AssertionError: If ``tuner`` has the wrong type or ``sdfg`` has
                            no instrumented data.
    """
    assert isinstance(tuner, OnTheFlyMapFusionTuner)

    dreport = sdfg.get_instrumented_data()
    assert dreport is not None

    tuning_report = tuner.optimize(apply=False)
    best_configs = cutout_tuner.CutoutTuner.top_k_configs(tuning_report, k=k)
    subgraph_patterns = tuner._extract_patterns(best_configs)

    def top_level_maps(state):
        # Map entries of ``state`` that are not nested in another map.
        return [
            node for node in state.nodes()
            if isinstance(node, dace.nodes.MapEntry) and xfh.get_parent_map(state, node) is None
        ]

    def recorded_inputs(cutout_sdfg):
        # Recorded values for every non-transient data node of the cutout.
        inputs = {}
        for cstate in cutout_sdfg.nodes():
            for dnode in cstate.data_nodes():
                if cutout_sdfg.arrays[dnode.data].transient:
                    continue
                try:
                    inputs[dnode.data] = dreport.get_first_version(dnode.data)
                except Exception:
                    # Best effort: data for which the lookup fails is left
                    # out. Only Exception is caught so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    continue
        return inputs

    for nsdfg in sdfg.all_sdfgs_recursive():
        for state in nsdfg.states():
            if len(top_level_maps(state)) < 2:
                continue

            # States for which the cutout raises AttributeError are skipped.
            try:
                cutter.cutout_state(state, *(state.nodes()), make_copy=False)
            except AttributeError:
                continue

            # Keep fusing as long as some pattern beats the baseline.
            while True:
                base_runtime = None
                best_pattern = None
                best_pattern_runtime = math.inf

                for pattern in subgraph_patterns:
                    maps = top_level_maps(state)
                    if len(maps) < 2:
                        continue

                    # Group the maps by descriptor and count each descriptor.
                    maps_desc = {}
                    state_desc = Counter()
                    for map_entry in maps:
                        map_desc = OnTheFlyMapFusionTuner.map_descriptor(state, map_entry)
                        state_desc.update({map_desc: 1})
                        maps_desc.setdefault(map_desc, []).append(map_entry)

                    # The state must offer every descriptor of the pattern
                    # at least as often as the pattern asks for it.
                    included = all(key in state_desc and pattern[key] <= state_desc[key] for key in pattern)
                    if not included:
                        continue

                    if base_runtime is None:
                        # Measure the unfused state once per round.
                        baseline = cutter.cutout_state(state, *(state.nodes()), make_copy=False)
                        baseline.start_state.instrument = dace.InstrumentationType.GPU_Events
                        base_runtime = optim_utils.subprocess_measure(baseline,
                                                                      recorded_inputs(baseline),
                                                                      i=192,
                                                                      j=192)
                        best_pattern_runtime = base_runtime
                        if base_runtime == math.inf:
                            break

                    # Construct subgraph greedily
                    subgraph_maps = []
                    for desc in pattern:
                        num = pattern[desc]
                        subgraph_maps.extend(maps_desc[desc][:num])

                    # Try the fusion on a deep copy of the cutout. The maps
                    # are carried over by their node ids in the cutout.
                    experiment_sdfg_ = cutter.cutout_state(state, *(state.nodes()), make_copy=False)
                    experiment_state_ = experiment_sdfg_.start_state
                    experiment_maps_ids = [experiment_state_.node_id(me) for me in subgraph_maps]
                    experiment_sdfg = copy.deepcopy(experiment_sdfg_)
                    experiment_state = experiment_sdfg.start_state
                    experiment_state.instrument = dace.InstrumentationType.GPU_Events

                    experiment_maps = [experiment_state.node(m_id) for m_id in experiment_maps_ids]
                    experiment_subgraph = helpers.subgraph_from_maps(sdfg=experiment_sdfg,
                                                                     graph=experiment_state,
                                                                     map_entries=experiment_maps)

                    map_fusion = sg.SubgraphOTFFusion()
                    map_fusion.setup_match(experiment_subgraph, experiment_sdfg.sdfg_id,
                                           experiment_sdfg.node_id(experiment_state))
                    if not map_fusion.can_be_applied(experiment_state, experiment_sdfg):
                        continue

                    try:
                        experiment_fuse_counter = map_fusion.apply(experiment_state, experiment_sdfg)
                    except Exception:
                        # A failing experiment just disqualifies the pattern.
                        continue

                    if experiment_fuse_counter == 0:
                        continue

                    fused_runtime = optim_utils.subprocess_measure(experiment_sdfg,
                                                                   recorded_inputs(experiment_sdfg),
                                                                   i=192,
                                                                   j=192)
                    if fused_runtime >= best_pattern_runtime:
                        continue

                    best_pattern = subgraph_maps
                    best_pattern_runtime = fused_runtime

                if best_pattern is None:
                    break

                # Apply the winning pattern to the real state, then search
                # the modified state again.
                subgraph = helpers.subgraph_from_maps(sdfg=nsdfg, graph=state, map_entries=best_pattern)
                map_fusion = sg.SubgraphOTFFusion()
                map_fusion.setup_match(subgraph, nsdfg.sdfg_id, nsdfg.node_id(state))
                map_fusion.apply(state, nsdfg)
def greedy_fuse(graph_or_subgraph: GraphViewType,
                validate_all: bool,
                device: dace.dtypes.DeviceType = dace.dtypes.DeviceType.CPU,
                recursive: bool = True,
                stencil: bool = False,
                stencil_tile=None,
                permutations_only: bool = True,
                expand_reductions: bool = False) -> None:
    '''
    Greedily fuses maps of an SDFG or graph, operating in-place.

    For an SDFG, the SDFG is simplified, ``MapFusion`` is applied
    repeatedly and the function recurses into every state. For a state or
    subgraph, fusible sets of maps are enumerated greedily and fused with
    ``CompositeFusion``; afterwards the function recurses into nested SDFGs
    and, if ``recursive`` is set, into the resulting map scopes.

    :param graph_or_subgraph: SDFG, SDFGState or Subgraph
    :param validate_all: Validate SDFG or graph at each fusion step
    :param device: Device type to specialize for
    :param recursive: Fuse recursively within (fused and unfused) scopes
    :param stencil: Perform stencil fusion instead of regular fusion
    :param stencil_tile: StencilTiling Tile size, default if None
    :param permutations_only: Disallow splitting of maps during
                              MultiExpansion stage
    :param expand_reductions: Expand all reduce nodes before fusion
    '''
    debugprint = config.Config.get_bool('debugprint')

    # Options handed unchanged to every recursive call.
    forwarded = dict(validate_all=validate_all,
                     device=device,
                     recursive=recursive,
                     stencil=stencil,
                     stencil_tile=stencil_tile,
                     permutations_only=permutations_only,
                     expand_reductions=expand_reductions)

    if isinstance(graph_or_subgraph, SDFG):
        # If we have an SDFG, simplify it, handle the trivial cases with
        # MapFusion and recurse into its states.
        graph_or_subgraph.simplify(validate_all=validate_all)
        graph_or_subgraph.apply_transformations_repeated(MapFusion, validate_all=validate_all)
        for state in graph_or_subgraph.nodes():
            greedy_fuse(state, **forwarded)
        return

    # We are in a state or a subgraph of a state.
    if isinstance(graph_or_subgraph, SDFGState):
        sdfg = graph_or_subgraph.parent
        sdfg.apply_transformations_repeated(MapFusion, validate_all=validate_all)
        graph = graph_or_subgraph
        subgraph = SubgraphView(graph, graph.nodes())
    else:
        sdfg = graph_or_subgraph.graph.parent
        graph = graph_or_subgraph.graph
        subgraph = graph_or_subgraph

    # Transformation object that is only used as the condition function.
    fusion_condition = CompositeFusion(SubgraphView(graph, graph.nodes()))

    applied_transformations = 0

    if stencil:
        # adjust tiling settings
        fusion_condition.allow_tiling = True
        fusion_condition.schedule_innermaps = dtypes.ScheduleType.Sequential
        if device == dtypes.DeviceType.GPU:
            fusion_condition.stencil_unroll_loops = True
        # tile size
        if stencil_tile:
            fusion_condition.stencil_strides = stencil_tile
        # always only permutate for now with stencil tiles
        fusion_condition.expansion_split = False
    else:
        fusion_condition.allow_tiling = False
        if expand_reductions:
            # The loop variables must not reuse the names ``graph`` or
            # ``node``: ``graph`` is the state that is enumerated, fused
            # and validated below.
            for other_state in sdfg.nodes():
                for candidate in other_state.nodes():
                    if isinstance(candidate, dace.libraries.standard.nodes.Reduce):
                        try:
                            ReduceExpansion.apply_to(sdfg, reduce=candidate)
                        except ValueError:
                            # Best effort: a ValueError raised for a
                            # reduction is ignored.
                            pass
        # permutation settings
        fusion_condition.expansion_split = not permutations_only

    # Within the state: greedily enumerate fusible components and apply
    # the transformation to each of them.
    enumerator = GreedyEnumerator(sdfg, graph, subgraph, condition_function=fusion_condition.can_be_applied)
    for map_entries in enumerator:
        if len(map_entries) > 1:
            current_subgraph = xfsh.subgraph_from_maps(sdfg, graph, map_entries)
            cf = CompositeFusion(current_subgraph)
            # Transfer the settings of the condition object, so that the
            # applied transformation matches what was checked. This
            # includes the loop unrolling requested for GPU stencils.
            cf.allow_tiling = fusion_condition.allow_tiling
            cf.schedule_innermaps = fusion_condition.schedule_innermaps
            cf.expansion_split = fusion_condition.expansion_split
            cf.stencil_strides = fusion_condition.stencil_strides
            cf.stencil_unroll_loops = fusion_condition.stencil_unroll_loops

            cf.apply(sdfg)
            applied_transformations += 1

        if recursive:
            # Continue inside the fused map, or inside the single map that
            # was not fused with anything.
            global_entry = cf._global_map_entry if len(map_entries) > 1 else map_entries[0]
            greedy_fuse(graph.scope_subgraph(global_entry, include_entry=False, include_exit=False), **forwarded)

    for node in graph_or_subgraph.nodes():
        if isinstance(node, nodes.NestedSDFG):
            greedy_fuse(node.sdfg, **forwarded)

    if applied_transformations > 0:
        if debugprint:
            if stencil:
                print(f"Applied {applied_transformations} TileFusion")
            else:
                print(f"Applied {applied_transformations} SubgraphFusion")

        if validate_all:
            graph.validate()
def iterator(self):
    """
    Greedily partition the maps into fusible sets and yield each set.

    Starting points are taken from a priority queue that is seeded with
    ``self._source_maps`` and ordered by ``QueuedEntry``. From a starting
    map, neighbours from ``self._adjacency_list`` are explored in priority
    order; a neighbour joins the current set if
    ``self._condition_function`` accepts the subgraph of the set plus that
    neighbour. Maps that were not accepted remain available as starting
    points of later sets.

    :return: Generator of tuples of map entries when
             ``self.mode == 'map_entries'``, otherwise of subgraphs built
             with ``helpers.subgraph_from_maps``. Nothing is yielded when
             the adjacency list is empty.
    """
    if len(self._adjacency_list) == 0:
        return

    def make_entry(map_entry):
        return QueuedEntry(map_entry, self._labels[map_entry], reverse=self._reverse)

    # Maps that already belong to a yielded or currently built set.
    added = set()

    # Queue / set of candidate starting points for the next sets.
    outer_queued = set(self._source_maps)
    outer_queue = [make_entry(me) for me in self._source_maps]
    # heappop/heappush are only correct on a list that satisfies the heap
    # invariant, which an arbitrarily ordered list does not guarantee.
    heapq.heapify(outer_queue)

    while outer_queue:
        # Pick the next starting point that is not part of a set yet.
        start = None
        while outer_queue:
            entry = heapq.heappop(outer_queue)
            if entry.map_entry not in added:
                start = entry
                break
        if start is None:
            break

        current_set = set()
        inner_queue = [start]
        inner_queued = {start.map_entry}
        while inner_queue:
            current_map = heapq.heappop(inner_queue).map_entry

            # The first map is always taken; any further map only if the
            # enlarged set still satisfies the condition function.
            if current_set:
                subgraph = helpers.subgraph_from_maps(self._sdfg, self._graph, current_set | {current_map})
                if not self._condition_function(self._sdfg, subgraph):
                    continue

            added.add(current_map)
            current_set.add(current_map)

            # Continue the search with the neighbours of the new member.
            for neighbor in self._adjacency_list[current_map]:
                if neighbor in added:
                    continue
                # Possible starting point of a later set.
                if neighbor not in outer_queued:
                    heapq.heappush(outer_queue, make_entry(neighbor))
                    outer_queued.add(neighbor)
                # Possible member of the current set.
                if neighbor not in inner_queued:
                    heapq.heappush(inner_queue, make_entry(neighbor))
                    inner_queued.add(neighbor)

        if self.mode == 'map_entries':
            yield tuple(current_set)
        else:
            yield helpers.subgraph_from_maps(self._sdfg, self._graph, current_set)