def evaluate(self, config, cutout, measurements: int, **kwargs) -> float:
        """
        Measure the runtime of a cutout for one tuning configuration.

        :param config: Pair whose first item selects the baseline (``0``) and
                       whose second item lists the node ids of the maps to
                       fuse within the cutout's start state.
        :param cutout: JSON representation of the cutout SDFG.
        :param measurements: Forwarded unchanged to ``self.measure``.
        :return: Result of ``self.measure``, or ``math.inf`` if the start
                 state holds no map entry, fewer than two maps are given, or
                 the applied fusion reports zero fused maps.
        """
        dreport = self._sdfg.get_instrumented_data()

        candidate = dace.SDFG.from_json(cutout)
        state = candidate.start_state

        # States without a single map entry are not worth measuring.
        if not any(isinstance(n, dace.nodes.MapEntry) for n in state):
            return math.inf

        # Configuration 0 is the untransformed baseline.
        if config[0] == 0:
            return self.measure(candidate, dreport, measurements)

        map_ids = config[1]
        if len(map_ids) < 2:
            return math.inf

        map_entries = [state.node(map_id) for map_id in map_ids]
        subgraph = helpers.subgraph_from_maps(sdfg=candidate,
                                              graph=state,
                                              map_entries=map_entries)

        otf_fusion = sg.SubgraphOTFFusion()
        otf_fusion.setup_match(subgraph, candidate.sdfg_id,
                               candidate.node_id(state))
        # NOTE(review): when the fusion is not applicable the unmodified
        # candidate is still measured below - confirm this is intended.
        if otf_fusion.can_be_applied(state, candidate):
            if otf_fusion.apply(state, candidate) == 0:
                return math.inf

        return self.measure(candidate, dreport, measurements)
示例#2
0
    def brute_force(self):
        '''
        Enumerate every combination of two or more entries of
        self._map_entries, smallest combinations first, and yield the
        ones accepted by the condition function together with their score.

        Each yielded item is ``(tuple_of_map_entries, score)`` when
        ``self.mode == 'map_entries'`` and ``(subgraph, score)`` otherwise.
        The score is 0 when no scoring function is set.
        '''
        for size in range(2, len(self._map_entries) + 1):
            for combo in itertools.combinations(self._map_entries, size):
                candidate = helpers.subgraph_from_maps(self._sdfg,
                                                       self._graph, combo)

                # without a condition function every combination is accepted
                if self._condition_function:
                    accepted = self._condition_function(self._sdfg, candidate)
                else:
                    accepted = True
                if not accepted:
                    continue

                # only accepted combinations are scored
                if self._scoring_function:
                    score = self._scoring_function(candidate)
                else:
                    score = 0

                if self.mode == 'map_entries':
                    yield tuple(combo), score
                else:
                    yield candidate, score
示例#3
0
    def apply(self, sdfg):
        """
        Fuse the matched subgraph, trying plain fusion first and tiled
        fusion second.

        Steps, in order:
        1. If ``self.allow_expansion`` is set, run ``MultiExpansion`` when it
           is applicable.
        2. Run ``SubgraphFusion`` when it is applicable.
        3. Otherwise, if ``self.allow_tiling`` is ``True``, run
           ``StencilTiling`` when applicable, followed by ``SubgraphFusion``
           on the tiled outer map entries.

        On success the fused map entry is stored in
        ``self._global_map_entry``. If neither path applies, a warning is
        emitted.
        """

        def configure_fusion(fusion):
            # hand this transformation's settings down to SubgraphFusion
            fusion.debug = self.debug
            fusion.transient_allocation = self.transient_allocation
            fusion.schedule_innermaps = self.schedule_innermaps

        subgraph = self.subgraph_view(sdfg)
        graph = subgraph.graph
        scope_dict = graph.scope_dict()
        map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph,
                                                       scope_dict)
        # The value is not used afterwards; the lookup raises if there is
        # no outermost map entry at all.
        first_entry = next(iter(map_entries))

        if self.allow_expansion:
            expansion = MultiExpansion()
            expansion.setup_match(subgraph, self.sdfg_id, self.state_id)
            expansion.permutation_only = not self.expansion_split
            if expansion.can_be_applied(sdfg, subgraph):
                expansion.apply(sdfg)

        fusion = SubgraphFusion()
        fusion.setup_match(subgraph, self.sdfg_id, self.state_id)
        if fusion.can_be_applied(sdfg, self.subgraph_view(sdfg)):
            configure_fusion(fusion)
            fusion.apply(sdfg)
            self._global_map_entry = fusion._global_map_entry
            return

        if self.allow_tiling == True:
            tiling = StencilTiling()
            tiling.setup_match(subgraph, self.sdfg_id, self.state_id)
            if tiling.can_be_applied(sdfg, self.subgraph_view(sdfg)):
                tiling.debug = self.debug
                tiling.unroll_loops = self.stencil_unroll_loops
                tiling.strides = self.stencil_strides
                tiling.apply(sdfg)

                # tiling created new outer maps: fuse those instead
                tiled_entries = tiling._outer_entries
                subgraph = helpers.subgraph_from_maps(sdfg, graph,
                                                      tiled_entries)
                fusion = SubgraphFusion()
                fusion.setup_match(subgraph, self.sdfg_id, self.state_id)
                configure_fusion(fusion)

                fusion.apply(sdfg)
                self._global_map_entry = fusion._global_map_entry
                return

        warnings.warn("CompositeFusion::Apply did not perform as expected")
示例#4
0
    def traverse(self, current: List, forbidden: Set):
        """
        Recursively enumerate connected sets of maps by backtracking.

        :param current: Maps selected so far. The list is extended and
                        shrunk in place during the recursion.
        :param forbidden: Maps that must not be added on this branch.

        Yields ``(tuple(current), score)`` when
        ``self.mode == 'map_entries'`` and ``(subgraph, score)`` otherwise,
        for every visited set that passes the condition function. The score
        is 0 when no scoring function is set.
        """
        if len(current) == 0:
            # very first call: every known map is a possible starting point
            go_next = sorted(set(self._adjacency_list.keys()),
                             key=lambda me: self._labels[me])
        else:
            current_subgraph = helpers.subgraph_from_maps(
                self._sdfg, self._graph, current, self._scope_children)

            # without a condition function every set is accepted
            if self._condition_function:
                conditional_eval = self._condition_function(
                    self._sdfg, current_subgraph)
            else:
                conditional_eval = True

            score = 0
            if conditional_eval and self._scoring_function:
                score = self._scoring_function(current_subgraph)

            # Expand further unless this branch is pruned. Single-map sets
            # are always expanded.
            go_next = []
            if conditional_eval or self.prune == False or len(current) == 1:
                neighbours = {
                    m
                    for c in current for m in self._adjacency_list[c]
                    if m not in current and m not in forbidden
                }
                # sorted by label for determinism and correctness during
                # pruning
                go_next = sorted(neighbours, key=lambda me: self._labels[me])

            if conditional_eval:
                if self.mode == 'map_entries':
                    yield tuple(current), score
                else:
                    yield current_subgraph, score

        # Children already explored at this level become forbidden for
        # their later siblings.
        explored_siblings = set()
        for child in go_next:
            current.append(child)
            yield from self.traverse(current, forbidden | explored_siblings)
            current.pop()
            explored_siblings.add(child)
    def apply(self, config: Tuple[int, List[int]], label: str,
              **kwargs) -> None:
        """
        Apply on-the-fly map fusion for ``config`` to the state named by
        ``label``.

        :param config: ``(selector, map_ids)``. A selector of ``0`` is the
                       baseline and leaves everything untouched.
        :param label: String of the form ``"<nsdfg index>.<state id>.<name>"``.
                      The third component is not used here.
        """
        if config[0] == 0:
            return

        nsdfg_part, state_part, _unused_label = label.split(".")
        nsdfg_index = int(nsdfg_part)
        state_id = int(state_part)

        sdfg = list(self._sdfg.all_sdfgs_recursive())[nsdfg_index]
        state = sdfg.node(state_id)
        cutout = cutter.cutout_state(state, *state.nodes(), make_copy=False)

        # the map ids of the configuration are resolved on the cutout's
        # start state
        cutout_state = cutout.start_state
        map_entries = [cutout_state.node(map_id) for map_id in config[1]]
        subgraph = helpers.subgraph_from_maps(sdfg=sdfg,
                                              graph=state,
                                              map_entries=map_entries)

        otf_fusion = sg.SubgraphOTFFusion()
        otf_fusion.setup_match(subgraph, sdfg.sdfg_id, state_id)
        if otf_fusion.can_be_applied(state, sdfg):
            fuse_counter = otf_fusion.apply(state, sdfg)
            print(f"Fusing {fuse_counter} maps")
示例#6
0
    def apply(self, config: Tuple[int, List[int]], label: str,
              **kwargs) -> None:
        """
        Apply composite subgraph fusion for ``config`` to the state named by
        ``label``.

        :param config: ``(selector, map_ids)``. A selector of ``0`` is the
                       baseline and leaves everything untouched.
        :param label: String of the form ``"<nsdfg index>.<state id>.<name>"``.
                      The third component is not used here.
        """
        if config[0] == 0:
            return

        nsdfg_part, state_part, _ = label.split(".")
        sdfg = list(self._sdfg.all_sdfgs_recursive())[int(nsdfg_part)]
        state_id = int(state_part)
        state = sdfg.node(state_id)
        cutout = cutter.cutout_state(state, *state.nodes(), make_copy=False)

        # the map ids of the configuration are resolved on the cutout's
        # start state
        cutout_state = cutout.start_state
        map_entries = [cutout_state.node(map_id) for map_id in config[1]]
        subgraph = helpers.subgraph_from_maps(sdfg=sdfg,
                                              graph=state,
                                              map_entries=map_entries)

        fusion = sg.CompositeFusion()
        fusion.setup_match(subgraph, sdfg.sdfg_id, state_id)
        fusion.allow_tiling = True
        fusion.schedule_innermaps = dace.ScheduleType.GPU_Device
        if fusion.can_be_applied(sdfg, subgraph):
            fusion.apply(sdfg)
示例#7
0
    def evaluate(self, config, cutout, measurements: int, **kwargs) -> float:
        """
        Measure the runtime of a cutout for one tuning configuration.

        The cutout's start state is instrumented with GPU events before
        anything else is checked.

        :param config: Pair whose first item selects the baseline (``0``) and
                       whose second item lists the node ids of the maps to
                       fuse within the cutout's start state.
        :param cutout: JSON representation of the cutout SDFG.
        :param measurements: Forwarded unchanged to ``self.measure``.
        :return: Result of ``self.measure``, or ``math.inf`` if the start
                 state holds no map entry, fewer than two maps are given, or
                 the composite fusion cannot be applied.
        """
        dreport = self._sdfg.get_instrumented_data()

        candidate = dace.SDFG.from_json(cutout)
        state = candidate.start_state
        state.instrument = dace.InstrumentationType.GPU_Events

        # States without a single map entry are not worth measuring.
        if not any(isinstance(n, dace.nodes.MapEntry) for n in state):
            return math.inf

        # Configuration 0 is the untransformed baseline.
        if config[0] == 0:
            return self.measure(candidate, dreport, measurements)

        map_ids = config[1]
        if len(map_ids) < 2:
            return math.inf

        map_entries = [state.node(map_id) for map_id in map_ids]
        subgraph = helpers.subgraph_from_maps(sdfg=candidate,
                                              graph=state,
                                              map_entries=map_entries)

        fusion = sg.CompositeFusion()
        fusion.setup_match(subgraph, candidate.sdfg_id,
                           candidate.node_id(state))
        fusion.allow_tiling = True
        fusion.schedule_innermaps = dace.ScheduleType.GPU_Device
        if not fusion.can_be_applied(candidate, subgraph):
            return math.inf

        fusion.apply(candidate)
        return self.measure(candidate, dreport, measurements)
示例#8
0
    def can_be_applied(graph: dace.SDFGState,
                       candidate: Dict[pm.PatternNode, int],
                       expr_index: int,
                       sdfg: dace.SDFG,
                       permissive: bool = False):
        """
        Decide whether the matched map qualifies for this transformation.

        The match is rejected when:
        - the map range mentions ``commsize``, ``Px`` or ``Py``;
        - a code node inside the map scope uses a map parameter as a free
          symbol;
        - an input is neither a scalar nor an array/view;
        - an array/view input (other than shape ``[1]``) or any output is
          accessed with more than one element, or with an index that does
          not contain every map parameter;
        - an output is written with write-conflict resolution or is not an
          array/view.

        :return: True if none of the rejection criteria apply.
        """
        map_entry = graph.node(candidate[ElementWiseArrayOperation._map_entry])
        map_exit = graph.exit_node(map_entry)
        params = [dace.symbol(p) for p in map_entry.map.params]

        def is_elementwise(accesses):
            # every access must be a single element whose index mentions
            # all map parameters
            for access in accesses:
                if access.num_elements() != 1:
                    return False
                if set(params).difference(access.min_element()):
                    return False
            return True

        range_symbols = map_entry.map.range.free_symbols
        if any(name in range_symbols for name in ("commsize", "Px", "Py")):
            return False

        # If the map iterators are used in the code of a Tasklet,
        # then we cannot flatten them (currently).
        # See, for example, samples/simple/mandelbrot.py
        param_names = [str(p) for p in params]
        for node in subgraph_from_maps(sdfg, graph, [map_entry]):
            if not isinstance(node, dace.nodes.CodeNode):
                continue
            if any(name in node.free_symbols for name in param_names):
                return False

        # group the subsets read through the map entry by data descriptor
        inputs = dict()
        for _, _, _, _, memlet in graph.out_edges(map_entry):
            if not memlet.data:
                continue
            inputs.setdefault(sdfg.arrays[memlet.data],
                              []).append(memlet.subset)

        for desc, accesses in inputs.items():
            if isinstance(desc, dace.data.Scalar):
                continue
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            if list(desc.shape) == [1]:
                continue
            if not is_elementwise(accesses):
                return False

        # group the subsets written through the map exit by data descriptor
        outputs = dict()
        for _, _, _, _, memlet in graph.in_edges(map_exit):
            if memlet.wcr:
                return False
            outputs.setdefault(sdfg.arrays[memlet.data],
                               []).append(memlet.subset)

        for desc, accesses in outputs.items():
            if not isinstance(desc, (dace.data.Array, dace.data.View)):
                return False
            if not is_elementwise(accesses):
                return False

        return True
    def transfer(sdfg: dace.SDFG, tuner, k: int = 5):
        """
        Transfer the best fusion patterns found by ``tuner`` to all states
        of ``sdfg`` (nested SDFGs included), modifying them in place.

        The ``k`` best configurations of the tuning report are turned into
        patterns, i.e. mappings from a map descriptor to the number of such
        maps. For every state with at least two top-level maps, each pattern
        contained in the state is tried on a deep copy of the state's cutout
        and timed. The fastest pattern that beats the unfused baseline is
        applied to the real state. This repeats per state until no pattern
        improves on the baseline.

        :param sdfg: SDFG to transform. ``get_instrumented_data()`` must not
                     return None.
        :param tuner: Instance of ``OnTheFlyMapFusionTuner``.
        :param k: Number of top configurations to extract patterns from.
        """
        assert isinstance(tuner, OnTheFlyMapFusionTuner)

        dreport = sdfg.get_instrumented_data()
        assert dreport is not None

        tuning_report = tuner.optimize(apply=False)
        best_configs = cutout_tuner.CutoutTuner.top_k_configs(tuning_report,
                                                              k=k)
        subgraph_patterns = tuner._extract_patterns(best_configs)

        def top_level_maps(state):
            # map entries of the state that are not nested in another map
            return [
                node for node in state.nodes()
                if isinstance(node, dace.nodes.MapEntry)
                and xfh.get_parent_map(state, node) is None
            ]

        def collect_inputs(measured_sdfg):
            # first recorded version of every non-transient container that
            # is accessed in the given SDFG
            inputs = {}
            for cstate in measured_sdfg.nodes():
                for dnode in cstate.data_nodes():
                    if measured_sdfg.arrays[dnode.data].transient:
                        continue
                    try:
                        inputs[dnode.data] = dreport.get_first_version(
                            dnode.data)
                    except Exception:
                        # Best effort: containers without recorded data are
                        # left out. KeyboardInterrupt/SystemExit propagate.
                        continue
            return inputs

        for nsdfg in sdfg.all_sdfgs_recursive():
            for state in nsdfg.states():
                if len(top_level_maps(state)) < 2:
                    continue

                # Probe only: states that cannot be cut out are skipped.
                try:
                    cutter.cutout_state(state,
                                        *(state.nodes()),
                                        make_copy=False)
                except AttributeError:
                    continue

                while True:
                    base_runtime = None
                    best_pattern = None
                    best_pattern_runtime = math.inf
                    for pattern in subgraph_patterns:
                        maps = top_level_maps(state)
                        if len(maps) < 2:
                            continue

                        # descriptor -> maps with that descriptor, plus the
                        # descriptor multiset of the whole state
                        maps_desc = {}
                        state_desc = Counter()
                        for map_entry in maps:
                            map_desc = OnTheFlyMapFusionTuner.map_descriptor(
                                state, map_entry)
                            state_desc[map_desc] += 1
                            maps_desc.setdefault(map_desc,
                                                 []).append(map_entry)

                        # the pattern must be contained in the state
                        if any(key not in state_desc
                               or pattern[key] > state_desc[key]
                               for key in pattern):
                            continue

                        if base_runtime is None:
                            # measure the unfused state once per round
                            baseline = cutter.cutout_state(state,
                                                           *(state.nodes()),
                                                           make_copy=False)
                            baseline.start_state.instrument = dace.InstrumentationType.GPU_Events

                            base_runtime = optim_utils.subprocess_measure(
                                baseline,
                                collect_inputs(baseline),
                                i=192,
                                j=192)
                            best_pattern_runtime = base_runtime
                            if base_runtime == math.inf:
                                break

                        # Construct subgraph greedily
                        subgraph_maps = []
                        for desc in pattern:
                            subgraph_maps.extend(
                                maps_desc[desc][:pattern[desc]])

                        # Resolve the chosen maps to node ids on the shared
                        # cutout, then experiment on a deep copy so the real
                        # state stays untouched.
                        experiment_sdfg_ = cutter.cutout_state(
                            state, *(state.nodes()), make_copy=False)
                        experiment_state_ = experiment_sdfg_.start_state
                        experiment_maps_ids = [
                            experiment_state_.node_id(me)
                            for me in subgraph_maps
                        ]
                        experiment_sdfg = copy.deepcopy(experiment_sdfg_)
                        experiment_state = experiment_sdfg.start_state
                        experiment_state.instrument = dace.InstrumentationType.GPU_Events

                        experiment_maps = [
                            experiment_state.node(m_id)
                            for m_id in experiment_maps_ids
                        ]
                        experiment_subgraph = helpers.subgraph_from_maps(
                            sdfg=experiment_sdfg,
                            graph=experiment_state,
                            map_entries=experiment_maps)

                        map_fusion = sg.SubgraphOTFFusion()
                        map_fusion.setup_match(
                            experiment_subgraph, experiment_sdfg.sdfg_id,
                            experiment_sdfg.node_id(experiment_state))
                        if not map_fusion.can_be_applied(
                                experiment_state, experiment_sdfg):
                            continue

                        try:
                            experiment_fuse_counter = map_fusion.apply(
                                experiment_state, experiment_sdfg)
                        except Exception:
                            # a failing experiment only disqualifies this
                            # pattern
                            continue

                        if experiment_fuse_counter == 0:
                            continue

                        fused_runtime = optim_utils.subprocess_measure(
                            experiment_sdfg,
                            collect_inputs(experiment_sdfg),
                            i=192,
                            j=192)
                        if fused_runtime >= best_pattern_runtime:
                            continue

                        best_pattern = subgraph_maps
                        best_pattern_runtime = fused_runtime

                    if best_pattern is None:
                        # nothing beats the baseline any more
                        break

                    # apply the winning pattern to the real state and retry
                    subgraph = helpers.subgraph_from_maps(
                        sdfg=nsdfg, graph=state, map_entries=best_pattern)
                    map_fusion = sg.SubgraphOTFFusion()
                    map_fusion.setup_match(subgraph, nsdfg.sdfg_id,
                                           nsdfg.node_id(state))
                    map_fusion.apply(state, nsdfg)
示例#10
0
def greedy_fuse(graph_or_subgraph: GraphViewType,
                validate_all: bool,
                device: dace.dtypes.DeviceType = dace.dtypes.DeviceType.CPU,
                recursive: bool = True,
                stencil: bool = False,
                stencil_tile=None,
                permutations_only: bool = True,
                expand_reductions: bool = False) -> None:
    '''
    Greedily fuses maps of an SDFG or graph, operating in-place.
    :param graph_or_subgraph: SDFG, SDFGState or Subgraph
    :param validate_all: Validate SDFG or graph at each fusion step 
    :param device: Device type to specialize for 
    :param recursive: Fuse recursively within (fused and unfused) scopes
    :param stencil: Perform stencil fusion instead of regular fusion 
    :param stencil_tile: StencilTiling Tile size, default if None
    :param permutations_only: Disallow splitting of maps during MultiExpansion stage
    :param expand_reductions: Expand all reduce nodes before fusion
    '''
    debugprint = config.Config.get_bool('debugprint')

    # options handed on unchanged to every recursive invocation
    forwarded = dict(validate_all=validate_all,
                     device=device,
                     recursive=recursive,
                     stencil=stencil,
                     stencil_tile=stencil_tile,
                     permutations_only=permutations_only,
                     expand_reductions=expand_reductions)

    if isinstance(graph_or_subgraph, SDFG):
        # If we have an SDFG, recurse into graphs
        graph_or_subgraph.simplify(validate_all=validate_all)
        # MapFusion for trivial cases
        graph_or_subgraph.apply_transformations_repeated(
            MapFusion, validate_all=validate_all)
        for state in graph_or_subgraph.nodes():
            greedy_fuse(state, **forwarded)
        return

    # we are in graph or subgraph
    if isinstance(graph_or_subgraph, SDFGState):
        sdfg = graph_or_subgraph.parent
        sdfg.apply_transformations_repeated(MapFusion,
                                            validate_all=validate_all)
        graph = graph_or_subgraph
        subgraph = SubgraphView(graph, graph.nodes())
    else:
        sdfg = graph_or_subgraph.graph.parent
        graph = graph_or_subgraph.graph
        subgraph = graph_or_subgraph

    # create condition function object
    fusion_condition = CompositeFusion(SubgraphView(graph, graph.nodes()))

    # within SDFGState: greedily enumerate fusible components
    # and apply transformation
    applied_transformations = 0

    if stencil:
        # adjust tiling settings
        fusion_condition.allow_tiling = True
        fusion_condition.schedule_innermaps = dtypes.ScheduleType.Sequential
        if device == dtypes.DeviceType.GPU:
            fusion_condition.stencil_unroll_loops = True
        # tile size
        if stencil_tile:
            fusion_condition.stencil_strides = stencil_tile
        # always only permutate for now with stencil tiles
        fusion_condition.expansion_split = False

    else:
        fusion_condition.allow_tiling = False
        # expand reductions
        if expand_reductions:
            # Uses its own loop variable: `graph` must keep referring to the
            # state being fused, it is needed again further down.
            for reduce_state in sdfg.nodes():
                for node in reduce_state.nodes():
                    if isinstance(node,
                                  dace.libraries.standard.nodes.Reduce):
                        try:
                            ReduceExpansion.apply_to(sdfg, reduce=node)
                        except ValueError:
                            # best effort: this reduction stays unexpanded
                            pass
        # permutation settings
        fusion_condition.expansion_split = not permutations_only

    def condition_function(sdfg, subgraph):
        return fusion_condition.can_be_applied(sdfg, subgraph)

    enumerator = GreedyEnumerator(sdfg,
                                  graph,
                                  subgraph,
                                  condition_function=condition_function)
    for map_entries in enumerator:
        if len(map_entries) > 1:
            current_subgraph = xfsh.subgraph_from_maps(
                sdfg, graph, map_entries)
            cf = CompositeFusion(current_subgraph)
            # transfer settings
            # NOTE(review): stencil_unroll_loops is set on fusion_condition
            # for GPU but is not copied here - confirm whether intended.
            cf.allow_tiling = fusion_condition.allow_tiling
            cf.schedule_innermaps = fusion_condition.schedule_innermaps
            cf.expansion_split = fusion_condition.expansion_split
            cf.stencil_strides = fusion_condition.stencil_strides

            cf.apply(sdfg)
            applied_transformations += 1

        if recursive:
            # descend into the fused map, or into the single unfused one
            global_entry = cf._global_map_entry if len(
                map_entries) > 1 else map_entries[0]

            greedy_fuse(
                graph.scope_subgraph(global_entry,
                                     include_entry=False,
                                     include_exit=False), **forwarded)

    for node in graph_or_subgraph.nodes():
        if isinstance(node, nodes.NestedSDFG):
            greedy_fuse(node.sdfg, **forwarded)

    if applied_transformations > 0 and debugprint:
        if stencil:
            print(f"Applied {applied_transformations} TileFusion")
        else:
            print(f"Applied {applied_transformations} SubgraphFusion")

    if validate_all:
        graph.validate()
示例#11
0
    def iterator(self):
        """
        Greedily partition the maps into fusible sets and yield each set.

        Starting from the queued map that sorts first, neighbors are
        explored in queue order and added to the current set as long as the
        condition function accepts the enlarged set. Maps that were reached
        but not added become starting points for later sets.

        Yields a tuple of map entries when ``self.mode == 'map_entries'``,
        otherwise the subgraph built from the set.
        """
        if len(self._adjacency_list) == 0:
            return

        # The result is not needed, but the lookup fails loudly if no map
        # carries label 0.
        next(me for me in self._adjacency_list if self._labels[me] == 0)

        def enqueue(queue, seen, map_entry):
            # push a map onto a heap at most once
            if map_entry in seen:
                return
            heapq.heappush(
                queue,
                QueuedEntry(map_entry,
                            self._labels[map_entry],
                            reverse=self._reverse))
            seen.add(map_entry)

        # define queue / visited set which helps us find starting points
        # for the next inner iterations
        added = set()
        outer_queued = set(self._source_maps)
        outer_queue = [
            QueuedEntry(me, self._labels[me], reverse=self._reverse)
            for me in self._source_maps
        ]
        # heappop/heappush only work on a list that satisfies the heap
        # invariant, so establish it before the first pop
        heapq.heapify(outer_queue)

        while outer_queue:
            # next starting point: first queued map not yet in any set
            next_iterate = None
            while outer_queue:
                popped = heapq.heappop(outer_queue)
                if popped.map_entry not in added:
                    next_iterate = popped
                    break

            if not next_iterate:
                break

            # current iteration: queue / set used to grow this component
            current_set = set()
            inner_queue = [next_iterate]
            inner_queued = {next_iterate.map_entry}

            while inner_queue:
                current_map = heapq.heappop(inner_queue).map_entry

                # the first map is always taken; later ones only if the
                # enlarged set still passes the condition
                if current_set:
                    subgraph = helpers.subgraph_from_maps(
                        self._sdfg, self._graph, current_set | {current_map})
                    if not self._condition_function(self._sdfg, subgraph):
                        continue

                # add it to current set and continue BFS
                added.add(current_map)
                current_set.add(current_map)
                for neighbor in self._adjacency_list[current_map]:
                    if neighbor not in added:
                        enqueue(outer_queue, outer_queued, neighbor)
                        enqueue(inner_queue, inner_queued, neighbor)

            # yield
            if self.mode == 'map_entries':
                yield tuple(current_set)
            else:
                yield helpers.subgraph_from_maps(self._sdfg, self._graph,
                                                 current_set)