예제 #1
0
    def _kill_subtree(self, node: AnyTimeSplitNode):
        """ Build a fresh leaf that carries over the statistics of ``node``.

        The leaf is obtained from ``_new_learning_node`` and receives the
        observed class distribution and the attribute observers of ``node``.
        The tree itself is not modified here: putting the leaf in place of
        the subtree rooted at ``node`` is left to the caller.

        Parameters
        ----------
        node: AnyTimeSplitNode
            Root of the subtree that is being replaced.

        Returns
        -------
        learning node
            The new leaf (whatever ``_new_learning_node`` returns), holding
            the class distribution and attribute observers of ``node``.
        """
        replacement = self._new_learning_node()

        # Hand the statistics gathered at the split node over to the leaf.
        class_dist = node.get_observed_class_distribution()
        replacement.set_observed_class_distribution(class_dist)

        observers = node.get_attribute_observers()
        replacement.set_attribute_observers(observers)

        return replacement
예제 #2
0
    def _activate_learning_node(self, to_activate: AnyTimeInactiveLearningNode, parent: AnyTimeSplitNode,
                                parent_branch: int):
        """ Swap an inactive leaf for a newly created learning node.

        The new node is built by ``_new_learning_node`` from the observed
        class distribution of ``to_activate`` and is attached where
        ``to_activate`` used to be: as the tree root when ``parent`` is None,
        otherwise as child ``parent_branch`` of ``parent``. The active and
        inactive leaf counters are updated accordingly.

        Parameters
        ----------
        to_activate: AnyTimeInactiveLearningNode
            The node to activate.
        parent: AnyTimeSplitNode
            The node's parent, or None when the node is the tree root.
        parent_branch: int
            Index of the branch of ``parent`` that leads to the node.

        """
        class_dist = to_activate.get_observed_class_distribution()
        replacement = self._new_learning_node(class_dist)

        if parent is not None:
            parent.set_child(parent_branch, replacement)
        else:
            # No parent: the node being activated is the root of the tree.
            self._tree_root = replacement

        # One inactive leaf became an active one.
        self._inactive_leaf_node_cnt -= 1
        self._active_leaf_node_cnt += 1
예제 #3
0
    def _reevaluate_best_split(self, node: AnyTimeSplitNode, parent,
                               branch_index):
        """ Re-check the split installed at ``node`` against current statistics.

        Nothing is done when the class distribution observed at ``node`` is
        pure or when the node yields no split suggestions. Otherwise the
        suggestions are ranked by merit and the best one is compared, using
        the Hoeffding bound, with the "do not split" option and with the
        attribute currently tested at ``node``:

        1. If the null split beats the best suggestion by more than the
           bound, the subtree under ``node`` is replaced by a single leaf.
        2. Otherwise, if the best suggestion beats the current attribute by
           more than the bound, or the bound is below ``tie_threshold``:

           2.1 when the best suggestion tests a different attribute,
               ``node`` is replaced by a new split node that gets one new
               learning node per branch;
           2.2 when it tests the same attribute, only the split test stored
               on ``node`` is replaced.

        Whenever ``node`` is replaced (cases 1 and 2.1) the node counters
        are adjusted and ``enforce_tracker_limit`` is called.

        Parameters
        ----------
        node: AnyTimeSplitNode
            The node to reevaluate.
        parent: AnyTimeSplitNode
            The node's parent, or None when ``node`` is the tree root.
        branch_index: int
            Index of the branch of ``parent`` that leads to ``node``.

        Returns
        -------
        boolean
            Flag to stop moving in depth: True when ``node`` was replaced
            (cases 1 and 2.1), False otherwise.
        """
        # Nothing to reconsider while every sample seen here shares a class.
        if node.observed_class_distribution_is_pure():
            return False

        # Any setting other than GINI_SPLIT falls back to information gain.
        if self._split_criterion == GINI_SPLIT:
            criterion = GiniSplitCriterion()
        else:
            criterion = InfoGainSplitCriterion()

        candidates = node.get_best_split_suggestions(criterion, self)
        if len(candidates) == 0:
            return False

        # Ascending in-place sort: the strongest candidate ends up last.
        candidates.sort(key=attrgetter('merit'))
        x_best = candidates[-1]
        id_best = x_best.split_test.get_atts_test_depends_on()[0]

        # Candidate belonging to the attribute this split node tests today.
        id_current = node.get_split_test().get_atts_test_depends_on()[0]
        x_current = node.find_attribute(id_current, candidates)

        # "Do not split" option; a merit of -infinity is treated as 0.
        x_null = node.get_null_split(criterion)
        if x_null.merit == -np.inf:
            x_null.merit = 0.0

        merit_range = criterion.get_range_of_merit(
            node.get_observed_class_distribution())
        hoeffding_bound = self.compute_hoeffding_bound(
            merit_range, self.split_confidence, node.get_weight_seen())

        # Case 1: not splitting is significantly better -> collapse to a leaf.
        if x_null.merit - x_best.merit > hoeffding_bound:
            collapsed_leaf = self._kill_subtree(node)

            if parent is not None:
                parent.set_child(branch_index, collapsed_leaf)
            else:
                # Root case: the new leaf becomes the whole tree.
                self._tree_root = collapsed_leaf

            # count_nodes() is indexed as [0] = decision nodes, [1] = leaves.
            removed = node.count_nodes()
            self._active_leaf_node_cnt += 1
            self._active_leaf_node_cnt -= removed[1]
            self._decision_node_cnt -= removed[0]

            # Manage memory
            self.enforce_tracker_limit()
            return True

        # Case 2 applies only when the best candidate clearly beats the
        # current attribute, or when the bound is small enough for a tie.
        best_beats_current = (
            x_best.merit - x_current.merit > hoeffding_bound
            or hoeffding_bound < self.tie_threshold)
        if not best_beats_current:
            return False

        # Case 2.1: a different attribute wins -> rebuild this split node.
        if id_current != id_best:
            replacement = self.new_split_node(
                x_best.split_test,
                node.get_observed_class_distribution(),
                node.get_attribute_observers())
            replacement.update_weight_seen_at_last_split_reevaluation()

            # Each branch of the new split starts from a new learning node.
            for branch in range(x_best.num_splits()):
                branch_leaf = self._new_learning_node(
                    x_best.resulting_class_distribution_from_split(branch))
                replacement.set_child(branch, branch_leaf)

            # Drop the old subtree from the counters, then add the new nodes.
            removed = node.count_nodes()
            self._active_leaf_node_cnt -= removed[1]
            self._decision_node_cnt -= removed[0]
            self._decision_node_cnt += 1
            self._active_leaf_node_cnt += x_best.num_splits()

            if parent is not None:
                parent.set_child(branch_index, replacement)
            else:
                # Root case: the new split node becomes the tree root.
                self._tree_root = replacement

            # Manage memory
            self.enforce_tracker_limit()
            return True

        # Case 2.2: same attribute -> only the split test is replaced.
        if id_current == id_best:
            node._split_test = x_best.split_test

        return False
예제 #4
0
 def new_split_node(self, split_test, class_observations,
                    attribute_observers):
     """ Create a new split node.

     Parameters
     ----------
     split_test
         Split test passed as the first argument to ``AnyTimeSplitNode``.
     class_observations
         Class observations passed as the second argument.
     attribute_observers
         Attribute observers passed as the third argument.

     Returns
     -------
     AnyTimeSplitNode
         The newly constructed split node.
     """
     split_node = AnyTimeSplitNode(split_test, class_observations,
                                   attribute_observers)
     return split_node