Пример #1
0
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(
            self._min_frac_weight_for_two_branches_gain)
        # self._selected_split_metric = self.GINI_SPLIT
        # self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves
        self._print_leaf_models = False
Пример #2
0
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(
            self._min_frac_weight_for_two_branches_gain)
        #self._selected_split_metric = self.GINI_SPLIT
        #self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves
        self._print_leaf_models = False

        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            #filename='../log/tmp.log',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filemode='w')
        self.logger = logging.getLogger("vfdt")  #self.name
        # 定义一个FileHandler,将INFO级别或更高的日志信息记录到log文件,并将其添加到当前的日志处理对象#
        # fh = logging.FileHandler('../log/' + "vfdt" + '.log', mode='w')# self.name
        # formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
        # fh.setFormatter(formatter)
        # fh.setLevel(logging.INFO)
        # self.logger.addHandler(fh)
        # 定义一个StreamHandler,将INFO级别或更高的日志信息打印到标准错误,并将其添加到当前的日志处理对象#
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(message)s')  # ('%(name)-12s: %(levelname)-8s )
        console.setFormatter(formatter)

        self.logger.addHandler(console)
Пример #3
0
    def build_classifier(self, dataset, testdataset=None):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        #日志部分
        self.logger.info(
            "Start build classifier--------------------------\n"
            "grace_period = {},\n"
            "split_confidence = {},\n"
            "hoeffding_tie_threshold = {},\n"
            "min_frac_weight_for_two_branches_gain = {}\n"
            "--------------------------------------------------".format(
                self._grace_period, self._split_confidence,
                self._hoeffding_tie_threshold,
                self._min_frac_weight_for_two_branches_gain))

        #训练部分
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(
                self._min_frac_weight_for_two_branches_gain)

        count = 0
        test_epoch = 100
        test_acc_list = []
        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))
            count += 1
            if count % test_epoch == 0 and testdataset != None:
                acc = self.valuate_acc(testdataset)
                test_acc_list.append(acc)

        #画图部分
        if testdataset != None:
            x_axis = range(len(test_acc_list))
            plt.plot(
                x_axis, test_acc_list,
                label='test_acc')  # Plot some data on the (implicit) axes.
            plt.xlabel('iter')
            plt.ylabel('acc')
            figpath = './respic/' + 'acc.jpg'
            plt.title(figpath.split('/')[-1])
            plt.legend()
            plt.savefig(figpath)
            plt.close('all')
Пример #4
0
    def build_classifier(self, dataset):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(self._min_frac_weight_for_two_branches_gain)

        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))
Пример #5
0
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(self._min_frac_weight_for_two_branches_gain)
        #self._selected_split_metric = self.GINI_SPLIT
        #self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves 
        self._print_leaf_models = False
Пример #6
0
    def build_classifier(self, dataset):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(
                self._min_frac_weight_for_two_branches_gain)

        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))
Пример #7
0
class HoeffdingTree(object):
    """Main class for a Hoeffding Tree, also known as Very Fast Decision Tree (VFDT)."""
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(self._min_frac_weight_for_two_branches_gain)
        #self._selected_split_metric = self.GINI_SPLIT
        #self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves 
        self._print_leaf_models = False

    def __str__(self):
        if self._root is None:
            return 'No model built yet!'
        return self._root.__str__(self._print_leaf_models)

    def reset(self):
        """Reset the classifier and set all node/leaf counters to zero."""
        self._root = None
        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

    def set_minimum_fraction_of_weight_info_gain(self, m):
        self._min_frac_weight_for_two_branches_gain = m

    def get_minimum_fraction_of_weight_info_gain(self):
        return self._min_frac_weight_for_two_branches_gain

    def set_grace_period(self, grace):
        self._grace_period = grace

    def get_grace_period(self):
        return self._grace_period

    def set_hoeffding_tie_threshold(self, ht):
        self._hoeffding_tie_threshold = ht

    def get_hoeffding_tie_threshold(self):
        return self._hoeffding_tie_threshold

    def set_split_confidence(self, sc):
        self._split_confidence = sc

    def get_split_confidence(self):
        return self._split_confidence

    def compute_hoeffding_bound(self, max_value, confidence, weight):
        """Calculate the Hoeffding bound.

        Args:
            max_value (float): 
            confidence (float):
            weight (float): 

        Returns:
            (float): The Hoeffding bound.
        """
        return math.sqrt(((max_value * max_value) * math.log(1.0 / confidence)) / (2.0 * weight))

    def build_classifier(self, dataset):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(self._min_frac_weight_for_two_branches_gain)

        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))

    def update_classifier(self, instance):
        """Update the classifier with the given instance.

        Args:
            instance (Instance): The new instance to be used to train the classifier.
        """
        if instance.class_is_missing():
            return
        if self._root is None:
            self._root = self.new_learning_node()

        l = self._root.leaf_for_instance(instance, None, None)
        actual_node = l.the_node
        if actual_node is None:
            actual_node = ActiveHNode()
            l.parent_node.set_child(l.parent_branch, actual_node)

        # ActiveHNode should be changed to a LearningNode interface if Naive Bayes nodes are used
        if isinstance(actual_node, InactiveHNode):
            actual_node.update_node(instance)
        if isinstance(actual_node, ActiveHNode):
            actual_node.update_node(instance)
            total_weight = actual_node.total_weight()
            if total_weight - actual_node.weight_seen_at_last_split_eval > self._grace_period:
                self.try_split(actual_node, l.parent_node, l.parent_branch)
                actual_node.weight_seen_at_last_split_eval = total_weight

    def distribution_for_instance(self, instance):
        """Return the class probabilities for an instance.

        Args:
            instance (Instance): The instance to calculate the class probabilites for.

        Returns:
            list[float]: The class probabilities.
        """
        class_attribute = instance.class_attribute()
        pred = []

        if self._root is not None:
            l = self._root.leaf_for_instance(instance, None, None)
            actual_node = l.the_node
            if actual_node is None:
                actual_node = l.parent_node
            pred = actual_node.get_distribution(instance, class_attribute)
        else:
            # All class values equally likely
            pred = [1 for i in range(class_attribute.num_values())]
            utils.normalize(pred)

        return pred


    def deactivate_node(self, to_deactivate, parent, parent_branch):
        """Prevent supplied node of growing.

        Args:
            to_deactivate (ActiveHNode): The node to be deactivated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = InactiveHNode(to_deactivate.class_distribution)

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count -= 1
        self._inactive_leaf_count += 1

    def activate_node(self, to_activate, parent, parent_branch):
        """Allow supplied node to grow.

        Args:
            to_activate (InactiveHNode): The node to be activated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = ActiveHNode()
        leaf.class_distribution = to_activate.class_distribution

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count += 1
        self._inactive_leaf_count -= 1

    def try_split(self, node, parent, parent_branch):
        """Try a split from the supplied node.

        Args:
            node (ActiveHNode): The node to split.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        # Non-pure?
        if node.num_entries_in_class_distribution() > 1:
            best_splits = node.get_possible_splits(self._split_metric)
            best_splits.sort(key=attrgetter('split_merit'))

            do_split = False
            if len(best_splits) < 2:
                do_split = len(best_splits) > 0
            else:
                # Compute Hoeffding bound
                metric_max = self._split_metric.get_metric_range(node.class_distribution)
                hoeffding_bound = self.compute_hoeffding_bound(
                    metric_max, self._split_confidence, node.total_weight())
                best = best_splits[len(best_splits) - 1]
                second_best = best_splits[len(best_splits) - 2]
                if best.split_merit - second_best.split_merit > hoeffding_bound or hoeffding_bound < self._hoeffding_tie_threshold:
                    do_split = True

            if do_split:
                best = best_splits[len(best_splits) - 1]
                if best.split_test is None:
                    # preprune
                    self.deactivate_node(node, parent, parent_branch)
                else:
                    new_split = SplitNode(node.class_distribution, best.split_test)

                    for i in range(best.num_splits()):
                        new_child = self.new_learning_node()
                        new_child.class_distribution = best.post_split_class_distributions[i]
                        new_child.weight_seen_at_last_split_eval = new_child.total_weight()
                        branch_name = ''
                        if self._header.attribute(name=best.split_test.split_attributes()[0]).is_numeric():
                            if i is 0:
                                branch_name = 'left'
                            else:
                                branch_name = 'right'
                        else:
                            split_attribute = self._header.attribute(name=best.split_test.split_attributes()[0])
                            branch_name = split_attribute.value(i)
                        new_split.set_child(branch_name, new_child)

                    self._active_leaf_count -= 1
                    self._decision_node_count += 1
                    self._active_leaf_count += best.num_splits()

                    if parent is None:
                        self._root = new_split
                    else:
                        parent.set_child(parent_branch, new_split)

    def new_learning_node(self):
        """Create a new learning node. Will always be an ActiveHNode while Naive Bayes
        nodes are not implemented.

        Returns:
            ActiveHNode: The new learning node.
        """
        # Leaf strategy should be handled here if/when the Naive Bayes approach is implemented
        return ActiveHNode()
Пример #8
0
class HoeffdingTree(object):
    """Main class for a Hoeffding Tree, also known as Very Fast Decision Tree (VFDT)."""
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(
            self._min_frac_weight_for_two_branches_gain)
        # self._selected_split_metric = self.GINI_SPLIT
        # self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves
        self._print_leaf_models = False

    def __str__(self):
        if self._root is None:
            return 'No model built yet!'
        return self._root.__str__(self._print_leaf_models)

    def reset(self):
        """Reset the classifier and set all node/leaf counters to zero."""
        self._root = None
        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

    def set_minimum_fraction_of_weight_info_gain(self, m):
        self._min_frac_weight_for_two_branches_gain = m

    def get_minimum_fraction_of_weight_info_gain(self):
        return self._min_frac_weight_for_two_branches_gain

    def set_grace_period(self, grace):
        self._grace_period = grace

    def get_grace_period(self):
        return self._grace_period

    def set_hoeffding_tie_threshold(self, ht):
        self._hoeffding_tie_threshold = ht

    def get_hoeffding_tie_threshold(self):
        return self._hoeffding_tie_threshold

    def set_split_confidence(self, sc):
        self._split_confidence = sc

    def get_split_confidence(self):
        return self._split_confidence

    def compute_hoeffding_bound(self, max_value, confidence, weight):
        """Calculate the Hoeffding bound.

        Args:
            max_value (float): 
            confidence (float):
            weight (float): 

        Returns:
            (float): The Hoeffding bound.
        """
        return math.sqrt(
            ((max_value * max_value) * math.log(1.0 / confidence)) /
            (2.0 * weight))

    def build_classifier(self, dataset):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(
                self._min_frac_weight_for_two_branches_gain)

        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))

    def update_classifier(self, instance):
        """Update the classifier with the given instance.

        Args:
            instance (Instance): The new instance to be used to train the classifier.
        """
        if instance.class_is_missing():
            return
        if self._root is None:
            self._root = self.new_learning_node()

        l = self._root.leaf_for_instance(instance, None, None)
        actual_node = l.the_node
        if actual_node is None:
            actual_node = ActiveHNode()
            l.parent_node.set_child(l.parent_branch, actual_node)

        # ActiveHNode should be changed to a LearningNode interface if Naive Bayes nodes are used
        if isinstance(actual_node, InactiveHNode):
            actual_node.update_node(instance)
        if isinstance(actual_node, ActiveHNode):
            actual_node.update_node(instance)
            total_weight = actual_node.total_weight()
            if total_weight - actual_node.weight_seen_at_last_split_eval > self._grace_period:
                self.try_split(actual_node, l.parent_node, l.parent_branch)
                actual_node.weight_seen_at_last_split_eval = total_weight

    def distribution_for_instance(self, instance):
        """Return the class probabilities for an instance.

        Args:
            instance (Instance): The instance to calculate the class probabilites for.

        Returns:
            list[float]: The class probabilities.
        """
        class_attribute = instance.class_attribute()
        pred = []

        if self._root is not None:
            l = self._root.leaf_for_instance(instance, None, None)
            actual_node = l.the_node
            if actual_node is None:
                actual_node = l.parent_node
            pred = actual_node.get_distribution(instance, class_attribute)
        else:
            # All class values equally likely
            pred = [1 for i in range(class_attribute.num_values())]
            utils.normalize(pred)

        return pred

    def deactivate_node(self, to_deactivate, parent, parent_branch):
        """Prevent supplied node of growing.

        Args:
            to_deactivate (ActiveHNode): The node to be deactivated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = InactiveHNode(to_deactivate.class_distribution)

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count -= 1
        self._inactive_leaf_count += 1

    def activate_node(self, to_activate, parent, parent_branch):
        """Allow supplied node to grow.

        Args:
            to_activate (InactiveHNode): The node to be activated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = ActiveHNode()
        leaf.class_distribution = to_activate.class_distribution

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count += 1
        self._inactive_leaf_count -= 1

    def try_split(self, node, parent, parent_branch):
        """Try a split from the supplied node.

        Args:
            node (ActiveHNode): The node to split.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        # Non-pure?
        if node.num_entries_in_class_distribution() > 1:
            best_splits = node.get_possible_splits(self._split_metric)
            best_splits.sort(key=attrgetter('split_merit'))

            do_split = False
            if len(best_splits) < 2:
                do_split = len(best_splits) > 0
            else:
                # Compute Hoeffding bound
                metric_max = self._split_metric.get_metric_range(
                    node.class_distribution)
                hoeffding_bound = self.compute_hoeffding_bound(
                    metric_max, self._split_confidence, node.total_weight())
                best = best_splits[len(best_splits) - 1]
                second_best = best_splits[len(best_splits) - 2]
                if best.split_merit - second_best.split_merit > hoeffding_bound or hoeffding_bound < self._hoeffding_tie_threshold:
                    do_split = True

            if do_split:
                best = best_splits[len(best_splits) - 1]
                if best.split_test is None:
                    # preprune
                    self.deactivate_node(node, parent, parent_branch)
                else:
                    new_split = SplitNode(node.class_distribution,
                                          best.split_test)

                    for i in range(best.num_splits()):
                        new_child = self.new_learning_node()
                        new_child.class_distribution = best.post_split_class_distributions[
                            i]
                        new_child.weight_seen_at_last_split_eval = new_child.total_weight(
                        )
                        branch_name = ''
                        if self._header.attribute(
                                name=best.split_test.split_attributes()
                            [0]).is_numeric():
                            if i is 0:
                                branch_name = 'left'
                            else:
                                branch_name = 'right'
                        else:
                            split_attribute = self._header.attribute(
                                name=best.split_test.split_attributes()[0])
                            branch_name = split_attribute.value(i)
                        new_split.set_child(branch_name, new_child)

                    self._active_leaf_count -= 1
                    self._decision_node_count += 1
                    self._active_leaf_count += best.num_splits()

                    if parent is None:
                        self._root = new_split
                    else:
                        parent.set_child(parent_branch, new_split)

    def new_learning_node(self):
        """Create a new learning node. Will always be an ActiveHNode while Naive Bayes
        nodes are not implemented.

        Returns:
            ActiveHNode: The new learning node.
        """
        # Leaf strategy should be handled here if/when the Naive Bayes approach is implemented
        return ActiveHNode()
Пример #9
0
class HoeffdingTree(object):
    """Main class for a Hoeffding Tree, also known as Very Fast Decision Tree (VFDT)."""
    def __init__(self):
        self._header = None
        self._root = None
        self._grace_period = 200
        self._split_confidence = 0.0000001
        self._hoeffding_tie_threshold = 0.05
        self._min_frac_weight_for_two_branches_gain = 0.01

        # Split metric stuff goes here
        self.GINI_SPLIT = 0
        self.INFO_GAIN_SPLIT = 1

        self._selected_split_metric = self.INFO_GAIN_SPLIT
        self._split_metric = InfoGainSplitMetric(
            self._min_frac_weight_for_two_branches_gain)
        #self._selected_split_metric = self.GINI_SPLIT
        #self._split_metric = GiniSplitMetric()

        # Leaf prediction strategy stuff goes here

        # Only used when the leaf prediction strategy is baded on Naive Bayes, not useful right now
        #self._nb_threshold = 0

        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

        # Print out leaf models in the case of naive Bayes or naive Bayes adaptive leaves
        self._print_leaf_models = False

        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            #filename='../log/tmp.log',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filemode='w')
        self.logger = logging.getLogger("vfdt")  #self.name
        # 定义一个FileHandler,将INFO级别或更高的日志信息记录到log文件,并将其添加到当前的日志处理对象#
        # fh = logging.FileHandler('../log/' + "vfdt" + '.log', mode='w')# self.name
        # formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
        # fh.setFormatter(formatter)
        # fh.setLevel(logging.INFO)
        # self.logger.addHandler(fh)
        # 定义一个StreamHandler,将INFO级别或更高的日志信息打印到标准错误,并将其添加到当前的日志处理对象#
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        formatter = logging.Formatter(
            '%(message)s')  # ('%(name)-12s: %(levelname)-8s )
        console.setFormatter(formatter)

        self.logger.addHandler(console)

    def __str__(self):

        if self._root is None:
            return 'No model built yet!'
        return self._root.__str__(self._print_leaf_models)

    def reset(self):
        """Reset the classifier and set all node/leaf counters to zero."""
        self._root = None
        self._active_leaf_count = 0
        self._inactive_leaf_count = 0
        self._decision_node_count = 0

    def set_minimum_fraction_of_weight_info_gain(self, m):
        self._min_frac_weight_for_two_branches_gain = m

    def get_minimum_fraction_of_weight_info_gain(self):
        return self._min_frac_weight_for_two_branches_gain

    def set_grace_period(self, grace):
        self._grace_period = grace

    def get_grace_period(self):
        return self._grace_period

    def set_hoeffding_tie_threshold(self, ht):
        self._hoeffding_tie_threshold = ht

    def get_hoeffding_tie_threshold(self):
        return self._hoeffding_tie_threshold

    def set_split_confidence(self, sc):
        self._split_confidence = sc

    def get_split_confidence(self):
        return self._split_confidence

    def compute_hoeffding_bound(self, max_value, confidence, weight):
        """Calculate the Hoeffding bound.

        Args:
            max_value (float): 
            confidence (float):
            weight (float): 

        Returns:
            (float): The Hoeffding bound.
        """
        return math.sqrt(
            ((max_value * max_value) * math.log(1.0 / confidence)) /
            (2.0 * weight))

    def build_classifier(self, dataset, testdataset=None):
        """Build the classifier.

        Args:
            dataset (Dataset): The data to start training the classifier.
        """
        #日志部分
        self.logger.info(
            "Start build classifier--------------------------\n"
            "grace_period = {},\n"
            "split_confidence = {},\n"
            "hoeffding_tie_threshold = {},\n"
            "min_frac_weight_for_two_branches_gain = {}\n"
            "--------------------------------------------------".format(
                self._grace_period, self._split_confidence,
                self._hoeffding_tie_threshold,
                self._min_frac_weight_for_two_branches_gain))

        #训练部分
        self.reset()
        self._header = dataset
        if self._selected_split_metric is self.GINI_SPLIT:
            self._split_metric = GiniSplitMetric()
        else:
            self._split_metric = InfoGainSplitMetric(
                self._min_frac_weight_for_two_branches_gain)

        count = 0
        test_epoch = 100
        test_acc_list = []
        for i in range(dataset.num_instances()):
            self.update_classifier(dataset.instance(i))
            count += 1
            if count % test_epoch == 0 and testdataset != None:
                acc = self.valuate_acc(testdataset)
                test_acc_list.append(acc)

        #画图部分
        if testdataset != None:
            x_axis = range(len(test_acc_list))
            plt.plot(
                x_axis, test_acc_list,
                label='test_acc')  # Plot some data on the (implicit) axes.
            plt.xlabel('iter')
            plt.ylabel('acc')
            figpath = './respic/' + 'acc.jpg'
            plt.title(figpath.split('/')[-1])
            plt.legend()
            plt.savefig(figpath)
            plt.close('all')

    def update_classifier(self, instance):
        """Update the classifier with the given instance.

        Args:
            instance (Instance): The new instance to be used to train the classifier.
        """
        if instance.class_is_missing():
            return
        if self._root is None:
            self._root = self.new_learning_node()

        l = self._root.leaf_for_instance(instance, None, None)

        actual_node = l.the_node
        if actual_node is None:
            actual_node = ActiveHNode()
            l.parent_node.set_child(l.parent_branch, actual_node)

        # ActiveHNode should be changed to a LearningNode interface if Naive Bayes nodes are used
        if isinstance(actual_node, InactiveHNode):
            actual_node.update_node(instance)
        if isinstance(actual_node, ActiveHNode):
            actual_node.update_node(instance)
            total_weight = actual_node.total_weight()
            if total_weight - actual_node.weight_seen_at_last_split_eval > self._grace_period:
                self.try_split(actual_node, l.parent_node, l.parent_branch)
                actual_node.weight_seen_at_last_split_eval = total_weight

    def distribution_for_instance(self, instance):
        """Return the class probabilities for an instance.

        Args:
            instance (Instance): The instance to calculate the class probabilites for.

        Returns:
            list[float]: The class probabilities.
        """
        class_attribute = instance.class_attribute()
        pred = []

        if self._root is not None:
            l = self._root.leaf_for_instance(instance, None, None)
            actual_node = l.the_node
            if actual_node is None:
                actual_node = l.parent_node
            pred = actual_node.get_distribution(instance, class_attribute)
        else:
            # All class values equally likely
            pred = [1 for i in range(class_attribute.num_values())]
            pred = utils.normalize(pred)

        return pred

    def deactivate_node(self, to_deactivate, parent, parent_branch):
        """Prevent supplied node of growing.

        Args:
            to_deactivate (ActiveHNode): The node to be deactivated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = InactiveHNode(to_deactivate.class_distribution)

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count -= 1
        self._inactive_leaf_count += 1

    def activate_node(self, to_activate, parent, parent_branch):
        """Allow supplied node to grow.

        Args:
            to_activate (InactiveHNode): The node to be activated.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        leaf = ActiveHNode()
        leaf.class_distribution = to_activate.class_distribution

        if parent is None:
            self._root = leaf
        else:
            parent.set_child(parent_branch, leaf)

        self._active_leaf_count += 1
        self._inactive_leaf_count -= 1

    def try_split(self, node, parent, parent_branch):
        """Try a split from the supplied node.

        Args:
            node (ActiveHNode): The node to split.
            parent (SplitNode): The parent of the node.
            parent_branch (str): The branch leading from the parent to the node.
        """
        # Non-pure?
        if node.num_entries_in_class_distribution() > 1:
            best_splits = node.get_possible_splits(self._split_metric)
            best_splits.sort(key=attrgetter('split_merit'))

            do_split = False
            if len(best_splits) < 2:
                do_split = len(best_splits) > 0
            else:
                # Compute Hoeffding bound
                metric_max = self._split_metric.get_metric_range(
                    node.class_distribution)
                hoeffding_bound = self.compute_hoeffding_bound(
                    metric_max, self._split_confidence, node.total_weight())
                best = best_splits[len(best_splits) - 1]
                second_best = best_splits[len(best_splits) - 2]
                #这里是利用最优解和第二优解比较hoeffding_bound    或者当hoeffding_bound小于设定值 直接分裂
                if best.split_merit - second_best.split_merit > hoeffding_bound or hoeffding_bound < self._hoeffding_tie_threshold:
                    do_split = True

            if do_split:
                best = best_splits[len(best_splits) - 1]
                if best.split_test is None:
                    # preprune
                    self.deactivate_node(node, parent, parent_branch)
                else:
                    new_split = SplitNode(node.class_distribution,
                                          best.split_test)

                    for i in range(best.num_splits()):
                        new_child = self.new_learning_node()
                        new_child.class_distribution = best.post_split_class_distributions[
                            i]
                        new_child.weight_seen_at_last_split_eval = new_child.total_weight(
                        )
                        branch_name = ''
                        if self._header.attribute(
                                name=best.split_test.split_attributes()
                            [0]).is_numeric():
                            if i is 0:
                                branch_name = 'left'
                            else:
                                branch_name = 'right'
                        else:
                            split_attribute = self._header.attribute(
                                name=best.split_test.split_attributes()[0])
                            branch_name = split_attribute.value(i)
                        new_split.set_child(branch_name, new_child)

                    self._active_leaf_count -= 1
                    self._decision_node_count += 1
                    self._active_leaf_count += best.num_splits()

                    if parent is None:
                        self._root = new_split
                    else:
                        parent.set_child(parent_branch, new_split)

    def new_learning_node(self):
        """Create a new learning node. Will always be an ActiveHNode while Naive Bayes
        nodes are not implemented.

        Returns:
            ActiveHNode: The new learning node.
        """
        # Leaf strategy should be handled here if/when the Naive Bayes approach is implemented
        return ActiveHNode()

    #自己加的方法直接通过结点预测
    def predict(self, instance):
        """Predict the instance with the classifier.

        Args:
            instance (Instance): The new instance .
        """
        # if instance.class_is_missing():
        #     return
        if self._root is None:
            return None

        l = self._root.leaf_for_instance(instance, None, None)
        actual_node = l.the_node
        if actual_node is None:
            return None
        return actual_node.predict()

    #输入数据集 利用predict方法计算准确率
    def valuate_acc(self, test_data):
        """

        :param test_data:
        :return:
        """
        count = 0
        for i in range(test_data.num_instances()):
            label = test_data.class_attribute().value(
                test_data.instance(i).class_value())
            pre = self.predict(test_data.instance(i))
            if label == pre:
                count += 1
        accuracy = count / test_data.num_instances()
        self.logger.info("test accucy:{}".format(count /
                                                 test_data.num_instances()))
        return accuracy

    def dump_mode(self, path='./mode/tmp.vfdt'):
        self.logger.info("mode was saved in :{}".format(path))
        self.logger.removeHandler(self.logger.handlers[-1])
        #        self.logger.removeHandler(self.logger.handlers[-1])
        with open(path, 'wb') as f:
            pickle.dump(self, f)