Пример #1
0
    def eval_tree(self, t_id):
        """Evaluate tree ``t_id`` on ``self.test`` and record the outcome.

        The tree's selected rules are applied level by level; the rows a
        level leaves undecided become the input of the next level, and the
        per-level TP/FP/TN/FN counts are added up. The totals, followed by
        the seven values returned by ``get_performance``, are stored in
        ``self.performance_on_test[t_id]``. A summary with the keys
        ``"Accuracy"``, ``"Dist2Heaven"`` and ``"LOC_AUC"`` (the negated
        value of ``get_tree_loc_auc``) is stored in ``self.results[t_id]``.

        Nothing happens when ``self.performance_on_test[t_id]`` is already
        truthy.

        :param t_id: tree id
        :return: None
        """
        if self.performance_on_test[t_id]:
            return

        deepest = self.tree_depths[t_id]
        # One fresh, empty description list per level of this tree.
        self.node_descriptions[t_id] = [[] for _ in range(deepest + 1)]

        tp_sum, fp_sum, tn_sum, fn_sum = 0, 0, 0, 0
        remaining = self.test
        for level in range(deepest + 1):
            rule = self.selected[t_id][level]
            cue, direction, threshold, decision = rule
            left_over, metrics, _ = self.eval_decision(
                remaining, cue, direction, threshold, decision)
            tp, fp, tn, fn = self.update_metrics(
                level, deepest, decision, metrics)
            tp_sum = tp_sum + tp
            fp_sum = fp_sum + fp
            tn_sum = tn_sum + tn
            fn_sum = fn_sum + fn
            # Every row has been decided: deeper levels have nothing to do.
            if len(left_over) == 0:
                break
            remaining = left_over

        pre, rec, spec, fpr, npv, acc, f1 = get_performance(
            [tp_sum, fp_sum, tn_sum, fn_sum])
        record = [
            tp_sum, fp_sum, tn_sum, fn_sum,
            pre, rec, spec, fpr, npv, acc, f1,
        ]
        self.performance_on_test[t_id] = record

        # The first four entries of the record are the confusion counts.
        dist2heaven = get_score("Dist2Heaven", record[:4])
        loc_auc = -self.get_tree_loc_auc(self.test, t_id)

        summary = {}
        summary["Accuracy"] = record[9]
        summary["Dist2Heaven"] = dist2heaven
        summary["LOC_AUC"] = loc_auc
        self.results[t_id] = summary
Пример #2
0
 def growEven(self, data, t_id, level, cur_performance):
     """Grow level ``level`` of tree ``t_id`` and recurse into the next one.

     The rule for this level is taken from ``self.computed_cache`` (keyed
     by the tree's structure prefix up to and including this level) or,
     when no truthy entry exists, obtained from
     ``self.call_eval_point_split`` and then cached. The rule and its
     training performance are stored for tree ``t_id`` and, for levels
     below 3, also copied to tree ``t_id + 1``. The selection made at
     level 2 is additionally kept in ``self.store_cur_selected``.

     :param data: rows still undecided when this level is reached
     :param t_id: tree id
     :param level: level id
     :param cur_performance: metrics handed down from the level above
     :return: None
     """
     if level >= self.max_depth:
         return
     if len(data) == 0:
         print("No data")
         return

     self.tree_depths[t_id] = level
     path = self.structures[t_id]
     decision = path[level]
     cache_key = tuple(path[:level + 1])

     chosen = self.computed_cache.get(cache_key)
     if not chosen:
         chosen = self.call_eval_point_split(
             data, t_id, level, cur_performance, chosen, decision)
         self.computed_cache[cache_key] = chosen

     counts = chosen['metrics']
     self.selected[t_id][level] = chosen['rule']
     self.performance_on_train[t_id][level] = counts + get_performance(
         counts)

     if level < 3:
         # Mirror this level into the next tree id.
         twin = t_id + 1
         self.selected[twin][level] = self.selected[t_id][level]
         self.performance_on_train[twin][level] = (
             self.performance_on_train[t_id][level])
     if level == 2:
         self.store_cur_selected = chosen

     self.growEven(chosen['undecided'], t_id, level + 1, chosen['metrics'])
Пример #3
0
    def grow(self, data, t_id, level, cur_performance):
        """
        Pick the rule for ``level`` of tree ``t_id`` and recurse downwards.

        Unless a truthy entry for this structure prefix is already in
        ``self.computed_cache``, every column of ``data`` that is neither
        ignored nor the target is tried in both directions (">" and "<").
        The candidate with the lowest score wins; ties keep the earlier
        candidate. The winner is cached, stored in ``self.selected`` and
        ``self.performance_on_train``, and its undecided rows are passed
        on to ``level + 1``.

        :param data: current data for future tree growth
        :param t_id: tree id
        :param level: level id
        :param cur_performance: ``[TP, FP, TN, FN]`` accumulated by the
            levels above; each candidate's own counts are added on top
        :return: None
        """
        if level >= self.max_depth:
            return
        if len(data) == 0:
            print("?????????????????????? Early Ends ???????????????????????")
            return
        self.tree_depths[t_id] = level
        decision = self.structures[t_id][level]
        structure = tuple(self.structures[t_id][:level + 1])
        cur_selected = self.computed_cache.get(structure, None)
        if not cur_selected:
            # Counts inherited from the levels above. These names were read
            # below without ever being bound, which raised NameError on the
            # first candidate.
            TP, FP, TN, FN = cur_performance
            for cue in list(data):
                if cue in self.ignore or cue == self.target:
                    continue

                if (self.median_top == 1 and level == 0) or (
                        self.median_top == 0):
                    threshold = data[cue].median()
                else:
                    # NOTE(review): this passes the whole column as the
                    # threshold; confirm eval_decision expects that.
                    threshold = data[cue]
                for direction in "><":
                    undecided, metrics, loc_auc = self.eval_decision(
                        data, cue, direction, threshold, decision)
                    tp, fp, tn, fn = self.update_metrics(
                        level, self.max_depth, decision, metrics)
                    totals = [TP + tp, FP + fp, TN + tn, FN + fn]
                    # if the decision lead to no data, punish the score
                    if sum([tp, fp, tn, fn]) == 0:
                        score = float('inf')
                    elif self.criteria == "LOC_AUC":
                        score = loc_auc
                    else:
                        score = get_score(self.criteria, totals)
                    # Lower is better; the first candidate always wins.
                    if not cur_selected or score < cur_selected['score']:
                        cur_selected = {
                            'rule': (cue, direction, threshold, decision),
                            'undecided': undecided,
                            'metrics': totals,
                            'score': score,
                        }
            self.computed_cache[structure] = cur_selected
        self.selected[t_id][level] = cur_selected['rule']
        self.performance_on_train[t_id][level] = cur_selected[
            'metrics'] + get_performance(cur_selected['metrics'])
        self.grow(cur_selected['undecided'], t_id, level + 1,
                  cur_selected['metrics'])
Пример #4
0
 def eval_tree(self, t_id):
     """Evaluate tree ``t_id`` on ``self.test`` and cache its performance.

     Each level's selected rule is applied to the rows the previous level
     left undecided, and the per-level TP/FP/TN/FN counts are summed. The
     four totals followed by the seven values returned by
     ``get_performance`` are written to ``self.performance_on_test[t_id]``.

     Returns immediately when ``self.performance_on_test[t_id]`` is
     already truthy.

     :param t_id: tree id
     :return: None
     """
     if self.performance_on_test[t_id]:
         return

     max_level = self.tree_depths[t_id]
     # Start from one empty description list per level.
     self.node_descriptions[t_id] = [[] for _ in range(max_level + 1)]

     total_tp = total_fp = total_tn = total_fn = 0
     rows = self.test
     for level in range(max_level + 1):
         cue, direction, threshold, decision = self.selected[t_id][level]
         outcome = self.eval_decision(
             rows, cue, direction, threshold, decision)
         still_open, metrics, _ = outcome
         tp, fp, tn, fn = self.update_metrics(
             level, max_level, decision, metrics)
         total_tp = total_tp + tp
         total_fp = total_fp + fp
         total_tn = total_tn + tn
         total_fn = total_fn + fn
         # Stop as soon as no rows are left for deeper levels.
         if len(still_open) == 0:
             break
         rows = still_open

     confusion = [total_tp, total_fp, total_tn, total_fn]
     pre, rec, spec, fpr, npv, acc, f1 = get_performance(confusion)
     self.performance_on_test[t_id] = [
         total_tp, total_fp, total_tn, total_fn,
         pre, rec, spec, fpr, npv, acc, f1,
     ]
Пример #5
0
    def grow(self, data, t_id, level, cur_performance):
        """
        Pick the rule for ``level`` of tree ``t_id`` and recurse downwards.

        Unless a truthy entry for this structure prefix is already in
        ``self.computed_cache``, every column of ``data`` that is neither
        ignored nor the target is searched for a split, according to
        ``self.split_method``:

        * ``"MDLP"``: the cue is discretized with ``MDLP``. A bin whose
          interval is open towards -inf or +inf is tried as a point split
          on its finite edge; any other bin is tried as a range split. If
          MDLP yields at most one bin, the cue's median is used instead.
        * ``"percentile"``: the 5%, 10%, ..., 95% quantiles are tried.
        * anything else: the cue's median is tried.

        Point splits are tried in both directions (">" and "<"). The
        resulting selection is cached, stored in ``self.selected`` and
        ``self.performance_on_train``, and its undecided rows are passed
        on to ``level + 1``.

        :param data: current data for future tree growth
        :param t_id: tree id
        :param level: level id
        :param cur_performance: metrics accumulated by the levels above,
            forwarded unchanged to the split evaluators
        :return: None
        """
        if level >= self.max_depth:
            return
        if len(data) == 0:
            print("?????????????????????? Early Ends ???????????????????????")
            return
        self.tree_depths[t_id] = level
        decision = self.structures[t_id][level]
        structure = tuple(self.structures[t_id][:level + 1])
        cur_selected = self.computed_cache.get(structure, None)
        if not cur_selected:
            method = self.split_method
            # Only the MDLP discretizer needs the target column, so it is
            # built here rather than on every call. ``[[col]].values`` is
            # the same single-column 2-D array ``as_matrix(columns=[col])``
            # returned, and also exists on pandas versions without
            # ``as_matrix``.
            Y = data[[self.target]].values if method == "MDLP" else None
            for cue in list(data):
                if cue in self.ignore or cue == self.target:
                    continue
                if method == "MDLP":
                    mdlp = MDLP()
                    X = data[[cue]].values
                    X_disc = mdlp.fit_transform(X, Y)
                    X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                    bins = np.unique(X_disc, axis=0)
                    if len(bins) <= 1:
                        # MDLP return the whole range as one bin, use
                        # median instead.
                        thresholds = [data[cue].median()]
                    else:
                        for bin_label in bins:
                            indexes = np.where(X_disc == bin_label)[0]
                            interval = X_interval[indexes]
                            # Best-effort sanity check: all rows of one bin
                            # are expected to share a single interval.
                            try:
                                if len(np.unique(interval, axis=0)) != 1:
                                    print("???????????????????????????????????????????????????")
                            except Exception:
                                print('ha')
                            interval = interval[0]
                            if interval[0] == float('-inf'):
                                threshold = interval[1]
                            elif interval[1] == float('inf'):
                                threshold = interval[0]
                            else:
                                # Bounded on both sides: range split.
                                cur_selected = self.eval_range_split(
                                    level, cur_selected, cur_performance,
                                    data, cue, indexes, interval, decision)
                                continue
                            for direction in "><":
                                cur_selected = self.eval_point_split(
                                    level, cur_selected, cur_performance,
                                    data, cue, direction, threshold,
                                    decision)
                        continue
                elif method == "percentile":
                    thresholds = set(data[cue].quantile(
                        [x / 20.0 for x in range(1, 20)],
                        interpolation='midpoint'))
                else:
                    thresholds = [data[cue].median()]
                # point split, e.g. median or x% percentiles.
                for threshold in thresholds:
                    for direction in "><":
                        cur_selected = self.eval_point_split(
                            level, cur_selected, cur_performance, data, cue,
                            direction, threshold, decision)

            self.computed_cache[structure] = cur_selected
        self.selected[t_id][level] = cur_selected['rule']
        self.performance_on_train[t_id][level] = cur_selected[
            'metrics'] + get_performance(cur_selected['metrics'])
        self.grow(cur_selected['undecided'], t_id, level + 1,
                  cur_selected['metrics'])