def eval_tree(self, t_id):
    """
    Evaluate tree ``t_id`` on ``self.test`` and record the outcome.

    Returns immediately when ``self.performance_on_test[t_id]`` is already
    truthy. Otherwise the tree's selected rules are applied level by level,
    the confusion counts reported by ``self.update_metrics`` are summed, and
    the following are written:

    * ``self.node_descriptions[t_id]``: one fresh empty list per level.
    * ``self.performance_on_test[t_id]``: ``[TP, FP, TN, FN]`` followed by
      the seven values returned by ``get_performance``.
    * ``self.results[t_id]``: a dict with the keys "Accuracy",
      "Dist2Heaven" and "LOC_AUC".

    :param t_id: tree id
    :return: None
    """
    if self.performance_on_test[t_id]:
        # A previous call already stored results for this tree.
        return

    last_level = self.tree_depths[t_id]
    self.node_descriptions[t_id] = [[] for _unused in range(last_level + 1)]

    TP = FP = TN = FN = 0
    remaining = self.test
    for level in range(last_level + 1):
        cue, direction, threshold, decision = self.selected[t_id][level]
        leftover, metrics, _level_auc = self.eval_decision(
            remaining, cue, direction, threshold, decision)
        tp, fp, tn, fn = self.update_metrics(
            level, last_level, decision, metrics)
        TP = TP + tp
        FP = FP + fp
        TN = TN + tn
        FN = FN + fn
        if len(leftover) == 0:
            # Nothing left undecided, so deeper levels have no input.
            break
        remaining = leftover

    pre, rec, spec, fpr, npv, acc, f1 = get_performance([TP, FP, TN, FN])
    self.performance_on_test[t_id] = [
        TP, FP, TN, FN, pre, rec, spec, fpr, npv, acc, f1
    ]

    # The first four stored entries are the raw confusion counts.
    dist2heaven = get_score("Dist2Heaven",
                            self.performance_on_test[t_id][:4])
    # The tree-level value is stored negated.
    loc_auc = -self.get_tree_loc_auc(self.test, t_id)

    summary = {}
    summary["Accuracy"] = self.performance_on_test[t_id][9]
    summary["Dist2Heaven"] = dist2heaven
    summary["LOC_AUC"] = loc_auc
    self.results[t_id] = summary
def growEven(self, data, t_id, level, cur_performance):
    """
    Grow tree ``t_id`` from ``level`` downwards, one rule per level.

    For each level below ``self.max_depth`` the rule is taken from
    ``self.computed_cache`` (keyed by the tuple of structure decisions up to
    and including that level) or, on a miss, obtained from
    ``self.call_eval_point_split`` and cached. The rule and its training
    performance are stored for ``t_id``; for levels 0-2 they are also copied
    to tree ``t_id + 1``, and the level-2 selection is kept in
    ``self.store_cur_selected``. Growth then continues on the rows the rule
    left undecided.

    :param data: current data for future tree growth
    :param t_id: tree id
    :param level: level id to start from
    :param cur_performance: metrics passed on to the split evaluation
    :return: None
    """
    # Iterative form of the tail recursion: each pass handles one level.
    while level < self.max_depth:
        if len(data) == 0:
            print("No data")
            return

        self.tree_depths[t_id] = level
        decision = self.structures[t_id][level]
        cache_key = tuple(self.structures[t_id][:level + 1])

        chosen = self.computed_cache.get(cache_key)
        if not chosen:
            chosen = self.call_eval_point_split(
                data, t_id, level, cur_performance, chosen, decision)
            self.computed_cache[cache_key] = chosen

        self.selected[t_id][level] = chosen['rule']
        self.performance_on_train[t_id][level] = (
            chosen['metrics'] + get_performance(chosen['metrics']))

        if level < 3:
            # Mirror the first three levels into the next tree.
            sibling = t_id + 1
            self.selected[sibling][level] = self.selected[t_id][level]
            self.performance_on_train[sibling][level] = \
                self.performance_on_train[t_id][level]
        if level == 2:
            self.store_cur_selected = chosen

        # Descend: continue with what this level left undecided.
        data = chosen['undecided']
        cur_performance = chosen['metrics']
        level = level + 1
def grow(self, data, t_id, level, cur_performance):
    """
    Select the rule for ``level`` of tree ``t_id`` and recurse on the rest.

    Unless a selection for this structure prefix is already in
    ``self.computed_cache``, every column of ``data`` that is neither in
    ``self.ignore`` nor the target is tried in both directions (">" and
    "<"). The candidate with the lowest score wins and is cached. The
    winning rule and its training performance are stored, then growth
    continues one level deeper on the rows the rule left undecided.

    :param data: current data for future tree growth
    :param t_id: tree id
    :param level: level id
    :param cur_performance: ``[TP, FP, TN, FN]`` counts accumulated by the
        levels above this one
    :return: None
    """
    if level >= self.max_depth:
        return
    if len(data) == 0:
        print("?????????????????????? Early Ends ???????????????????????")
        return

    self.tree_depths[t_id] = level
    decision = self.structures[t_id][level]
    structure = tuple(self.structures[t_id][:level + 1])
    cur_selected = self.computed_cache.get(structure, None)

    if not cur_selected:
        # Running totals from the levels above. These names were previously
        # read without ever being bound in this function.
        TP, FP, TN, FN = cur_performance

        # Loop-invariant: median threshold at the top level only
        # (median_top == 1) or at every level (median_top == 0).
        use_median = ((self.median_top == 1 and level == 0)
                      or (self.median_top == 0))

        for cue in list(data):
            if cue in self.ignore or cue == self.target:
                continue
            if use_median:
                threshold = data[cue].median()
            else:
                # NOTE(review): the whole column is passed as the threshold
                # here; confirm eval_decision is meant to accept that.
                threshold = data[cue]
            for direction in "><":
                undecided, metrics, loc_auc = self.eval_decision(
                    data, cue, direction, threshold, decision)
                tp, fp, tn, fn = self.update_metrics(
                    level, self.max_depth, decision, metrics)
                totals = [TP + tp, FP + fp, TN + tn, FN + fn]
                # if the decision lead to no data, punish the score
                if sum([tp, fp, tn, fn]) == 0:
                    score = float('inf')
                elif self.criteria == "LOC_AUC":
                    score = loc_auc
                else:
                    score = get_score(self.criteria, totals)
                # Lower score is better; ties keep the earlier candidate.
                if not cur_selected or score < cur_selected['score']:
                    cur_selected = {
                        'rule': (cue, direction, threshold, decision),
                        'undecided': undecided,
                        'metrics': totals,
                        'score': score,
                    }
        self.computed_cache[structure] = cur_selected

    self.selected[t_id][level] = cur_selected['rule']
    self.performance_on_train[t_id][level] = cur_selected[
        'metrics'] + get_performance(cur_selected['metrics'])
    self.grow(cur_selected['undecided'], t_id, level + 1,
              cur_selected['metrics'])
def eval_tree(self, t_id):
    """
    Evaluate tree ``t_id`` on ``self.test`` and store its test performance.

    Returns immediately when ``self.performance_on_test[t_id]`` is already
    truthy. Otherwise the tree's selected rules are applied level by level
    to the rows still undecided, the confusion counts reported by
    ``self.update_metrics`` are summed, and
    ``self.performance_on_test[t_id]`` is set to ``[TP, FP, TN, FN]``
    followed by the seven values returned by ``get_performance``.
    ``self.node_descriptions[t_id]`` is reset to one empty list per level.

    :param t_id: tree id
    :return: None
    """
    already_scored = self.performance_on_test[t_id]
    if already_scored:
        return

    deepest = self.tree_depths[t_id]
    level_ids = range(deepest + 1)
    self.node_descriptions[t_id] = [[] for _unused in level_ids]

    TP = FP = TN = FN = 0
    pending = self.test
    for level in level_ids:
        cue, direction, threshold, decision = self.selected[t_id][level]
        still_open, metrics, _level_auc = self.eval_decision(
            pending, cue, direction, threshold, decision)
        tp, fp, tn, fn = self.update_metrics(
            level, deepest, decision, metrics)
        TP = TP + tp
        FP = FP + fp
        TN = TN + tn
        FN = FN + fn
        if len(still_open) == 0:
            # Every row has been decided; deeper levels have no input.
            break
        pending = still_open

    pre, rec, spec, fpr, npv, acc, f1 = get_performance([TP, FP, TN, FN])
    self.performance_on_test[t_id] = [
        TP, FP, TN, FN, pre, rec, spec, fpr, npv, acc, f1
    ]
def grow(self, data, t_id, level, cur_performance):
    """
    Select the rule for ``level`` of tree ``t_id`` and recurse on the rest.

    Unless a selection for this structure prefix is already in
    ``self.computed_cache``, every column of ``data`` that is neither in
    ``self.ignore`` nor the target contributes candidate splits according
    to ``self.split_method``:

    * ``"MDLP"``: the column is discretized with ``MDLP``. A single bin
      falls back to a median point split. Otherwise each bin yields a point
      split when its interval is open-ended (``-inf`` / ``inf``) and a
      range split when it is bounded on both sides.
    * ``"percentile"``: point splits at the 5%, 10%, ..., 95% quantiles
      (midpoint interpolation, duplicates removed).
    * anything else: a single point split at the median.

    Candidates are folded into the running best by
    ``self.eval_point_split`` / ``self.eval_range_split``. The result is
    cached, stored for this tree and level, and growth continues on the
    rows it left undecided.

    :param data: current data for future tree growth
    :param t_id: tree id
    :param level: level id
    :param cur_performance: metrics of the levels above, passed through to
        the split evaluators
    :return: None
    """
    if level >= self.max_depth:
        return
    if len(data) == 0:
        print("?????????????????????? Early Ends ???????????????????????")
        return

    self.tree_depths[t_id] = level
    decision = self.structures[t_id][level]
    structure = tuple(self.structures[t_id][:level + 1])
    cur_selected = self.computed_cache.get(structure, None)

    def try_point_split(best, cue, threshold):
        # Evaluate one threshold in both directions, keeping the best.
        for direction in "><":
            best = self.eval_point_split(
                level, best, cur_performance, data, cue, direction,
                threshold, decision)
        return best

    if not cur_selected:
        # The target matrix is only needed by MDLP, so build it once here.
        # ``.values`` replaces DataFrame.as_matrix (removed in pandas 1.0)
        # and yields the same 2-D array.
        Y = None
        if self.split_method == "MDLP":
            Y = data[[self.target]].values

        for cue in list(data):
            if cue in self.ignore or cue == self.target:
                continue

            if self.split_method == "MDLP":
                mdlp = MDLP()
                X = data[[cue]].values
                X_disc = mdlp.fit_transform(X, Y)
                X_interval = np.asarray(mdlp.cat2intervals(X_disc, 0))
                bins = np.unique(X_disc, axis=0)
                if len(bins) <= 1:
                    # MDLP return the whole range as one bin, use median
                    # instead.
                    cur_selected = try_point_split(
                        cur_selected, cue, data[cue].median())
                    continue
                for bin_id in bins:
                    indexes = np.where(X_disc == bin_id)[0]
                    interval = X_interval[indexes]
                    # Diagnostic only: rows of one bin are expected to
                    # share a single interval. A failure of the check is
                    # reported and otherwise ignored.
                    try:
                        if len(np.unique(interval, axis=0)) != 1:
                            print("???????????????????????????????????????????????????")
                    except Exception:
                        print('ha')
                    interval = interval[0]
                    if interval[0] == float('-inf'):
                        # Open below: split at the upper bound.
                        cur_selected = try_point_split(
                            cur_selected, cue, interval[1])
                    elif interval[1] == float('inf'):
                        # Open above: split at the lower bound.
                        cur_selected = try_point_split(
                            cur_selected, cue, interval[0])
                    else:
                        cur_selected = self.eval_range_split(
                            level, cur_selected, cur_performance, data,
                            cue, indexes, interval, decision)
                continue
            elif self.split_method == "percentile":
                thresholds = set(data[cue].quantile(
                    [x / 20.0 for x in range(1, 20)],
                    interpolation='midpoint'))
            else:
                thresholds = [data[cue].median()]

            # point split, e.g. median or x% percentiles.
            for threshold in thresholds:
                cur_selected = try_point_split(cur_selected, cue, threshold)

        self.computed_cache[structure] = cur_selected

    self.selected[t_id][level] = cur_selected['rule']
    self.performance_on_train[t_id][level] = cur_selected[
        'metrics'] + get_performance(cur_selected['metrics'])
    self.grow(cur_selected['undecided'], t_id, level + 1,
              cur_selected['metrics'])