def __prun_tree(self, cur_node):
    '''Post-prune the subtree rooted at cur_node (pessimistic-style pruning).

    Compares the node's own (corrected) error count against the summed
    (corrected) error of its leaves plus one standard deviation; if the
    subtree does not beat the node, the subtree is collapsed into a leaf.
    Otherwise pruning recurses into the children.
    '''
    if len(cur_node.childNode) == 0:  # leaf node: nothing to prune, skip
        return
    else:
        # Label this node as if it were a leaf, so its error can be measured.
        cur_node.cls = get_cls_from_data(cur_node.dataset)
        # +0.5 looks like a continuity (pessimistic) correction — TODO confirm
        # against get_err_sum's definition elsewhere in the file.
        cur_err_sum = get_err_sum(cur_node.cls, cur_node.dataset) + 0.5
        # Collect per-leaf error counts of the whole subtree.
        leaf_err_set = []
        self.leaf_err_sum(cur_node, leaf_err_set)
        # Corrected subtree error: 0.5 per leaf, same continuity correction.
        leaf_e_sum = sum(leaf_err_set) + 0.5 * len(leaf_err_set)
        leaf_err_ratio = leaf_e_sum / len(cur_node.dataset)
        # NOTE(review): this is the per-sample std dev of a Bernoulli rate;
        # classic pessimistic pruning scales by the sample count — verify the
        # formula is the one intended.
        std_dev = np.sqrt(leaf_err_ratio * (1 - leaf_err_ratio))
        if leaf_e_sum + std_dev > cur_err_sum:
            # Subtree is not significantly better than the single node: prune.
            print leaf_e_sum + std_dev, cur_err_sum, " prun!!!!"
            cur_node.childNode = {}
            cur_node.cls = get_cls_from_data(cur_node.dataset)
        else:
            # Keep this split; try pruning deeper subtrees instead.
            for _, c in cur_node.childNode.items():
                self.__prun_tree(c)
def __construct_tree(self, cur_node, attr_list):
    '''Recursively build the decision tree rooted at cur_node.

    cur_node  -- TreeNode whose .dataset holds the training samples; each
                 sample is a sequence whose last element is the class label
                 (see the v[0][-1] access below).
    attr_list -- indices of the attributes still available for splitting.
    '''
    data = cur_node.dataset
    data_classified = {}
    # sys.float_info.min is the smallest *positive* float, so attributes with
    # gain ratio <= 0 are never selected and index stays -1.
    max_gain_ratio, index = sys.float_info.min, -1
    best_border = 0.0
    for idx in attr_list:
        if idx in self.disc_type:  # discrete attribute
            gain_r = self.disc_gain_rt(idx, data)
            num_border = 0.0
        else:                      # numeric attribute
            gain_r, num_border = self.num_gain_rt(idx, data)
        if gain_r > max_gain_ratio:
            max_gain_ratio = gain_r
            index = idx
            # BUG FIX: remember the split border of the *winning* attribute.
            # Previously num_border kept whatever the last numeric attribute
            # evaluated produced, so a numeric attribute examined after the
            # best one overwrote the border used below for cur_node.demark
            # and for partitioning the data.
            best_border = num_border
    if index == -1:  # no attribute qualifies: make this node a leaf
        cur_node.cls = get_cls_from_data(data)
        return
    cur_node.attr_index = index
    if index in self.disc_type:  # discrete attribute
        cur_node.attr_type = 1
        # Partition the data by attribute value.
        for val in self.disc_type[index]:
            data_classified[val] = []
        for d in data:
            data_classified[d[index]].append(d)
    else:                        # numeric attribute
        cur_node.attr_type = 0
        cur_node.demark = best_border
        data_classified[0] = []  # samples strictly below the border
        data_classified[1] = []  # samples at or above the border
        for d in data:
            if d[index] < best_border:
                data_classified[0].append(d)
            else:
                data_classified[1].append(d)
    if len(attr_list) == 1:  # attribute set becomes empty next round
        for k, v in data_classified.items():
            child_node = TreeNode(v)
            # Empty partition: label the child from the parent's data instead.
            if len(v) == 0:
                child_node.cls = get_cls_from_data(data)
            else:
                child_node.cls = get_cls_from_data(v)
            cur_node.childNode[k] = child_node
    else:
        sub_attr = list(attr_list)
        sub_attr.remove(index)
        for k, v in data_classified.items():
            child_node = TreeNode(v)
            if len(v) == 0:
                child_node.cls = get_cls_from_data(data)
            elif check_purity(v) == 1:
                child_node.cls = v[0][-1]  # partition is pure: any sample's label
            else:
                self.__construct_tree(child_node, sub_attr)  # recurse on child
            cur_node.childNode[k] = child_node
def __construct_tree(self, cur_node, attr_list):
    '''Recursively build the decision tree rooted at cur_node.

    NOTE(review): this method appears twice in the file, byte-for-byte
    identical; in a class body the later definition silently overrides the
    earlier one — one of the two copies should be removed.

    cur_node  -- TreeNode whose .dataset holds the training samples; each
                 sample is a sequence whose last element is the class label
                 (see the v[0][-1] access below).
    attr_list -- indices of the attributes still available for splitting.
    '''
    data = cur_node.dataset
    data_classified = {}
    # sys.float_info.min is the smallest *positive* float, so attributes with
    # gain ratio <= 0 are never selected and index stays -1.
    max_gain_ratio, index = sys.float_info.min, -1
    best_border = 0.0
    for idx in attr_list:
        if idx in self.disc_type:  # discrete attribute
            gain_r = self.disc_gain_rt(idx, data)
            num_border = 0.0
        else:                      # numeric attribute
            gain_r, num_border = self.num_gain_rt(idx, data)
        if gain_r > max_gain_ratio:
            max_gain_ratio = gain_r
            index = idx
            # BUG FIX: remember the split border of the *winning* attribute.
            # Previously num_border kept whatever the last numeric attribute
            # evaluated produced, so a numeric attribute examined after the
            # best one overwrote the border used below for cur_node.demark
            # and for partitioning the data.
            best_border = num_border
    if index == -1:  # no attribute qualifies: make this node a leaf
        cur_node.cls = get_cls_from_data(data)
        return
    cur_node.attr_index = index
    if index in self.disc_type:  # discrete attribute
        cur_node.attr_type = 1
        # Partition the data by attribute value.
        for val in self.disc_type[index]:
            data_classified[val] = []
        for d in data:
            data_classified[d[index]].append(d)
    else:                        # numeric attribute
        cur_node.attr_type = 0
        cur_node.demark = best_border
        data_classified[0] = []  # samples strictly below the border
        data_classified[1] = []  # samples at or above the border
        for d in data:
            if d[index] < best_border:
                data_classified[0].append(d)
            else:
                data_classified[1].append(d)
    if len(attr_list) == 1:  # attribute set becomes empty next round
        for k, v in data_classified.items():
            child_node = TreeNode(v)
            # Empty partition: label the child from the parent's data instead.
            if len(v) == 0:
                child_node.cls = get_cls_from_data(data)
            else:
                child_node.cls = get_cls_from_data(v)
            cur_node.childNode[k] = child_node
    else:
        sub_attr = list(attr_list)
        sub_attr.remove(index)
        for k, v in data_classified.items():
            child_node = TreeNode(v)
            if len(v) == 0:
                child_node.cls = get_cls_from_data(data)
            elif check_purity(v) == 1:
                child_node.cls = v[0][-1]  # partition is pure: any sample's label
            else:
                self.__construct_tree(child_node, sub_attr)  # recurse on child
            cur_node.childNode[k] = child_node