def __build_tree(self, X, y, n_features, feature_indices, depth):
    """Recursively build a regression (sub)tree and return its root.

    Args:
        X: feature matrix for the samples reaching this node (it is
            column-stacked with ``y`` and stored on the created node).
        y: target values, one per row of ``X``.
        n_features: size of the range from which candidate features are
            randomly sampled for each child.
        feature_indices: candidate feature indices passed to ``find_split``
            for this node.
        depth: depth of this node, or ``None`` to disable the depth limit.

    Returns:
        A ``Leaf`` carrying ``np.mean(y)`` when the node holds at most
        ``self.min_samples_split`` samples or ``depth`` equals
        ``self.max_depth``; otherwise a ``Node`` whose ``branch_true`` and
        ``branch_false`` are built recursively.
    """
    node_data_set = np.column_stack((X, y))
    sample_size = len(y)

    # Stopping rule: too few samples, or the depth limit has been reached.
    depth_limit_hit = depth is not None and depth == self.max_depth
    if sample_size <= self.min_samples_split or depth_limit_hit:
        return Leaf(
            estimated_value=np.mean(y),
            sample_size=sample_size,
            leaf_data_set=node_data_set,
        )

    # Find the splitting feature and the best split point.
    best_feature_index, threshold, min_mes = find_split(
        X, y, self.criterion, feature_indices)
    X_true, y_true, X_false, y_false = split(
        X, y, best_feature_index, threshold)

    node = Node(
        feature_index=best_feature_index,
        threshold=threshold,
        min_mes=min_mes,
        sample_size=sample_size,
        node_data_set=node_data_set,
    )

    # A None depth means "untracked"; keep it None for the children instead
    # of evaluating None + 1, which would raise TypeError.
    child_depth = None if depth is None else depth + 1
    n_sub_features = int(self.max_features)

    # Each child gets its own random subset of candidate features.
    feature_indices = random.sample(range(n_features), n_sub_features)
    node.branch_true = self.__build_tree(
        X_true, y_true, n_features, feature_indices, child_depth)

    feature_indices = random.sample(range(n_features), n_sub_features)
    node.branch_false = self.__build_tree(
        X_false, y_false, n_features, feature_indices, child_depth)

    return node
def build_tree(self, X, y, feature_indices, fa_feature_index, select_feature_fa, father_node, depth):
    """Recursively build a decision tree and return its root.

    Args:
        X: feature matrix for the samples reaching this node.
        y: labels, one per row of ``X``.
        feature_indices: randomly selected candidate features handed to
            ``find_split`` for this node.
        fa_feature_index: feature the parent node split on (-1 at the
            root).
        select_feature_fa: list of split features chosen by the ancestors
            of this node; ``fa_feature_index`` is appended to it in place.
        father_node: the parent node, stored as ``prior_node`` on the node
            or leaf created here.
        depth: depth of this node.

    Returns:
        A ``Node`` for internal nodes, or a ``Leaf`` when a stopping rule
        fires or the chosen split leaves one side empty. With
        ``criterion == 'entropy'`` the stopping rule returns the bare
        majority label ``mode(y)[0][0]`` instead of a ``Leaf``.

    Side effects:
        Appends to ``self.select_feature``, ``self.sample_num``,
        ``self.gini_`` (gini criterion only) and ``self.leaf_list``.
    """
    select_feature_fa.append(fa_feature_index)
    n_features = X.shape[1]
    n_features_list = list(range(n_features))

    # Bookkeeping: remember the candidate features and the sample count.
    self.select_feature.append(feature_indices)
    self.sample_num.append(len(y))
    node_data_set = np.column_stack((X, y))

    def pick_leaf_feature():
        """Pick the leaf's second feature and return (index, max, min).

        A feature not yet used on the path from the root is preferred.
        When none is left, any feature may be chosen. Testing for an empty
        remainder (rather than comparing the two sets for equality) also
        covers the root sentinel -1 in ``select_feature_fa``, which would
        otherwise lead to ``random.randrange(0)`` raising ValueError.
        """
        unused = list(set(n_features_list) - set(select_feature_fa))
        pool = unused if unused else n_features_list
        chosen = pool[random.randrange(len(pool))]
        column = X[:, chosen]
        return chosen, np.max(column), np.min(column)

    # Stopping rule for the entropy criterion.
    if self.criterion == 'entropy':
        # Compare by value: `is` on numbers tests object identity and is
        # never true for a float entropy of 0.0.
        if depth == self.max_depth or len(y) < self.min_samples_split or entropy(y) == 0:
            return mode(y)[0][0]  # majority label of y

    # Stopping rule for the gini criterion.
    if self.criterion == 'gini':
        temp_gini = gini(y)
        self.gini_.append(temp_gini)
        sample_num = len(y)
        if (depth == self.max_depth
                or sample_num < self.min_samples_split
                or temp_gini < self.min_impurity_split):
            # The leaf records two features: the parent's split feature
            # and one additional, randomly chosen feature.
            current_feature_index, current_max_value, current_min_value = pick_leaf_feature()
            leaf = Leaf(
                mode(y)[0][0],
                fa_feature_index,
                np.max(X[:, fa_feature_index]),
                np.min(X[:, fa_feature_index]),
                current_feature_index,
                current_max_value,
                current_min_value,
                select_feature_fa,
                node_data_set,
                sample_num,
                prior_node=father_node,
            )
            self.leaf_list.append(leaf)
            return leaf

    # Best split feature, its threshold, the feature's max/min values and
    # the impurity reported by find_split.
    feature_index, threshold, max_value, min_value, gini_ = find_split(
        X, y, self.criterion, feature_indices)
    fa_max_value = np.max(X[:, fa_feature_index])  # max of the parent's split feature
    fa_min_value = np.min(X[:, fa_feature_index])  # min of the parent's split feature

    # Partition the samples into the true / false branches.
    X_true, y_true, X_false, y_false = split(X, y, feature_index, threshold)

    # Degenerate split: one side received no samples, so emit a leaf.
    if y_true.shape[0] == 0 or y_false.shape[0] == 0:
        current_feature_index, current_max_value, current_min_value = pick_leaf_feature()
        # NOTE(review): sample_num is recorded as 0 here, unchanged from
        # the original code - confirm this is intended.
        leaf = Leaf(
            mode(y)[0][0],
            fa_feature_index,
            np.max(X[:, fa_feature_index]),
            np.min(X[:, fa_feature_index]),
            current_feature_index,
            current_max_value,
            current_min_value,
            select_feature_fa,
            node_data_set,
            prior_node=father_node,
            sample_num=0,
        )
        self.leaf_list.append(leaf)
        return leaf

    node = Node(
        feature_index=feature_index,
        fa_feature_index=fa_feature_index,
        threshold=threshold,
        max_value=max_value,
        min_value=min_value,
        fa_max_value=fa_max_value,
        fa_min_value=fa_min_value,
        gini_coefficient=gini_,
        node_data_set=node_data_set,
    )

    n_sub_features = int(self.max_features)

    # True branch: reuses this node's candidate features (no resampling)
    # and receives its own copy of the ancestor feature list.
    node.branch_true = self.build_tree(
        X_true, y_true, feature_indices, feature_index,
        list(select_feature_fa), node, depth + 1)

    # False branch: draws a fresh random subset of candidate features.
    feature_indices = random.sample(range(n_features), n_sub_features)
    node.branch_false = self.build_tree(
        X_false, y_false, feature_indices, feature_index,
        list(select_feature_fa), node, depth + 1)

    node.prior_node = father_node  # link back to the predecessor node
    return node