def _build_tree(self, X, y, current_depth=0):
    """Recursively grow a (sub)tree for the samples in X with targets y.

    Every unique value of every feature column is tried as a threshold.
    The candidate whose score from ``self._impurity_calculation`` is the
    largest is remembered, and if that score exceeds ``self.min_impurity``
    the two partitions are grown recursively.  Otherwise a leaf is built.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Targets for X; a 1-D input is expanded to a single column.
        current_depth: Depth of the node being built (0 for the root).

    Returns:
        A ``DecisionNode``: an internal node carrying the chosen feature
        index, threshold and both branches, or a leaf carrying the value
        from ``self._leaf_value_calculation(y)``.
    """
    # y must be 2-D so it can be appended to X as trailing column(s).
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    top_score = 0
    best_criteria = None
    best_sets = None

    can_split = (n_samples >= self.min_samples_split
                 and current_depth <= self.max_depth)
    if can_split:
        for col in range(n_features):
            for cut in np.unique(X[:, col]):
                part_a, part_b = divide_on_feature(Xy, col, cut)
                # A split that leaves one side empty separates nothing.
                if len(part_a) == 0 or len(part_b) == 0:
                    continue

                score = self._impurity_calculation(
                    y, part_a[:, n_features:], part_b[:, n_features:])
                if score > top_score:
                    top_score = score
                    best_criteria = {"feature_i": col, "threshold": cut}
                    best_sets = {
                        "leftX": part_a[:, :n_features],
                        "lefty": part_a[:, n_features:],
                        "rightX": part_b[:, :n_features],
                        "righty": part_b[:, n_features:],
                    }

    if top_score > self.min_impurity:
        next_depth = current_depth + 1
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], next_depth)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], next_depth)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # No split was good enough: this node becomes a leaf.
    return DecisionNode(value=self._leaf_value_calculation(y))
def _build_tree(self, X, y, current_depth=0):
    """Build the decision tree recursively.

    X (and the matching rows of y) is split on the feature/threshold pair
    that scores highest under ``self._impurity_calculation``.

    Args:
        X: 2-D array, one row per sample and one column per feature.
        y: Targets aligned with X; expanded to a column if 1-D.
        current_depth: Depth of this node, incremented on each recursion.

    Returns:
        ``DecisionNode`` - a decision node with ``true_branch`` and
        ``false_branch`` when a split scoring above ``self.min_impurity``
        was found, otherwise a leaf whose value comes from
        ``self._leaf_value_calculation``.
    """
    # Make y two-dimensional, then append it to X as the last column(s).
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    best_gain = 0
    best_criteria = None  # winning feature index and threshold
    best_sets = None      # data subsets produced by the winning split

    if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
        # Every (feature, unique value of that feature) pair is a candidate.
        candidates = (
            (feature_i, threshold)
            for feature_i in range(n_features)
            for threshold in np.unique(X[:, feature_i])
        )
        for feature_i, threshold in candidates:
            below, above = divide_on_feature(Xy, feature_i, threshold)
            if len(below) > 0 and len(above) > 0:
                # Score the split using only the target columns.
                gain = self._impurity_calculation(
                    y, below[:, n_features:], above[:, n_features:])

                # Keep the split if it beats everything seen so far.
                if gain > best_gain:
                    best_gain = gain
                    best_criteria = {
                        "feature_i": feature_i,
                        "threshold": threshold,
                    }
                    best_sets = {
                        "leftX": below[:, :n_features],   # X of left subtree
                        "lefty": below[:, n_features:],   # y of left subtree
                        "rightX": above[:, :n_features],  # X of right subtree
                        "righty": above[:, n_features:],  # y of right subtree
                    }

    if best_gain > self.min_impurity:
        # Grow the subtree for each side of the winning split.
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], current_depth + 1)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], current_depth + 1)
        return DecisionNode(
            feature_i=best_criteria["feature_i"],
            threshold=best_criteria["threshold"],
            true_branch=true_branch,
            false_branch=false_branch,
        )

    # Leaf: determine the value stored at this node.
    leaf_value = self._leaf_value_calculation(y)
    return DecisionNode(value=leaf_value)
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the decision tree.

    X and y are coerced to arrays (y as one row per sample), then every
    unique value of every feature is tried as a threshold.  The split with
    the highest ``self._impurity_calculation`` score is used to recurse.

    Args:
        X: Array-like of samples by features (must be 2-D once converted).
        y: Array-like of targets with one entry (or row) per sample.
        current_depth: Depth of the node being built.

    Returns:
        ``DecisionNode`` - an internal node for the best split when its
        score exceeds ``self.min_impurity``, otherwise a leaf holding
        ``self._leaf_value_calculation(y)``.
    """
    X = np.array(X)
    y = np.array(y).reshape(len(y), -1)
    n_samples, n_features = np.shape(X)

    highest_gain = 0
    best_criteria = None  # feature index and threshold of the best split
    best_sets = None      # data subsets of the best split

    if (n_samples >= self.min_samples_split
            and current_depth <= self.max_depth):
        for feature_idx in range(n_features):
            # Try each distinct value of this feature as the threshold.
            for threshold in np.unique(X[:, feature_idx]):
                # divide_on_feature hands back two row selectors here,
                # which are applied to X and y alike.
                rows_a, rows_b = divide_on_feature(X, feature_idx, threshold)
                X_a = X[rows_a, :]
                X_b = X[rows_b, :]
                y_a = y[rows_a, :]
                y_b = y[rows_b, :]

                # Skip splits that leave one side empty.
                if len(X_a) == 0 or len(X_b) == 0:
                    continue

                gain = self._impurity_calculation(y, y_a, y_b)
                # Remember the split if it has the highest score so far.
                if gain > highest_gain:
                    highest_gain = gain
                    best_criteria = {
                        "feature_idx": feature_idx,
                        "threshold": threshold,
                    }
                    best_sets = {
                        "leftX": X_a,
                        "lefty": y_a,
                        "rightX": X_b,
                        "righty": y_b,
                    }

    if highest_gain > self.min_impurity:
        # Build the subtrees on either side of the split.
        child_depth = current_depth + 1
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], child_depth)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], child_depth)
        return DecisionNode(feature_idx=best_criteria["feature_idx"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # Reached a leaf: compute the value it should hold.
    return DecisionNode(value=self._leaf_value_calculation(y))
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the decision tree.

    Splits X and the respective y on the feature/threshold that scores
    highest under ``self._impurity_calculation``.

    Args:
        X: 2-D array, rows are samples and columns are features.
        y: Targets for X; a 1-D y such as ``[1, 2, 1, 1]`` is turned into
            the column ``[[1], [2], [1], [1]]``.
        current_depth: Depth of the current node.

    Returns:
        ``DecisionNode`` - an internal node for the best split, or a leaf
        carrying ``self._leaf_value_calculation(y)`` when no split scores
        above ``self.min_impurity``.
    """
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)

    # Rows become [feature_1, feature_2, ..., target].
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    # Column selectors for the feature part and the target part of Xy.
    feature_cols = slice(None, n_features)
    target_cols = slice(n_features, None)

    max_gain = 0
    best_criteria = None  # feature index and threshold
    best_sets = None      # subsets of the data

    if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
        for feature_i in range(n_features):
            # Sorted distinct values of this feature, e.g. [4.3 4.4 ... 7.9].
            thresholds = np.unique(X[:, feature_i])

            for threshold in thresholds:
                # Partition the rows by whether feature_i meets the threshold.
                met, unmet = divide_on_feature(Xy, feature_i, threshold)
                if len(met) > 0 and len(unmet) > 0:
                    y_met = met[:, target_cols]
                    y_unmet = unmet[:, target_cols]

                    # Score of this split point (information gain).
                    gain = self._impurity_calculation(y, y_met, y_unmet)

                    # Record the feature index and threshold when this
                    # split scores higher than any previous one.
                    if gain > max_gain:
                        max_gain = gain
                        best_criteria = {
                            "feature_i": feature_i,
                            "threshold": threshold,
                        }
                        best_sets = {
                            "leftX": met[:, feature_cols],     # X of left subtree
                            "lefty": met[:, target_cols],      # y of left subtree
                            "rightX": unmet[:, feature_cols],  # X of right subtree
                            "righty": unmet[:, target_cols],   # y of right subtree
                        }

    if max_gain > self.min_impurity:
        # Build subtrees for the left and right branches.
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], current_depth + 1)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # We're at a leaf => determine its value.
    # NOTE(review): the original author's note says the leaf rule picks the
    # most frequent class in the data; that rule lives in
    # _leaf_value_calculation and is not visible here - confirm.
    leaf_value = self._leaf_value_calculation(y)
    return DecisionNode(value=leaf_value)
def _build_tree(self, X, y, feature_bins, current_depth=0):
    """Recursively build the tree, scoring candidate splits via a worker.

    For every feature in ``feature_bins`` and every candidate threshold in
    that feature's bin, the data is partitioned and the targets are handed
    to ``self._send_data``.  The best score, the best split criteria and
    the leaf value are then read back from the module-level ``results``
    queue.  If the score exceeds ``self.min_impurity`` the winning
    feature's bin is divided between the two children and both subtrees
    are built recursively.

    Args:
        X: 2-D array of samples by features.
        y: Targets for X; expanded to a column when 1-D.
        feature_bins: Indexable container mapping a feature index to its
            candidate thresholds.  It is no longer modified by this call.
        current_depth: Depth of the node being built.

    Returns:
        ``DecisionNode`` - an internal node for the chosen split, or a
        leaf carrying the value reported through ``results``.
    """
    import copy

    largest_impurity = 0
    best_criteria = None  # Feature index and threshold

    # Check if expansion of y is needed
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)

    # Add y as last column of X
    Xy = np.concatenate((X, y), axis=1)
    _, n_features = np.shape(X)

    if current_depth <= self.max_depth:
        # Send the target partitions of every candidate split for scoring.
        for feature_i in range(len(feature_bins)):
            for threshold in feature_bins[feature_i]:
                Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
                y1 = np.atleast_2d(Xy1)[:, n_features:]
                y2 = np.atleast_2d(Xy2)[:, n_features:]
                self._send_data(y, y1, y2)

        # NOTE(review): lock6/lock4 look like a handshake with the process
        # that fills `results` - confirm against the worker side.
        lock6.acquire()
        largest_impurity = results.get()
        best_criteria = results.get()
        leaf_value = results.get()
        lock4.release()

        # Re-create the partitions of the winning split.
        Xy1, Xy2 = divide_on_feature(Xy, best_criteria["feature_i"],
                                     best_criteria["threshold"])
        best_sets = {
            "leftX": np.atleast_2d(Xy1)[:, :n_features],   # X of left subtree
            "lefty": np.atleast_2d(Xy1)[:, n_features:],   # y of left subtree
            "rightX": np.atleast_2d(Xy2)[:, :n_features],  # X of right subtree
            "righty": np.atleast_2d(Xy2)[:, n_features:],  # y of right subtree
        }

    if largest_impurity > self.min_impurity:
        split_feature = best_criteria["feature_i"]
        split_threshold = best_criteria["threshold"]
        bin_values = feature_bins[split_feature]

        # NOTE(review): feature 1 is split with ">=" and every other
        # feature with "==" - presumably feature 1 is the only numeric
        # column; confirm.
        if split_feature == 1:
            goes_left = [sample >= split_threshold for sample in bin_values]
        else:
            goes_left = [sample == split_threshold for sample in bin_values]

        l_bin = np.array(
            [s for s, left in zip(bin_values, goes_left) if left])
        r_bin = np.array(
            [s for s, left in zip(bin_values, goes_left) if not left])

        # Each child gets its own shallow copy of the bins.  Previously
        # l_bins and r_bins were the same object as feature_bins, so the
        # second assignment overwrote the first: both children received
        # r_bin and the caller's bins were changed too.
        l_bins = copy.copy(feature_bins)
        r_bins = copy.copy(feature_bins)
        l_bins[split_feature] = l_bin
        r_bins[split_feature] = r_bin

        # Build subtrees for the left and right branches
        true_branch = self._build_tree(best_sets["leftX"],
                                       best_sets["lefty"], l_bins,
                                       current_depth + 1)
        false_branch = self._build_tree(best_sets["rightX"],
                                        best_sets["righty"], r_bins,
                                        current_depth + 1)
        return DecisionNode(feature_i=split_feature,
                            threshold=split_threshold,
                            true_branch=true_branch,
                            false_branch=false_branch)

    # We're at leaf => use the value reported by the worker.
    # NOTE(review): leaf_value is only assigned when the depth check above
    # passes - confirm current_depth can never exceed self.max_depth here.
    return DecisionNode(value=leaf_value)
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the decision tree for samples X and targets y.

    The data is split on the feature and threshold that give the highest
    ``self._impurity_calculation`` score.  Recursion stops at a node when
    no candidate split scores above ``self.min_impurity``.

    Args:
        X: 2-D array of samples (rows) and features (columns).
        y: Targets belonging to X; 1-D input becomes a single column.
        current_depth: Depth of this node in the tree.

    Returns:
        ``DecisionNode`` - either a leaf holding
        ``self._leaf_value_calculation(y)`` or an internal node with the
        split's feature index, threshold and the two child subtrees.
    """
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)

    # y rides along as the trailing column(s) of the combined table.
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    def as_sets(left_rows, right_rows):
        """Cut two row partitions of Xy back into their X and y parts."""
        return {
            "leftX": left_rows[:, :n_features],    # X of left subtree
            "lefty": left_rows[:, n_features:],    # y of left subtree
            "rightX": right_rows[:, :n_features],  # X of right subtree
            "righty": right_rows[:, n_features:],  # y of right subtree
        }

    winning_score = 0
    best_criteria = None
    best_sets = None

    if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
        for feature_i in range(n_features):
            # Each distinct value of the column is a candidate threshold.
            for threshold in np.unique(X[:, feature_i]):
                left_rows, right_rows = divide_on_feature(
                    Xy, feature_i, threshold)
                if len(left_rows) > 0 and len(right_rows) > 0:
                    score = self._impurity_calculation(
                        y,
                        left_rows[:, n_features:],
                        right_rows[:, n_features:],
                    )
                    # A higher score than any earlier one wins.
                    if score > winning_score:
                        winning_score = score
                        best_criteria = {"feature_i": feature_i,
                                         "threshold": threshold}
                        best_sets = as_sets(left_rows, right_rows)

    if not (winning_score > self.min_impurity):
        # We're at a leaf => determine its value.
        leaf_value = self._leaf_value_calculation(y)
        return DecisionNode(value=leaf_value)

    # Build subtrees for the left and right branches.
    true_branch = self._build_tree(
        best_sets["leftX"], best_sets["lefty"], current_depth + 1)
    false_branch = self._build_tree(
        best_sets["rightX"], best_sets["righty"], current_depth + 1)
    return DecisionNode(feature_i=best_criteria["feature_i"],
                        threshold=best_criteria["threshold"],
                        true_branch=true_branch,
                        false_branch=false_branch)
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the tree using the configured split criterion.

    With ``self.criterion == "entropy"`` the split with the largest
    ``self._impurity_calculation`` value is kept; with ``"gini"`` the
    split with the smallest value is kept.  Any other criterion never
    records a split, so the node becomes a leaf.

    Args:
        X: 2-D array of samples by features.
        y: Targets for X; expanded to a column when 1-D.
        current_depth: Depth of the node being built.

    Returns:
        ``DecisionNode`` - an internal node when a split was recorded and
        the tracked value exceeds ``self.min_impurity``, otherwise a leaf
        holding ``self._leaf_value_calculation(y)``.
    """
    # Ensure y is a column, then append it to X.
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    highest_gain = 0   # best value seen under "entropy"
    lowest_gini = 999  # best value seen under "gini" (starts above any seen)
    best_criteria = None
    best_sets = None

    if n_samples >= self.min_samples_split \
            and current_depth <= self.max_depth:
        for feature_i in range(n_features):
            # Candidate thresholds: the distinct values of this column.
            for threshold in np.unique(X[:, feature_i]):
                side_a, side_b = divide_on_feature(Xy, feature_i, threshold)
                if len(side_a) == 0 or len(side_b) == 0:
                    continue

                # Score the split from the target columns of both sides.
                value = self._impurity_calculation(
                    y, side_a[:, n_features:], side_b[:, n_features:])

                improved = False
                if self.criterion == "entropy":
                    if value > highest_gain:
                        highest_gain = value
                        improved = True
                elif self.criterion == "gini":
                    if value < lowest_gini:
                        lowest_gini = value
                        improved = True

                if improved:
                    best_criteria = {
                        "feature_i": feature_i,
                        "threshold": threshold,
                    }
                    best_sets = {
                        "leftX": side_a[:, :n_features],
                        "lefty": side_a[:, n_features:],
                        "rightX": side_b[:, :n_features],
                        "righty": side_b[:, n_features:],
                    }

    # Value compared against min_impurity depends on the criterion.
    deciding_value = lowest_gini if self.criterion == "gini" else highest_gain

    if deciding_value > self.min_impurity and best_criteria is not None:
        # Build subtrees for the left and right branches.
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], current_depth + 1)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # We are at a leaf => determine its value.
    return DecisionNode(value=self._leaf_value_calculation(y))
def _build_tree(self, X, y, current_depth=0):
    """Recursively build the decision tree.

    Every unique value of every feature is tried as a threshold and the
    split with the highest ``self._impurity_calculation`` score is kept.
    With the local ``accelerate`` flag on, a row-index column is prepended
    to the combined table and partitions come from ``divide_on_feature2``,
    which also receives a copy of the table sorted by the current feature.

    Args:
        X: 2-D array of samples by features.
        y: Targets for X; a 1-D y is expanded, e.g. (366,) -> (366, 1).
        current_depth: Depth of the node being built.

    Returns:
        ``DecisionNode`` - an internal node holding only the split
        information and the child subtrees (no data), or a leaf whose
        value comes from ``self._leaf_value_calculation(y)``.
    """
    # Choose between the pre-sorted (accelerated) and the plain scan.
    accelerate = True

    # Turn a one-dimensional y into a column.
    if len(np.shape(y)) == 1:
        y = np.expand_dims(y, axis=1)

    # Join X and y into one table.
    Xy = np.concatenate((X, y), axis=1)
    n_samples, n_features = np.shape(X)

    # Prepend a row-index column; features then start at column 1.
    if accelerate:
        row_ids = np.arange(len(Xy)).reshape((-1, 1))
        Xy = np.hstack((row_ids, Xy))

    top_gain = 0
    best_criteria = None  # feature index and threshold
    best_sets = None      # subsets of the data

    if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
        for feature_i in range(n_features):
            # All distinct values of feature i.
            unique_values = np.unique(X[:, feature_i])

            # Sort the table by the current feature, numeric values only.
            # NOTE(review): sorted_Xy is left unbound (first feature) or
            # stale (later features) when this test fails, e.g. for numpy
            # integer scalars - confirm what divide_on_feature2 expects.
            if isinstance(unique_values[0], (int, float)):
                sort_col = feature_i + 1  # +1 skips the index column
                sorted_Xy = np.array(
                    sorted(Xy, key=lambda row: row[sort_col]))

            for threshold in unique_values:
                # Partition the table at this threshold.
                if accelerate:
                    part_a, part_b = divide_on_feature2(
                        Xy, sorted_Xy, feature_i, threshold)
                else:
                    part_a, part_b = divide_on_feature(
                        Xy, feature_i, threshold)

                # Only splits with two non-empty sides are scored.
                if len(part_a) == 0 or len(part_b) == 0:
                    continue

                # NOTE(review): the last TWO columns are taken as targets,
                # which presumably assumes a two-column y - confirm.
                gain = self._impurity_calculation(
                    y, part_a[:, -2:], part_b[:, -2:])

                # Keep the split with the largest gain over not splitting.
                if gain > top_gain:
                    top_gain = gain
                    best_criteria = {"feature_i": feature_i,
                                     "threshold": threshold}
                    # NOTE(review): if divide_on_feature2 returns rows that
                    # still carry the index column, [:, :n_features] picks
                    # it up and drops the last feature - confirm.
                    best_sets = {
                        "leftX": part_a[:, :n_features],   # X of left subtree
                        "lefty": part_a[:, -2:],           # y of left subtree
                        "rightX": part_b[:, :n_features],  # X of right subtree
                        "righty": part_b[:, -2:],          # y of right subtree
                    }

    # Split only when the gain is above the threshold.
    if top_gain > self.min_impurity:
        # Keep splitting the left and right subtrees.
        true_branch = self._build_tree(
            best_sets["leftX"], best_sets["lefty"], current_depth + 1)
        false_branch = self._build_tree(
            best_sets["rightX"], best_sets["righty"], current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # No further split: this is a leaf, so compute its prediction value.
    leaf_value = self._leaf_value_calculation(y)
    return DecisionNode(value=leaf_value)