예제 #1
0
    def _build_tree(self, X, y, current_depth=0):
        """Grow the tree recursively from the samples ``X`` and targets ``y``.

        Every unique value of every feature column is tried as a split
        threshold, and the candidate with the highest score from
        ``self._impurity_calculation`` is kept.  When that score exceeds
        ``self.min_impurity`` an internal ``DecisionNode`` with two
        recursively built children is returned; otherwise a leaf node whose
        value comes from ``self._leaf_value_calculation`` is returned.

        Args:
            X: 2-D sample matrix (rows are samples, columns are features).
            y: Targets; a 1-D input is promoted to a single column.
            current_depth: Depth of the node being built (0 for the root).

        Returns:
            The ``DecisionNode`` at the root of the (sub)tree.
        """
        # Targets are stacked next to X below, so they must form a column
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        data = np.concatenate((X, y), axis=1)
        n_samples, n_features = np.shape(X)

        top_score = 0
        split_rule = None
        partitions = None

        may_split = (n_samples >= self.min_samples_split
                     and current_depth <= self.max_depth)
        if may_split:
            for col in range(n_features):
                for cut in np.unique(X[:, col]):
                    part_a, part_b = divide_on_feature(data, col, cut)
                    # A split that leaves one side empty separates nothing
                    if len(part_a) == 0 or len(part_b) == 0:
                        continue

                    # Target columns sit to the right of the feature columns
                    targets_a = part_a[:, n_features:]
                    targets_b = part_b[:, n_features:]
                    score = self._impurity_calculation(y, targets_a, targets_b)

                    if score > top_score:
                        top_score = score
                        split_rule = {"feature_i": col, "threshold": cut}
                        partitions = {
                            "leftX": part_a[:, :n_features],
                            "lefty": targets_a,
                            "rightX": part_b[:, :n_features],
                            "righty": targets_b,
                        }

        # No candidate was good enough: this node becomes a leaf
        if not top_score > self.min_impurity:
            return DecisionNode(value=self._leaf_value_calculation(y))

        true_branch = self._build_tree(partitions["leftX"],
                                       partitions["lefty"],
                                       current_depth + 1)
        false_branch = self._build_tree(partitions["rightX"],
                                        partitions["righty"],
                                        current_depth + 1)
        return DecisionNode(feature_i=split_rule["feature_i"],
                            threshold=split_rule["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)
    def _build_tree(self, X, y, current_depth=0):
        """Recursively construct the (sub)tree for samples X and targets y.

        Each unique value of each feature is tried as a threshold and the
        split scoring highest in ``self._impurity_calculation`` is retained.
        If that score is above ``self.min_impurity`` the data is split and
        both parts are built recursively; otherwise a leaf is returned.
        """
        # Promote 1-D targets to a single column
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Keep samples and targets side by side so one split divides both
        joined = np.concatenate((X, y), axis=1)
        n_samples, n_features = np.shape(X)

        expandable = (n_samples >= self.min_samples_split
                      and current_depth <= self.max_depth)
        if expandable:
            # Lazily enumerate every (feature, threshold) pair to evaluate
            candidates = ((feature_i, threshold)
                          for feature_i in range(n_features)
                          for threshold in np.unique(X[:, feature_i]))
        else:
            # Too few samples or too deep: nothing to evaluate
            candidates = ()

        best_gain = 0
        chosen = None  # Feature index and threshold of the best split
        halves = None  # Data subsets produced by the best split
        for feature_i, threshold in candidates:
            first, second = divide_on_feature(joined, feature_i, threshold)

            if len(first) > 0 and len(second) > 0:
                # Targets are the columns after the n_features feature columns
                first_y = first[:, n_features:]
                second_y = second[:, n_features:]

                gain = self._impurity_calculation(y, first_y, second_y)

                # Remember this split if it beats everything seen so far
                if gain > best_gain:
                    best_gain = gain
                    chosen = {"feature_i": feature_i, "threshold": threshold}
                    halves = {
                        "leftX": first[:, :n_features],
                        "lefty": first_y,
                        "rightX": second[:, :n_features],
                        "righty": second_y,
                    }

        if best_gain > self.min_impurity:
            # Recurse into both partitions, one level deeper
            next_depth = current_depth + 1
            true_branch = self._build_tree(halves["leftX"], halves["lefty"],
                                           next_depth)
            false_branch = self._build_tree(halves["rightX"], halves["righty"],
                                            next_depth)
            return DecisionNode(feature_i=chosen["feature_i"],
                                threshold=chosen["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)

        # No worthwhile split => this node is a leaf
        return DecisionNode(value=self._leaf_value_calculation(y))
예제 #3
0
    def _build_tree(self, X, y, current_depth=0):
        """
        Recursively build the decision tree.

        X and y are converted to arrays (y reshaped to one row per sample).
        Every unique value of every feature is tried as a threshold; the
        split with the highest score from ``self._impurity_calculation`` is
        kept, and the node is split only if that score exceeds
        ``self.min_impurity``. Otherwise a leaf node is returned.
        """
        X = np.array(X)
        y = np.array(y).reshape(len(y), -1)

        n_samples, n_features = np.shape(X)

        top_score = 0
        rule = None  # feature index and threshold of the best split
        subsets = None  # data on each side of the best split

        splittable = (n_samples >= self.min_samples_split
                      and current_depth <= self.max_depth)
        if splittable:
            for feature_idx in range(n_features):
                # every distinct value of the column is a candidate threshold
                for threshold in np.unique(X[:, feature_idx]):
                    # divide_on_feature yields two row selectors for X
                    sel_a, sel_b = divide_on_feature(X, feature_idx, threshold)

                    X_a, X_b, y_a, y_b = (X[sel_a, :], X[sel_b, :],
                                          y[sel_a, :], y[sel_b, :])

                    # ignore splits that leave one side empty
                    if len(X_a) == 0 or len(X_b) == 0:
                        continue

                    score = self._impurity_calculation(y, y_a, y_b)

                    # keep the split if it scores higher than any before it
                    if score > top_score:
                        top_score = score
                        rule = dict(feature_idx=feature_idx,
                                    threshold=threshold)
                        subsets = dict(leftX=X_a, lefty=y_a,
                                       rightX=X_b, righty=y_b)

        # no split is good enough: the node is a leaf
        if not top_score > self.min_impurity:
            return DecisionNode(value=self._leaf_value_calculation(y))

        # build the subtrees for both sides of the chosen split
        true_branch = self._build_tree(subsets["leftX"], subsets["lefty"],
                                       current_depth + 1)
        false_branch = self._build_tree(subsets["rightX"], subsets["righty"],
                                        current_depth + 1)
        return DecisionNode(feature_idx=rule["feature_idx"],
                            threshold=rule["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)
예제 #4
0
    def _build_tree(self, X, y, current_depth=0):
        """Recursively build the decision tree for samples X and targets y.

        The method scans every unique value of every feature as a candidate
        threshold, scores each split with ``self._impurity_calculation`` and
        keeps the highest-scoring one. The node is split (and both parts are
        built recursively) only when that score exceeds
        ``self.min_impurity``; otherwise a leaf holding
        ``self._leaf_value_calculation(y)`` is returned.
        """
        # Turn a flat target vector into a column, e.g. [1, 2, 1] -> [[1], [2], [1]]
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Rows become [feature_1, ..., feature_n, target...]
        Xy = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)

        # Column selectors for the combined array
        feature_cols = slice(None, n_features)
        target_cols = slice(n_features, None)

        highest_score = 0
        best_split = None  # Feature index and threshold
        best_parts = None  # Subsets of the data

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            for feature_i in range(n_features):
                # Sorted distinct values of this feature column
                thresholds = np.unique(X[:, feature_i])

                for threshold in thresholds:
                    part_1, part_2 = divide_on_feature(Xy, feature_i, threshold)

                    # Both sides must contain samples for the split to count
                    if len(part_1) > 0 and len(part_2) > 0:
                        y_1 = part_1[:, target_cols]
                        y_2 = part_2[:, target_cols]

                        # Score of this candidate split point
                        score = self._impurity_calculation(y, y_1, y_2)

                        if score > highest_score:
                            highest_score = score
                            best_split = {
                                "feature_i": feature_i,
                                "threshold": threshold,
                            }
                            best_parts = {
                                "leftX": part_1[:, feature_cols],
                                "lefty": part_1[:, target_cols],
                                "rightX": part_2[:, feature_cols],
                                "righty": part_2[:, target_cols],
                            }

        if highest_score > self.min_impurity:
            # Build the subtrees for both parts of the best split
            child_depth = current_depth + 1
            true_branch = self._build_tree(best_parts["leftX"],
                                           best_parts["lefty"],
                                           child_depth)
            false_branch = self._build_tree(best_parts["rightX"],
                                            best_parts["righty"],
                                            child_depth)
            return DecisionNode(feature_i=best_split["feature_i"],
                                threshold=best_split["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)

        # Leaf: the node's value is derived from the targets that reached it
        leaf_value = self._leaf_value_calculation(y)
        return DecisionNode(value=leaf_value)
    def _build_tree(self, X, y, feature_bins, current_depth=0):
        """Recursively build the tree, with split scoring delegated elsewhere.

        For every threshold in ``feature_bins`` the targets of the two
        resulting partitions are handed to ``self._send_data``. The best
        impurity, the best split criteria and the leaf value are then read,
        in that order, from the module-level ``results`` queue.

        Args:
            X: 2-D sample matrix.
            y: Targets; a 1-D input is promoted to a single column.
            feature_bins: Mutable container indexed by feature number whose
                entries are the candidate thresholds for that feature. It is
                not modified; each child receives its own shallow copy.
            current_depth: Depth of the node being built (0 for the root).

        Returns:
            A ``DecisionNode``: an internal node when the received impurity
            exceeds ``self.min_impurity``, otherwise a leaf.
        """
        import copy  # local import: only needed to copy ``feature_bins``

        # Check if expansion of y is needed
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Add y as last column(s) of X
        Xy = np.concatenate((X, y), axis=1)
        _, n_features = np.shape(X)

        if current_depth <= self.max_depth:
            for feature_i in range(len(feature_bins)):
                # Candidate thresholds come from the bins, not from the data
                for threshold in feature_bins[feature_i]:
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
                    y1 = np.atleast_2d(Xy1)[:, n_features:]
                    y2 = np.atleast_2d(Xy2)[:, n_features:]
                    self._send_data(y, y1, y2)

        # NOTE(review): lock6 is acquired but lock4 is released. Presumably
        # a hand-off with whatever fills ``results`` - confirm against the
        # producer before changing this.
        lock6.acquire()
        largest_impurity = results.get()
        best_criteria = results.get()
        leaf_value = results.get()
        lock4.release()

        if largest_impurity > self.min_impurity:
            feature_i = best_criteria["feature_i"]
            threshold = best_criteria["threshold"]

            # Partition the data on the chosen split (only needed here)
            Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
            Xy1 = np.atleast_2d(Xy1)
            Xy2 = np.atleast_2d(Xy2)

            # NOTE(review): feature 1 is partitioned with ">=" and every
            # other feature with "=="; presumably feature 1 is the only
            # numeric one - confirm.
            bins = feature_bins[feature_i]
            if feature_i == 1:
                in_left = [sample >= threshold for sample in bins]
            else:
                in_left = [sample == threshold for sample in bins]
            l_bin = np.array([s for s, keep in zip(bins, in_left) if keep])
            r_bin = np.array([s for s, keep in zip(bins, in_left) if not keep])

            # Each child needs its own container: assigning through two
            # aliases of ``feature_bins`` would give both children r_bin and
            # mutate the caller's bins.
            l_bins = copy.copy(feature_bins)
            r_bins = copy.copy(feature_bins)
            l_bins[feature_i] = l_bin
            r_bins[feature_i] = r_bin

            # Build subtrees for the two partitions
            true_branch = self._build_tree(Xy1[:, :n_features],
                                           Xy1[:, n_features:], l_bins,
                                           current_depth + 1)
            false_branch = self._build_tree(Xy2[:, :n_features],
                                            Xy2[:, n_features:], r_bins,
                                            current_depth + 1)
            return DecisionNode(feature_i=feature_i,
                                threshold=threshold,
                                true_branch=true_branch,
                                false_branch=false_branch)

        # We're at leaf => use the value received from ``results``
        return DecisionNode(value=leaf_value)
    def _build_tree(self, X, y, current_depth=0):
        """Recursively split X and y into a tree of ``DecisionNode`` objects.

        All unique values of all features are tried as thresholds. The split
        with the highest ``self._impurity_calculation`` score wins, and is
        applied only if that score is greater than ``self.min_impurity``.
        """

        def grow(sub_X, sub_y):
            # Build the subtree for one partition, one level further down
            return self._build_tree(sub_X, sub_y, current_depth + 1)

        # A flat target vector becomes a single column
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Samples and targets travel together through each split
        stacked = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)

        record_score = 0
        record_rule = None  # Feature index and threshold
        record_sets = None  # Subsets of the data

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            for feature_i in range(n_features):
                for threshold in np.unique(X[:, feature_i]):
                    side_1, side_2 = divide_on_feature(stacked, feature_i, threshold)

                    # Skip splits that put every sample on one side
                    if len(side_1) == 0 or len(side_2) == 0:
                        continue

                    score = self._impurity_calculation(
                        y, side_1[:, n_features:], side_2[:, n_features:])

                    if score > record_score:
                        record_score = score
                        record_rule = dict(feature_i=feature_i, threshold=threshold)
                        record_sets = dict(
                            leftX=side_1[:, :n_features],
                            lefty=side_1[:, n_features:],
                            rightX=side_2[:, :n_features],
                            righty=side_2[:, n_features:],
                        )

        # Leaf: no split cleared the minimum impurity
        if not record_score > self.min_impurity:
            leaf_value = self._leaf_value_calculation(y)
            return DecisionNode(value=leaf_value)

        true_branch = grow(record_sets["leftX"], record_sets["lefty"])
        false_branch = grow(record_sets["rightX"], record_sets["righty"])
        return DecisionNode(feature_i=record_rule["feature_i"],
                            threshold=record_rule["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)
예제 #7
0
    def _build_tree(self, X, y, current_depth=0):
        """Recursively build the tree using ``self.criterion`` to rank splits.

        With criterion "entropy" the split with the largest score from
        ``self._impurity_calculation`` is kept; with "gini" the split with
        the smallest score is kept. Any other criterion never selects a
        split. The retained score is compared against ``self.min_impurity``
        to decide between an internal node and a leaf.
        """
        top_gain = 0
        lowest_gini = 999
        rule = None
        parts = None

        # a flat target vector becomes a single column
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # targets ride along as the trailing column(s) of the samples
        Xy = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)
        if (n_samples >= self.min_samples_split
                and current_depth <= self.max_depth):
            for feature_i in range(n_features):
                # every distinct value of the column is a candidate threshold
                for threshold in np.unique(X[:, feature_i]):
                    part_a, part_b = divide_on_feature(Xy, feature_i, threshold)

                    # a split with an empty side is not a split
                    if len(part_a) == 0 or len(part_b) == 0:
                        continue

                    y_a = part_a[:, n_features:]
                    y_b = part_b[:, n_features:]
                    score = self._impurity_calculation(y, y_a, y_b)

                    # entropy maximises the score, gini minimises it
                    if self.criterion == "entropy":
                        improved = score > top_gain
                        if improved:
                            top_gain = score
                    elif self.criterion == "gini":
                        improved = score < lowest_gini
                        if improved:
                            lowest_gini = score
                    else:
                        improved = False

                    if not improved:
                        continue

                    rule = {"feature_i": feature_i, "threshold": threshold}
                    parts = {
                        "leftX": part_a[:, :n_features],
                        "lefty": y_a,
                        "rightX": part_b[:, :n_features],
                        "righty": y_b,
                    }

        reference = lowest_gini if self.criterion == "gini" else top_gain

        # leaf when the score does not clear the bar or no split was found
        if not (reference > self.min_impurity and rule is not None):
            return DecisionNode(value=self._leaf_value_calculation(y))

        # build the subtrees for both parts of the chosen split
        true_branch = self._build_tree(parts["leftX"], parts["lefty"],
                                       current_depth + 1)
        false_branch = self._build_tree(parts["rightX"], parts["righty"],
                                        current_depth + 1)
        return DecisionNode(feature_i=rule["feature_i"],
                            threshold=rule["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)
예제 #8
0
    def _build_tree(self, X, y, current_depth=0):
        """Recursively build the decision tree.

        Every unique value of every feature is tried as a threshold. The
        split with the highest ``self._impurity_calculation`` score is kept
        and applied only if that score exceeds ``self.min_impurity``;
        otherwise a leaf holding ``self._leaf_value_calculation(y)`` is
        returned.

        Args:
            X: 2-D sample matrix.
            y: Targets; a 1-D input is promoted to a single column.
            current_depth: Depth of the node being built (0 for the root).

        Returns:
            The ``DecisionNode`` at the root of the (sub)tree.
        """
        largest_impurity = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Make a 1-D y two-dimensional, e.g. (366,) -> (366, 1)
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Join X and y into one data table
        Xy = np.concatenate((X, y), axis=1)
        n_samples, n_features = np.shape(X)

        # Choose between plain traversal and the pre-sorted (accelerated) split
        accelerate = True

        # Prepend a row-index column to the table, so feature i is at column i + 1
        if accelerate:
            Xy = np.hstack((np.arange(len(Xy)).reshape((-1, 1)), Xy))

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Iterate over the features
            for feature_i in range(n_features):
                # All distinct values of feature i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Sort the table by the current feature; numeric features only.
                # NumPy scalars are included explicitly: np.int64 / np.float32
                # are not instances of the builtin int / float, and skipping
                # them would leave sorted_Xy unbound.
                # NOTE(review): for a non-numeric feature sorted_Xy keeps the
                # previous feature's value (or is unbound on the first
                # feature) - confirm how divide_on_feature2 handles that.
                if isinstance(unique_values[0],
                              (int, float, np.integer, np.floating)):
                    sorted_Xy = np.array(sorted(Xy, key=lambda x: x[feature_i + 1]))

                # Try every distinct value of feature i as a threshold
                for threshold in unique_values:
                    # Split the data on the threshold: pre-sorted or plain traversal
                    if accelerate:
                        Xy1, Xy2 = divide_on_feature2(Xy, sorted_Xy, feature_i, threshold)
                    else:
                        Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                    if len(Xy1) > 0 and len(Xy2) > 0:  # Neither side is empty
                        # Targets of both sides, taken as the last two columns.
                        # NOTE(review): assumes y has exactly two columns - confirm.
                        y1 = Xy1[:, -2:]
                        y2 = Xy2[:, -2:]
                        impurity = self._impurity_calculation(y, y1, y2)

                        # Keep the split with the largest score seen so far
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {"feature_i": feature_i, "threshold": threshold}
                            # NOTE(review): Xy carries a leading index column when
                            # accelerate is on; whether divide_on_feature2 strips it
                            # before these slices is not visible here - confirm.
                            best_sets = {
                                "leftX": Xy1[:, :n_features],  # X of the left subtree
                                "lefty": Xy1[:, -2:],  # y of the left subtree
                                "rightX": Xy2[:, :n_features],  # X of the right subtree
                                "righty": Xy2[:, -2:]  # y of the right subtree
                            }

        if largest_impurity > self.min_impurity:  # Split only if the score clears the threshold
            # Keep splitting both subtrees; nodes store the split, not the data
            true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
            false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
            return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
                "threshold"], true_branch=true_branch, false_branch=false_branch)

        # No further split: this is a leaf, so compute its prediction value
        leaf_value = self._leaf_value_calculation(y)
        return DecisionNode(value=leaf_value)