Example #1
    def get_split(self, X, y):
        """Find the best split for a node (the split that has the lowest gini impurity) with random subset of features.

        Args:
            X (ndarray): training set without class labels.
            y (ndarray) : class labels for training set.
        Returns:
            current_gini (float): gini impurity from best split.
            best_index (int): integer for column index for feature split on.
        """
        # Don't split a node with fewer than two observations.
        if y.size <= 1:
            return None, None

        # Current node impurity prior to splitting.
        current_gini = _gini(y, self.n_classes)

        # Sample m feature indices without replacement; only these are considered for the split.
        random_features = random.sample(range(self.n_features), k=self.m)

        # Iterate through the sampled features and calculate the Gini impurity of the resulting split.
        split_index = None
        for index in random_features:
            feature = X[:, index]

            # Observations go to the left child if the feature value is 1, otherwise to the right child.
            left_idx = (feature == 1)
            left_y = y[left_idx]
            right_y = y[~left_idx]

            # If no observations in left child, set gini index to 0.
            if len(left_y) == 0:
                gini_left = 0
            else:
                gini_left = _gini(left_y, self.n_classes)

            # If no observations in right child, set gini index to 0.
            if len(right_y) == 0:
                gini_right = 0
            else:
                gini_right = _gini(right_y, self.n_classes)

            # Calculate the fraction of observations going to each child.
            prob_left = len(left_y) / len(y)
            prob_right = len(right_y) / len(y)

            # The Gini impurity of the split is the weighted average of the children's impurities.
            split_gini = prob_left * gini_left + prob_right * gini_right

            # If the split lowers the impurity, record it along with the feature index.
            if split_gini < current_gini:
                current_gini = split_gini
                split_index = index
        return current_gini, split_index
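
Both `get_split` variants call a `_gini` helper that these examples do not show. A minimal sketch of what it could look like, assuming the signature used above (the optional `weights` argument is only exercised by the AdaBoost versions):

import numpy as np

def _gini(y, n_classes, weights=None):
    """Gini impurity of a node's labels, optionally weighted (sketch of the assumed helper)."""
    if weights is None:
        weights = np.ones(y.size)
    total = np.sum(weights)
    # Impurity is 1 minus the sum of squared (weighted) class proportions.
    return 1.0 - sum((np.sum(weights * (y == c)) / total) ** 2
                     for c in range(n_classes))
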
Example #2
    def fit_tree(self, X, y, depth=0):
        """Fit a decision tree with recursive splitting on nodes.

        Args:
            X (ndarray): training set without class labels.
            y (ndarray): class labels for training set.
            depth (int): current depth of the node in the recursion (0 for the root).
        Returns:
            tree (Node): root node of learned decision tree.
        """
        # Count the training observations in the current node for each class label.
        class_distribution = [np.sum(y == i) for i in range(self.n_classes)]

        # Instantiate node to grow the decision tree.
        tree = Node(n=y.size,
                    class_distribution=class_distribution,
                    gini_index=_gini(y, self.n_classes))

        # Perform recursive splitting to max depth.
        if depth < self.max_depth:
            gini_index, split_index = self.get_split(X, y)
            # If a beneficial split was found, send data and class labels with feature value 1 to the left child and the rest to the right child.
            if split_index is not None:
                index_left = (X[:, split_index] == 1)
                X_left, y_left = X[index_left], y[index_left]
                X_right, y_right = X[~index_left], y[~index_left]
                tree.gini_index = gini_index
                tree.feature_index = split_index
                tree.left = self.fit_tree(X_left, y_left, depth=depth + 1)
                tree.right = self.fit_tree(X_right, y_right, depth=depth + 1)
        return tree
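
`fit_tree` also relies on a `Node` container that is not shown. A sketch consistent with the attributes used above; the field names are inferred from these snippets rather than taken from the original source:

class Node:
    """Decision tree node (sketch inferred from the usage above)."""

    def __init__(self, n, class_distribution, gini_index):
        self.n = n                                    # observations at this node
        self.class_distribution = class_distribution  # per-class counts (or weight sums)
        self.gini_index = gini_index                  # impurity at this node
        self.feature_index = None                     # feature split on, if any
        self.left = None                              # child for feature value 1
        self.right = None                             # child for feature value 0
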
Example #3
    def fit_tree(self, X, y, weights, depth=0):
        """Fit a decision tree with recursive splitting on nodes, takes additional weight argument for AdaBoost.

        Args:
            X (ndarray): training set without class labels.
            y (ndarray): class labels for training set.
            weights (ndarray): weights for each training instance.
            depth (int): current depth of the node in the recursion (0 for the root).
        Returns:
            tree (Node): root node of learned decision tree.
        """
        # Sum the instance weights within each class to form the node's class distribution.
        D = weights
        class_weights = [np.sum(D * (y == i)) for i in range(self.n_classes)]

        # Instantiate node to grow the decision tree.
        tree = Node(n=y.size,
                    class_distribution=class_weights,
                    gini_index=_gini(y, self.n_classes, weights=D))

        # Perform recursive splitting to max depth.
        if depth < self.max_depth:
            gini_index, split_index = self.get_split(X, y, weights=D)
            # If a beneficial split was found, send data, class labels, and weights with feature value 1 to the left child and the rest to the right child.
            if split_index is not None:
                index_left = (X[:, split_index] == 1)
                X_left, y_left, D_left = X[index_left], y[index_left], D[index_left]
                X_right, y_right, D_right = X[~index_left], y[~index_left], D[~index_left]
                tree.gini_index = gini_index
                tree.feature_index = split_index
                tree.left = self.fit_tree(X_left, y_left, weights=D_left, depth=depth + 1)
                tree.right = self.fit_tree(X_right, y_right, weights=D_right, depth=depth + 1)
        return tree
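
For context, this weighted `fit_tree` is what each boosting round would call. A rough sketch of the surrounding AdaBoost loop, assuming class labels in {0, 1} and a hypothetical `predict` helper that these examples do not include:

import numpy as np

def adaboost_sketch(model, X, y, n_rounds):
    """Hypothetical boosting driver for the weighted trees above."""
    D = np.full(y.size, 1.0 / y.size)  # start with uniform instance weights
    trees, alphas = [], []
    for _ in range(n_rounds):
        tree = model.fit_tree(X, y, weights=D)
        preds = predict(tree, X)                    # assumed helper, returns labels in {0, 1}
        err = np.sum(D * (preds != y)) / np.sum(D)  # weighted training error
        alpha = 0.5 * np.log((1 - err) / err)       # this learner's vote weight
        # Map {0, 1} labels to {-1, +1} so agreement is +1 and disagreement is -1.
        agreement = (2 * y - 1) * (2 * preds - 1)
        D *= np.exp(-alpha * agreement)             # up-weight misclassified instances
        D /= np.sum(D)                              # renormalize to a distribution
        trees.append(tree)
        alphas.append(alpha)
    return trees, alphas
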
Example #4
    def get_split(self, X, y, weights):
        """Find the best split for a node (the split that has the lowest gini impurity), takes additional weight
        argument for AdaBoost.

        Args:
            X (ndarray): training set without class labels.
            y (ndarray): class labels for training set.
            weights (ndarray): weights for each training instance.
        Returns:
            current_gini (float): gini impurity from best split.
            best_index (int): integer for column index for feature split on.
        """
        # Don't split a node with fewer than two instances.
        if y.size <= 1:
            return None, None

        # Sum the instance weights within each class to form the node's class distribution.
        D = weights
        class_weights = [np.sum(D * (y == i)) for i in range(self.n_classes)]

        # Current node impurity prior to splitting.
        current_gini = _gini(y, self.n_classes, weights=D)

        # Iterate through all features and calculate the Gini impurity of the resulting split.
        split_index = None
        for index in range(self.n_features):
            feature = X[:, index]

            # Instances go to the left child if the feature value is 1, otherwise to the right child.
            left_idx = (feature == 1)
            left_y = y[left_idx]
            left_D = D[left_idx]
            right_y = y[~left_idx]
            right_D = D[~left_idx]

            # If no observations in left child, set gini index to 0.
            if len(left_y) == 0:
                left_class_weights = [0] * self.n_classes
                gini_left = 0
            else:
                left_class_weights = [
                    np.sum(left_D * (left_y == i))
                    for i in range(self.n_classes)
                ]
                gini_left = _gini(left_y, self.n_classes, weights=left_D)

            # If no observations in right child, set gini index to 0.
            if len(right_y) == 0:
                right_class_weights = [0] * self.n_classes
                gini_right = 0
            else:
                right_class_weights = [
                    np.sum(right_D * (right_y == i))
                    for i in range(self.n_classes)
                ]
                gini_right = _gini(right_y, self.n_classes, weights=right_D)

            # Calculate the fraction of the node's total weight going to each child.
            prob_left = sum(left_class_weights) / sum(class_weights)
            prob_right = sum(right_class_weights) / sum(class_weights)

            # The Gini impurity of the split is the weighted average of the children's impurities.
            split_gini = prob_left * gini_left + prob_right * gini_right

            # If the split lowers the impurity, record it along with the feature index.
            if split_gini < current_gini:
                current_gini = split_gini
                split_index = index
        return current_gini, split_index
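
To make the weighted average concrete, a small worked example with illustrative numbers (using the `_gini` sketch from above): four instances, one binary feature, and AdaBoost weights that sum to 1.

import numpy as np

y = np.array([0, 0, 1, 1])
D = np.array([0.1, 0.2, 0.3, 0.4])  # instance weights
feature = np.array([1, 1, 1, 0])    # instances with value 1 go left

left = (feature == 1)
w_left, w_right = D[left].sum(), D[~left].sum()         # 0.6 and 0.4
gini_left = _gini(y[left], 2, weights=D[left])          # 1 - (0.5**2 + 0.5**2) = 0.5
gini_right = _gini(y[~left], 2, weights=D[~left])       # pure node -> 0.0
split_gini = w_left * gini_left + w_right * gini_right  # 0.6 * 0.5 + 0.4 * 0.0 = 0.3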