import random

import numpy as np


def get_split(self, X, y):
    """Find the best split for a node (the split with the lowest gini
    impurity) using a random subset of features.

    Args:
        X (ndarray): training set without class labels.
        y (ndarray): class labels for training set.

    Returns:
        current_gini (float): gini impurity from the best split.
        split_index (int): column index of the feature split on.
    """
    # If there are fewer than two observations at the node, don't split it.
    if y.size <= 1:
        return None, None

    # Current node impurity prior to splitting.
    current_gini = _gini(y, self.n_classes)

    # Sample feature indices without replacement to find the best split.
    random_features = random.sample(range(self.n_features), k=self.m)

    # Iterate through the sampled features and calculate the gini impurity
    # of the resulting split.
    split_index = None
    for index in random_features:
        feature = X[:, index]

        # Observations go to the left child if the feature value is 1,
        # otherwise to the right child.
        left_idx = (feature == 1)
        left_y = y[left_idx]
        right_y = y[~left_idx]

        # If a child is empty, its gini index is 0.
        gini_left = _gini(left_y, self.n_classes) if len(left_y) > 0 else 0
        gini_right = _gini(right_y, self.n_classes) if len(right_y) > 0 else 0

        # Proportion of observations sent to each child.
        prob_left = len(left_y) / len(y)
        prob_right = len(right_y) / len(y)

        # The gini index of the split is the weighted average of the
        # children's gini indices.
        split_gini = prob_left * gini_left + prob_right * gini_right

        # Keep the split if it improves on the current gini value.
        if split_gini < current_gini:
            current_gini = split_gini
            split_index = index

    return current_gini, split_index
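
# _gini is called above but not defined in this section. A minimal sketch,
# assuming labels are integers 0..n_classes-1 and that the optional weights
# argument (used by the AdaBoost variants below) defaults to uniform:
# gini = 1 - sum_k p_k^2, where p_k is the (weighted) fraction of class k.
def _gini(y, n_classes, weights=None):
    if weights is None:
        weights = np.ones(y.size)
    total = np.sum(weights)
    return 1.0 - sum((np.sum(weights * (y == k)) / total) ** 2
                     for k in range(n_classes))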
def fit_tree(self, X, y, depth=0):
    """Fit a decision tree by recursively splitting nodes.

    Args:
        X (ndarray): training set without class labels.
        y (ndarray): class labels for training set.
        depth (int): starting depth of the decision tree.

    Returns:
        tree (Node): root node of the learned decision tree.
    """
    # Count the training observations in the current node with each class label.
    class_distribution = [np.sum(y == i) for i in range(self.n_classes)]

    # Instantiate a node to grow the decision tree.
    tree = Node(n=y.size,
                class_distribution=class_distribution,
                gini_index=_gini(y, self.n_classes))

    # Perform recursive splitting up to the maximum depth.
    if depth < self.max_depth:
        gini_index, split_index = self.get_split(X, y)

        # Send observations with feature value 1 to the left child and the
        # rest to the right child.
        if split_index is not None:
            index_left = (X[:, split_index] == 1)
            X_left, y_left = X[index_left], y[index_left]
            X_right, y_right = X[~index_left], y[~index_left]
            tree.gini_index = gini_index
            tree.feature_index = split_index
            depth += 1
            tree.left = self.fit_tree(X_left, y_left, depth=depth)
            tree.right = self.fit_tree(X_right, y_right, depth=depth)

    return tree
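
# Node is instantiated above but not defined in this section. A minimal
# sketch consistent with the attributes the methods set and read; the real
# class in the repo may carry more state (e.g. a predicted class per leaf).
class Node:
    def __init__(self, n, class_distribution, gini_index):
        self.n = n                                    # observations at this node
        self.class_distribution = class_distribution  # count (or weight) per class
        self.gini_index = gini_index                  # impurity at this node
        self.feature_index = None                     # feature split on, if any
        self.left = None                              # child for feature value 1
        self.right = None                             # child for feature value 0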
def fit_tree(self, X, y, weights, depth=0):
    """Fit a decision tree by recursively splitting nodes; takes an
    additional weight argument for AdaBoost.

    Args:
        X (ndarray): training set without class labels.
        y (ndarray): class labels for training set.
        weights (ndarray): weights for each training instance.
        depth (int): starting depth of the decision tree.

    Returns:
        tree (Node): root node of the learned decision tree.
    """
    # Sum the weights of each class for the class distribution at the current node.
    D = weights
    class_weights = [np.sum(D * (y == i)) for i in range(self.n_classes)]

    # Instantiate a node to grow the decision tree.
    tree = Node(n=y.size,
                class_distribution=class_weights,
                gini_index=_gini(y, self.n_classes, weights=D))

    # Perform recursive splitting up to the maximum depth.
    if depth < self.max_depth:
        gini_index, split_index = self.get_split(X, y, weights=D)

        # Send observations, class labels, and weights with feature value 1
        # to the left child and the rest to the right child.
        if split_index is not None:
            index_left = (X[:, split_index] == 1)
            X_left, y_left, D_left = X[index_left], y[index_left], D[index_left]
            X_right, y_right, D_right = X[~index_left], y[~index_left], D[~index_left]
            tree.gini_index = gini_index
            tree.feature_index = split_index
            depth += 1
            tree.left = self.fit_tree(X_left, y_left, weights=D_left, depth=depth)
            tree.right = self.fit_tree(X_right, y_right, weights=D_right, depth=depth)

    return tree
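
# Toy check of the weighted class distribution computed at each node: with
# AdaBoost-style weights, the distribution follows the weight mass per class
# rather than the raw counts. The arrays below are made up for illustration.
# >>> y = np.array([0, 0, 1, 1])
# >>> D = np.array([0.1, 0.1, 0.4, 0.4])   # class 1 has been upweighted
# >>> [np.sum(D * (y == i)) for i in range(2)]
# [0.2, 0.8]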
def get_split(self, X, y, weights):
    """Find the best split for a node (the split with the lowest gini
    impurity); takes an additional weight argument for AdaBoost.

    Args:
        X (ndarray): training set without class labels.
        y (ndarray): class labels for training set.
        weights (ndarray): weights for each training instance.

    Returns:
        current_gini (float): gini impurity from the best split.
        split_index (int): column index of the feature split on.
    """
    # If there are fewer than two instances at the node, don't split it.
    if y.size <= 1:
        return None, None

    # Sum the weights of each class for the class distribution at the current node.
    D = weights
    class_weights = [np.sum(D * (y == i)) for i in range(self.n_classes)]

    # Current node impurity prior to splitting.
    current_gini = _gini(y, self.n_classes, weights=D)

    # Iterate through all features and calculate the gini impurity of the
    # resulting split.
    split_index = None
    for index in range(self.n_features):
        feature = X[:, index]

        # Observations go to the left child if the feature value is 1,
        # otherwise to the right child.
        left_idx = (feature == 1)
        left_y, left_D = y[left_idx], D[left_idx]
        right_y, right_D = y[~left_idx], D[~left_idx]

        # If a child is empty, its class weights and gini index are 0.
        if len(left_y) == 0:
            left_class_weights = [0] * self.n_classes
            gini_left = 0
        else:
            left_class_weights = [np.sum(left_D * (left_y == i))
                                  for i in range(self.n_classes)]
            gini_left = _gini(left_y, self.n_classes, weights=left_D)

        if len(right_y) == 0:
            right_class_weights = [0] * self.n_classes
            gini_right = 0
        else:
            right_class_weights = [np.sum(right_D * (right_y == i))
                                   for i in range(self.n_classes)]
            gini_right = _gini(right_y, self.n_classes, weights=right_D)

        # Share of the total weight sent to each child.
        prob_left = sum(left_class_weights) / sum(class_weights)
        prob_right = sum(right_class_weights) / sum(class_weights)

        # The gini index of the split is the weighted average of the
        # children's gini indices.
        split_gini = prob_left * gini_left + prob_right * gini_right

        # Keep the split if it improves on the current gini value.
        if split_gini < current_gini:
            current_gini = split_gini
            split_index = index

    return current_gini, split_index
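
# Worked example of the weighted split gini above, using made-up numbers.
# Four instances with weights summing to 1, and a feature that sends
# instances 0, 1, and 3 left (value 1) and instance 2 right (value 0):
# >>> y = np.array([1, 1, 0, 0])
# >>> D = np.array([0.1, 0.4, 0.1, 0.4])
# >>> left = np.array([1, 1, 0, 1]) == 1
# left child:  weight 0.9, gini = 1 - (0.4/0.9)^2 - (0.5/0.9)^2 ~= 0.4938
# right child: weight 0.1, gini = 0 (pure node)
# split_gini = 0.9 * 0.4938 + 0.1 * 0 ~= 0.4444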