import numpy as np


def _calculate_information_gain(self, y, y_1, y_2):
    # Calculate information gain: entropy of the parent set minus the
    # size-weighted entropies of the two subsets
    p = len(y_1) / len(y)
    entropy = calculate_entropy(y)
    info_gain = entropy - p * calculate_entropy(y_1) - \
        (1 - p) * calculate_entropy(y_2)
    return info_gain
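# _calculate_information_gain relies on a calculate_entropy helper that is
# not shown in this section. A minimal sketch of what it might look like
# (an assumption, not necessarily the repo's actual implementation):
# Shannon entropy of the discrete labels in y, measured in bits.
def calculate_entropy(y):
    # Convert label counts to probabilities, then apply H(y) = -sum(p * log2(p));
    # np.unique never returns zero counts, so log2 is always defined here
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return float(-np.sum(probabilities * np.log2(probabilities)))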
def _build_tree(self, X, y, current_depth=0):
    # Calculate the entropy by the label values
    entropy = calculate_entropy(y)

    # Save the best information gain
    highest_info_gain = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the information gain for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the information gain
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                # Sanity check: the split must neither lose nor duplicate samples
                assert np.shape(X_y)[0] == np.shape(Xy_1)[0] + np.shape(Xy_2)[0], \
                    "divide_on_feature lost or duplicated samples"

                # If one subset is empty there is no use in calculating the
                # information gain
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    # Calculate information gain
                    p = len(Xy_1) / n_samples
                    y1 = Xy_1[:, -1]
                    y2 = Xy_2[:, -1]
                    info_gain = entropy - p * calculate_entropy(y1) - \
                        (1 - p) * calculate_entropy(y2)

                    # If this threshold resulted in a higher information gain than
                    # previously recorded, save the threshold value and the
                    # feature index
                    if info_gain > highest_info_gain:
                        highest_info_gain = info_gain
                        best_criteria = {"feature_i": feature_i,
                                         "threshold": threshold}
                        # Keep a plain list; the two subsets usually differ in length
                        best_sets = [Xy_1, Xy_2]

    # If we have any information gain to go by we build the tree deeper;
    # current_depth is passed down the recursion so each branch tracks its own depth
    if current_depth < self.max_depth and highest_info_gain > self.min_gain:
        X_1, y_1 = best_sets[0][:, :-1], best_sets[0][:, -1]
        X_2, y_2 = best_sets[1][:, :-1], best_sets[1][:, -1]
        true_branch = self._build_tree(X_1, y_1, current_depth + 1)
        false_branch = self._build_tree(X_2, y_2, current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # There's no recorded information gain (or the maximum depth was reached),
    # so we are at a leaf: label it with the most common class in y
    most_common = None
    max_count = 0
    for label in np.unique(y):
        count = len(y[y == label])
        if count > max_count:
            most_common = label
            max_count = count
    return DecisionNode(label=most_common)
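# _build_tree also relies on a divide_on_feature helper that is not shown
# here. A plausible minimal sketch (an assumption about its contract, not the
# repo's actual code): split the combined [X | y] matrix on column feature_i,
# sending samples with value >= threshold (== for categorical values) to the
# first subset and the rest to the second.
def divide_on_feature(X, feature_i, threshold):
    if isinstance(threshold, (int, float, np.number)):
        split_func = lambda sample: sample[feature_i] >= threshold
    else:
        split_func = lambda sample: sample[feature_i] == threshold
    X_1 = np.array([sample for sample in X if split_func(sample)])
    X_2 = np.array([sample for sample in X if not split_func(sample)])
    return X_1, X_2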
def _build_tree(self, X, y, current_depth=0):
    # Calculate the entropy by the label values
    entropy = calculate_entropy(y)

    highest_info_gain = 0
    best_criteria = None    # Feature index and threshold
    best_sets = None        # Subsets of the data

    # Add y as last column of X
    X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

    n_samples, n_features = np.shape(X)

    if n_samples >= self.min_samples_split:
        # Calculate the information gain for each feature
        for feature_i in range(n_features):
            # All values of feature_i
            feature_values = np.expand_dims(X[:, feature_i], axis=1)
            unique_values = np.unique(feature_values)

            # Iterate through all unique values of feature column i and
            # calculate the information gain
            for threshold in unique_values:
                Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                # If one subset is empty there is no use in calculating the
                # information gain
                if len(Xy_1) > 0 and len(Xy_2) > 0:
                    # Calculate information gain
                    p = len(Xy_1) / n_samples
                    y1 = Xy_1[:, -1]
                    y2 = Xy_2[:, -1]
                    info_gain = entropy - p * calculate_entropy(y1) - \
                        (1 - p) * calculate_entropy(y2)

                    # If this threshold resulted in a higher information gain than
                    # previously recorded, save the threshold value and the
                    # feature index
                    if info_gain > highest_info_gain:
                        highest_info_gain = info_gain
                        best_criteria = {"feature_i": feature_i,
                                         "threshold": threshold}
                        best_sets = {"left_branch": Xy_1,
                                     "right_branch": Xy_2}

    # If we have any information gain to go by we build the tree deeper;
    # current_depth is passed down the recursion so each branch tracks its own depth
    if current_depth < self.max_depth and highest_info_gain > self.min_gain:
        # X: all columns but the last, y: the last column
        leftX, leftY = best_sets["left_branch"][:, :-1], \
            best_sets["left_branch"][:, -1]
        rightX, rightY = best_sets["right_branch"][:, :-1], \
            best_sets["right_branch"][:, -1]
        true_branch = self._build_tree(leftX, leftY, current_depth + 1)
        false_branch = self._build_tree(rightX, rightY, current_depth + 1)
        return DecisionNode(feature_i=best_criteria["feature_i"],
                            threshold=best_criteria["threshold"],
                            true_branch=true_branch,
                            false_branch=false_branch)

    # There's no recorded information gain (or the maximum depth was reached),
    # so we are at a leaf: label it with the most common class in y
    most_common = None
    max_count = 0
    for label in np.unique(y):
        count = len(y[y == label])
        if count > max_count:
            most_common = label
            max_count = count
    return DecisionNode(label=most_common)
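# Both versions of _build_tree return DecisionNode objects. A minimal sketch
# of such a container (an assumption; the real class may differ): internal
# nodes store the split test and the two subtrees, leaves store only a label.
class DecisionNode():
    def __init__(self, feature_i=None, threshold=None, label=None,
                 true_branch=None, false_branch=None):
        self.feature_i = feature_i        # Index of the feature that is tested
        self.threshold = threshold        # Threshold value for the test
        self.label = label                # Class label if the node is a leaf
        self.true_branch = true_branch    # Subtree for samples passing the test
        self.false_branch = false_branch  # Subtree for samples failing the test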
def _calculate_information_gain(self, y, y1, y2):
    entropy = calculate_entropy(y)
    p = len(y1) / len(y)
    info_gain = entropy - p * calculate_entropy(y1) - \
        (1 - p) * calculate_entropy(y2)
    return info_gain
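# A small worked example of the gain formula, using the calculate_entropy
# sketch above and made-up labels: a perfectly pure split of a balanced
# binary parent should recover the full parent entropy of one bit.
y = np.array([0, 0, 1, 1])           # H(y) = 1 bit
y1, y2 = y[:2], y[2:]                # pure subsets, H(y1) = H(y2) = 0
p = len(y1) / len(y)                 # weight of the first subset, 0.5
gain = calculate_entropy(y) - p * calculate_entropy(y1) - \
    (1 - p) * calculate_entropy(y2)
print(gain)                          # 1.0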