Example #1
    def _calculate_information_gain(self, y, y_1, y_2):
        # Information gain = entropy of the parent labels minus the
        # weighted entropy of the two child subsets
        p = len(y_1) / len(y)
        entropy = calculate_entropy(y)
        info_gain = entropy - p * calculate_entropy(y_1) - \
            (1 - p) * calculate_entropy(y_2)

        return info_gain
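
All three examples rely on a calculate_entropy helper that is not shown here. As a point of reference, a minimal sketch of what it is assumed to compute (the Shannon entropy of a label array, in bits) could look like this:

import numpy as np

def calculate_entropy(y):
    # Shannon entropy: -sum(p_i * log2(p_i)) over the unique labels in y
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))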
Example #2
	def _build_tree(self, X, y):
		# Calculate the entropy by the label values
		entropy = calculate_entropy(y)

		# Save the best information gain
		highest_info_gain = 0
		best_criteria = None	# Feature index and threshold
		best_sets = None		# Subsets of the data

		# Add y as last column of X
		X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

		n_samples, n_features = np.shape(X)

		if n_samples >= self.min_samples_split:
			# Calculate the information gain for each feature
			for feature_i in range(n_features):
				# All values of feature_i
				feature_values = np.expand_dims(X[:, feature_i], axis=1)
				unique_values = np.unique(feature_values)

				# Iterate through all unique values of feature column i and
				# calculate the information gain
				for threshold in unique_values:

					Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
					# Sanity check: the two subsets should account for every sample
					if np.shape(X_y)[0] != np.shape(Xy_1)[0] + np.shape(Xy_2)[0]:
						print("Split dropped samples: subset sizes do not add up")
						sys.exit(0)

					# If either subset is empty there is no point in calculating the information gain
					if len(Xy_1) > 0 and len(Xy_2) > 0:
						# Calculate information gain
						p = len(Xy_1) / n_samples
						y1 = Xy_1[:,-1]
						y2 = Xy_2[:,-1]
						info_gain = entropy - p * calculate_entropy(y1) - (1 - p) * calculate_entropy(y2)

						# If this threshold resulted in a higher information gain than previously
						# recorded save the threshold value and the feature index
						if info_gain > highest_info_gain:
							highest_info_gain = info_gain
							best_criteria = {"feature_i": feature_i, "threshold": threshold}
							best_sets = np.array([Xy_1, Xy_2])

		# If the best split is informative enough and max depth has not been reached, grow the tree deeper
		if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
			X_1, y_1 = best_sets[0][:, :-1], best_sets[0][:, -1]
			X_2, y_2 = best_sets[1][:, :-1], best_sets[1][:, -1]
			# Track the recursion depth while the two branches are built
			self.current_depth += 1
			true_branch = self._build_tree(X_1, y_1)
			false_branch = self._build_tree(X_2, y_2)
			self.current_depth -= 1
			return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria["threshold"],
				true_branch=true_branch, false_branch=false_branch)
		# There's no recorded information gain so we are at a leaf
		most_common = None
		max_count = 0
		for label in np.unique(y):
			count = len(y[y == label])
			if count > max_count:
				most_common = label
				max_count = count
		return DecisionNode(label=most_common)
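
The divide_on_feature helper used above is also not shown. Assuming it splits the rows of X_y on column feature_i, comparing numeric features against >= threshold and other features on equality, a sketch could look like this:

import numbers
import numpy as np

def divide_on_feature(X, feature_i, threshold):
    # Numeric thresholds split on >= threshold, anything else on equality
    if isinstance(threshold, numbers.Number):
        mask = X[:, feature_i] >= threshold
    else:
        mask = X[:, feature_i] == threshold
    return X[mask], X[~mask]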
Example #3
    def _build_tree(self, X, y):
        # Calculate the entropy by the label values
        entropy = calculate_entropy(y)

        highest_info_gain = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the information gain for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the information gain
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    # If either subset is empty there is no point in calculating the information gain
                    if len(Xy_1) > 0 and len(Xy_2) > 0:
                        # Calculate information gain
                        p = len(Xy_1) / n_samples
                        y1 = Xy_1[:, -1]
                        y2 = Xy_2[:, -1]
                        info_gain = entropy - p * calculate_entropy(y1) - (
                            1 - p) * calculate_entropy(y2)

                        # If this threshold resulted in a higher information gain than previously
                        # recorded save the threshold value and the feature index
                        if info_gain > highest_info_gain:
                            highest_info_gain = info_gain
                            best_criteria = {
                                "feature_i": feature_i,
                                "threshold": threshold
                            }
                            best_sets = {
                                "left_branch": Xy_1,
                                "right_branch": Xy_2
                            }

        # If the best split is informative enough and max depth has not been reached, grow the tree deeper
        if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
            # X is every column but the last, y is the last column
            leftX = best_sets["left_branch"][:, :-1]
            leftY = best_sets["left_branch"][:, -1]
            rightX = best_sets["right_branch"][:, :-1]
            rightY = best_sets["right_branch"][:, -1]
            # Track the recursion depth while the two branches are built
            self.current_depth += 1
            true_branch = self._build_tree(leftX, leftY)
            false_branch = self._build_tree(rightX, rightY)
            self.current_depth -= 1
            return DecisionNode(feature_i=best_criteria["feature_i"],
                                threshold=best_criteria["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)
        # There's no recorded information gain so we are at a leaf
        most_common = None
        max_count = 0
        for label in np.unique(y):
            count = len(y[y == label])
            if count > max_count:
                most_common = label
                max_count = count
        return DecisionNode(label=most_common)

    def _calculate_information_gain(self, y, y1, y2):
        # Information gain = parent entropy minus the weighted entropy of the subsets
        entropy = calculate_entropy(y)
        p = len(y1) / len(y)
        info_gain = entropy - p * calculate_entropy(y1) - (1 - p) * calculate_entropy(y2)

        return info_gain
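
To make the information gain formula concrete, here is a small hand-checkable example built on the calculate_entropy sketch above; a perfect split of four binary labels removes all uncertainty, so the gain equals the parent entropy of 1 bit:

import numpy as np

y = np.array([0, 0, 1, 1])      # parent labels, entropy = 1.0
y1 = np.array([0, 0])           # left subset, entropy = 0.0
y2 = np.array([1, 1])           # right subset, entropy = 0.0
p = len(y1) / len(y)            # weight of the left subset = 0.5
gain = calculate_entropy(y) - p * calculate_entropy(y1) - (1 - p) * calculate_entropy(y2)
print(gain)                     # 1.0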