import numpy as np
import pytest
from sklearn import datasets

# DecisionTree and TreeNode are the classes under test; their import path
# depends on the project layout.


def test_decision_tree_predict():
    dataset = datasets.load_iris()
    x = dataset.data
    y = dataset.target
    clf = DecisionTree()
    clf.fit(x, y)
    y_pred = clf.predict(x)
    # An unrestricted tree should memorize the training set exactly.
    assert np.array_equal(y, y_pred)
def test_decision_tree_fit():
    tree = DecisionTree(3)
    x = np.array([[1, 2], [2, 3], [4, 5]])
    y = np.array([0, 1, 1])
    tree.fit(x, y)
    # The root splits on feature 1 at threshold 3, yielding two pure leaves.
    assert tree.root.left.predict == 0
    assert tree.root.right.predict == 1
    assert tree.root.left.left is None
    assert tree.root.left.right is None
    assert tree.root.right.left is None
    assert tree.root.right.right is None
    assert tree.root.max_depth == 3
    assert tree.root.left.max_depth == 2
    assert tree.root.right.max_depth == 2
    assert tree.root.feature == 1
    assert tree.root.threshold == 3
def test_decision_tree_init():
    tree = DecisionTree(3)
    assert type(tree.root) is TreeNode
    assert tree.root.max_depth == 3
    assert tree.root.feature is None
    assert tree.root.threshold is None
    assert tree.root.predict is None
    assert tree.root.left is None
    assert tree.root.right is None
def fit(self, X, y):
    """
    Fit the Random Forest model on the training data.

    X is a matrix or 2-D numpy array of training instances, one feature
    vector per row; y contains the corresponding labels.
    """
    for i in range(self.n_trees):
        # Reseed so each tree draws different random features/splits.
        np.random.seed()
        temp_clf = DecisionTree(
            max_depth=self.max_depth,
            size_allowed=self.size_allowed,
            n_features=self.n_features,
            n_split=self.n_split,
        )
        temp_clf.fit(X, y)
        self.trees.append(temp_clf)
    return self
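# A minimal usage sketch for fit above. The constructor signature is
# assumed from the attributes fit references (n_trees, max_depth,
# size_allowed, n_features, n_split); the actual class may differ.
#
# forest = RandomForest(n_trees=10, max_depth=3, size_allowed=1,
#                       n_features=2, n_split=10)
# forest.fit(X_train, y_train)  # returns self, so calls can be chained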
def test_gradient_boosting_predict():
    clf = GradientBoosting(learning_rate=0.1, n_estimators=20, max_depth=5)
    dataset = datasets.load_iris()
    x = dataset.data
    y = dataset.target
    clf.fit(x, y)
    assert len(clf.predict(x)) == len(y)
    assert clf.learning_rate == 0.1
    assert clf.n_estimators == 20
    assert clf.max_depth == 5
    assert len(clf.trees) == clf.n_estimators

    # With a single, effectively unbounded estimator, boosting should
    # reduce to a plain decision tree (in this project's API, fit returns
    # the training predictions).
    clf = GradientBoosting(learning_rate=0.1, n_estimators=1, max_depth=5000)
    pred_gb = clf.fit(x, y)
    tree = DecisionTree()
    pred_t = tree.fit(x, y)
    assert np.array_equal(pred_gb, pred_t)
def main():
    data_path = 'gielda.txt'
    decision_tree = DecisionTree(data_path, separator=',')
    decision_tree.create_decision_table()
    info_a1 = decision_tree.calculate_info_for_selected_column(0)
    gain_a2 = decision_tree.calculate_gain_for_selected_column(1)
    print('Info a1 = {}\nGain a2 = {}'.format(info_a1, gain_a2))
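# For context, Info and Gain here presumably follow the standard ID3
# definitions (an assumption; the DecisionTree internals are not shown):
#
#     Info(A) = sum over values v of A of (|S_v| / |S|) * Entropy(S_v)
#     Gain(A) = Entropy(S) - Info(A)
#
# i.e. calculate_gain_for_selected_column measures how much splitting on
# that column reduces the entropy of the decision table.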
def train(self, training_set, learning_rate, learn_threshold,
          gradient_step_learning_rate, classifier_train_kwargs,
          classifier_init_args, classifier_init_kwargs,
          loss_function_derivative,  # d(L(ŷ, y)) / d(ŷ)
          iteration_count, max_descent_iterations=500000, verbose=False):
    # The zeroth "classifier" is the constant model: the mean of the targets.
    self.classifiers = [None]
    self.coefficients = np.zeros(1)
    self.coefficients[0] = training_set.y.mean()
    self.previous_model_leaves_count = 1
    for iteration in range(1, iteration_count + 1):
        if verbose:
            print("Started iteration " + str(iteration))
        self.classifiers.append(
            DecisionTree(*classifier_init_args, **classifier_init_kwargs))
        # Each new tree is fitted to the pseudo-residuals of the current ensemble.
        current_training_set = self.get_pseudo_residuals(
            loss_function_derivative, training_set)
        self.classifiers[-1].train(current_training_set, **classifier_train_kwargs)
        new_leaves_count = (
            self.classifiers[-1].root.set_leaf_index(self.previous_model_leaves_count)
            - self.previous_model_leaves_count)
        self.coefficients = np.hstack((self.coefficients, np.zeros(new_leaves_count)))
        if verbose:
            print("Tree is trained, calculating coefficient...")
        # Per-leaf coefficients are then found by gradient descent on the loss.
        self.coefficient_gradient_descent(
            training_set, iteration, loss_function_derivative, learning_rate,
            gradient_step_learning_rate, learn_threshold, max_descent_iterations)
        self.previous_model_leaves_count += new_leaves_count
        if verbose:
            print("")
    if verbose:
        print("Finished training")
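# A minimal sketch of a loss_function_derivative argument for train above,
# assuming it is applied elementwise to predictions and targets (the exact
# calling convention of get_pseudo_residuals is not shown here). For
# squared-error loss L(ŷ, y) = (ŷ - y)² / 2 the derivative d(L)/d(ŷ) is the
# residual ŷ - y, so the pseudo-residuals the trees fit are -(ŷ - y).
def squared_loss_derivative(y_pred, y_true):
    # d(L(ŷ, y)) / d(ŷ) for L = (ŷ - y)² / 2
    return y_pred - y_true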
def test_decision_tree_inf_criteria():
    tree = DecisionTree(3)
    result = tree._inf_criteria(np.array([1, 1, 1, 1, 3, 11, 2]))
    assert result == pytest.approx(11.551, 0.001)
    assert tree._inf_criteria(np.array([])) == 0
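# For reference: 11.551 is the population variance of the sample
# (mean = 20/7 ≈ 2.857, variance = 3962/343 ≈ 11.551), which suggests
# _inf_criteria uses variance as its impurity measure:
#
#     >>> np.var(np.array([1, 1, 1, 1, 3, 11, 2]))  # ≈ 11.551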