def test_train_test_split(self):
    """Verify that train_test_split produces the expected 80/20 shuffle.

    The expected frames pin both the values and the original row indices
    produced by shuffling with random_state=42.
    """
    frame = pd.DataFrame([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    train_part, test_part = train_test_split(frame, test_size=0.2, random_state=42)
    # Expected frames carry the pre-shuffle positions as their index.
    want_train = pd.DataFrame([6, 1, 8, 3, 10, 5, 4, 7]).set_index(
        pd.Series([5, 0, 7, 2, 9, 4, 3, 6]))
    want_test = pd.DataFrame([9, 2]).set_index(pd.Series([8, 1]))
    pandas.testing.assert_frame_equal(want_train, train_part)
    pandas.testing.assert_frame_equal(want_test, test_part)
def plot_learning_curves(model, X, y):
    """Plot train/validation RMSE against the training-set size.

    :param model: estimator implementing ``fit(X, y)`` and ``predict(X)``.
    :param X: feature matrix.
    :param y: target vector aligned row-for-row with ``X``.
    """
    # Split X and y in a single call so the feature and target rows stay
    # aligned by construction.  The previous two separate calls on X and y
    # only stayed aligned by the accident of a shared random_state.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42)
    train_errors, val_errors = [], []
    # Refit on growing prefixes of the training set to trace the curves.
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.xlabel('Training set size', fontsize=14)
    plt.ylabel('RMSE', fontsize=14)
    # Errors were accumulated as MSE; take sqrt so the axis really is RMSE.
    plt.plot(np.sqrt(train_errors), 'r-', linewidth=2, label='train')
    plt.plot(np.sqrt(val_errors), 'b-', linewidth=2, label='val')
    plt.legend(loc='upper right', fontsize=14)
    plt.show()
def main(): """ :brief: The main function executes the program. """ filename = os.path.join(datasets_path, 'bank.csv') x, y, headers = read_dataset(filename, header=True, x_y=True) # we create the train = 70% dataset and the test = 30% dataset x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) adaboost_classifier = AdaBoostClassifier( base_estimator=DecisionTree(max_depth=1), n_estimators=5) adaboost_classifier.fit(x_train, y_train, headers=headers) predictions = adaboost_classifier.predict(x_test) accuracy = accuracy_score(y_test, predictions) print("Accuracy of AdaBoost Classifier: {:.2f}% ".format(accuracy * 100))
def main(): """ :brief: The main function executes the program. """ filename = os.path.join(datasets_path, 'bank.csv') x, y, headers = read_dataset(filename, header=True, x_y=True) # we create the train = 70% dataset and the test = 30% dataset x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) decision_tree = DecisionTree(max_depth=4, min_samples_split=5, min_samples_leaf=2) decision_tree.fit(x_train, y_train, headers=headers) print(decision_tree) predictions = decision_tree.predict(x_test) accuracy = accuracy_score(y_test, predictions) print("Accuracy of Decision Tree: {:.2f}% ".format(accuracy * 100))
def test_fit_predict(self):
    """Perceptron end-to-end: fit on scaled data, expect >= 80% accuracy."""
    data = pd.read_csv('learnml/linear_model/tests/test_data.csv')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    X_train = train.drop('y', axis=1).values
    y_train = train['y'].values
    X_test = test.drop('y', axis=1).values
    y_test = test['y'].values
    # Fit the scaler on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    per_clf = Perceptron()
    per_clf.fit(X_train, y_train)
    y_pred = per_clf.predict(X_test)
    # Renamed from `accuracy_score`: the old local shadowed the
    # module-level accuracy_score() function used elsewhere in the project.
    accuracy = np.mean(y_pred == y_test)
    self.assertTrue(accuracy >= 0.8)
def test_fit_predict(self):
    """Neural-network end-to-end: fit on scaled data, expect >= 80% accuracy."""
    data = pd.read_csv('learnml/neural_network/tests/test_data.csv')
    # Pass test_size by keyword for clarity and consistency with the other
    # tests (it was previously passed positionally as a bare 0.2).
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    X_train = train.drop('y', axis=1).values
    y_train = train['y'].values
    X_test = test.drop('y', axis=1).values
    y_test = test['y'].values
    # Fit the scaler on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    nn_clf = NeuralNetwork(layer_dims=np.array([2, 1]),
                           learning_rate=1, num_iterations=100)
    nn_clf.fit(X_train, y_train)
    y_pred = nn_clf.predict(X_test)
    # Renamed from `accuracy_score`: the old local shadowed the
    # module-level accuracy_score() function used elsewhere in the project.
    accuracy = np.mean(y_pred == y_test)
    self.assertTrue(accuracy >= 0.8)
def test_fit_predict(self):
    """SGD classifier with l2 and l1 penalties: expect >= 80% accuracy each."""
    data = pd.read_csv('learnml/linear_model/tests/test_data.csv')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    X_train = train.drop('y', axis=1).values
    y_train = train['y'].values
    X_test = test.drop('y', axis=1).values
    y_test = test['y'].values
    # Fit the scaler on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # Iterate the penalties directly: the enumerate() index was never used.
    for penalty in ['l2', 'l1']:
        with self.subTest(penalty=penalty):
            log_clf = SGDClassifier(penalty=penalty, max_iter=5000, eta0=1)
            log_clf.fit(X_train, y_train)
            y_pred = log_clf.predict(X_test)
            # Renamed from `accuracy_score`: the old local shadowed the
            # module-level accuracy_score() function used in the project.
            accuracy = np.mean(y_pred == y_test)
            self.assertTrue(accuracy >= 0.8)