def test_pandas_input(self):
    # check that grid search doesn't destroy pandas dataframe/series types
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((DataFrame, Series))
    except ImportError:
        pass

    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    for InputFeatureType, TargetType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        clf = CheckingClassifier(
            check_X=lambda x: isinstance(x, InputFeatureType),
            check_y=lambda x: isinstance(x, TargetType),
        )

        grid_search = TuneGridSearchCV(clf, {"foo_param": [1, 2, 3]})
        grid_search.fit(X_df, y_ser).score(X_df, y_ser)
        grid_search.predict(X_df)
        self.assertTrue(hasattr(grid_search, "cv_results_"))
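A minimal standalone sketch of the behavior this test verifies, assuming pandas and tune-sklearn are installed; the estimator and parameter grid here are illustrative choices, not part of the test above:

# Sketch: TuneGridSearchCV accepts a pandas DataFrame/Series directly and
# still exposes cv_results_. LogisticRegression and the C grid are
# illustrative, not taken from the test suite.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tune_sklearn import TuneGridSearchCV

X_df = pd.DataFrame(np.random.rand(30, 4))
y_ser = pd.Series([0, 1] * 15)

search = TuneGridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]})
search.fit(X_df, y_ser)
print(sorted(search.cv_results_.keys()))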
def test_grid_search_sparse_scoring(self):
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    self.assertEqual(C, C2)

    # test a loss where greater is worse; make_scorer with
    # greater_is_better=False flips the sign internally, so grid search
    # should still select the same best C as the plain "f1" scorer
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    self.assertEqual(C, C3)
    assert_array_equal(y_pred, y_pred3)
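Because the sign handling above can be surprising, here is a standalone illustration of the `make_scorer` convention; this is standard sklearn behavior, independent of tune-sklearn:

# With greater_is_better=False, the scorer negates the callable's output,
# so a loss returning -f1 scores identically to the plain F1 score.
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
clf = LinearSVC().fit(X, y)

loss = make_scorer(lambda yt, yp: -f1_score(yt, yp), greater_is_better=False)
assert loss(clf, X, y) == f1_score(y, clf.predict(X))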
def test_diabetes(self):
    # load the diabetes dataset
    dataset = datasets.load_diabetes()
    X = dataset.data
    y = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    # prepare a range of alpha values to test
    alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
    param_grid = dict(alpha=alphas)

    # create and fit a ridge regression model, testing each alpha
    model = linear_model.Ridge()
    tune_search = TuneGridSearchCV(model, param_grid)
    tune_search.fit(X_train, y_train)

    pred = tune_search.predict(X_test)
    print(pred)
    # mean signed residual (positive and negative errors cancel)
    error = sum(np.array(pred) - np.array(y_test)) / len(pred)
    print(error)
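A tiny standalone sketch of the cancellation just noted, contrasted with `mean_absolute_error` (this is a hypothetical variant, not part of the original test):

# The mean signed residual lets opposite-sign errors cancel;
# mean_absolute_error does not.
import numpy as np
from sklearn.metrics import mean_absolute_error

pred = np.array([1.0, -1.0])
truth = np.array([0.0, 0.0])
print(np.mean(pred - truth))             # 0.0 -- errors cancel out
print(mean_absolute_error(truth, pred))  # 1.0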
def test_digits(self):
    # Loading the Digits dataset
    digits = datasets.load_digits()

    # To apply a classifier on this data, we need to flatten the images,
    # turning the data into a (samples, features) matrix:
    n_samples = len(digits.images)
    X = digits.images.reshape((n_samples, -1))
    y = digits.target

    # Split the dataset into two equal parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    # Set the parameters by cross-validation
    tuned_parameters = {
        "kernel": ["rbf"],
        "gamma": [1e-3, 1e-4],
        "C": [1, 10, 100, 1000],
    }

    tune_search = TuneGridSearchCV(SVC(), tuned_parameters, max_iters=20)
    tune_search.fit(X_train, y_train)

    pred = tune_search.predict(X_test)
    print(pred)
    accuracy = np.count_nonzero(
        np.array(pred) == np.array(y_test)) / len(pred)
    print(accuracy)
def test_grid_search_precomputed_kernel(self):
    # Test that grid search works when the input features are given in the
    # form of a precomputed kernel matrix
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel="precomputed")
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(K_train, y_train)

    self.assertTrue(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    self.assertTrue(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    with self.assertRaises(TuneError):
        cv.fit(K_train.tolist(), y_train)
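As background for why a precomputed linear kernel can stand in for raw features, a standalone sanity sketch; this is standard sklearn behavior, independent of tune-sklearn, and the tolerance is chosen loosely since the two fits are separate solver runs:

# A precomputed linear kernel should match kernel="linear" on the raw data,
# up to solver tolerance.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=100, random_state=0)
K_train = np.dot(X[:180], X[:180].T)
K_test = np.dot(X[180:], X[:180].T)

pre = SVC(kernel="precomputed", C=1.0).fit(K_train, y[:180])
lin = SVC(kernel="linear", C=1.0).fit(X[:180], y[:180])
assert np.allclose(pre.decision_function(K_test),
                   lin.decision_function(X[180:]), atol=1e-3)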
def test_grid_search_sparse(self):
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(
        n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = TuneGridSearchCV(clf, {"C": [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    self.assertTrue(np.mean(y_pred == y_pred2) >= 0.9)
    self.assertEqual(C, C2)
"""Early stopping normally requires the estimator to implement
`partial_fit`, but here we use sklearn's `warm_start` parameter to get the
same effect. We fit the estimator for one epoch, then use `warm_start` to
pick up from where we left off, continuing until the trial is early
stopped or `max_iters` is reached.
"""
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import numpy as np

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier()
parameter_grid = {"min_samples_split": [2, 3, 4]}

tune_search = TuneGridSearchCV(
    clf,
    parameter_grid,
    early_stopping=True,
    max_iters=20,
)
tune_search.fit(x_train, y_train)

pred = tune_search.predict(x_test)
accuracy = np.count_nonzero(np.array(pred) == np.array(y_test)) / len(pred)
print(accuracy)
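For estimators that do implement `partial_fit` directly, no `warm_start` workaround is needed. A minimal sketch under that assumption, reusing the iris setup above; `SGDClassifier` and its alpha grid are illustrative choices, not part of the original example:

# Sketch: early stopping via an estimator with partial_fit.
# SGDClassifier implements partial_fit, so TuneGridSearchCV can stop
# trials between epochs without restarting training.
from tune_sklearn import TuneGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

sgd_search = TuneGridSearchCV(
    SGDClassifier(),
    {"alpha": [1e-4, 1e-3, 1e-2]},
    early_stopping=True,
    max_iters=20,
)
sgd_search.fit(x_train, y_train)
print(sgd_search.score(x_test, y_test))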