def test_predict_proba(self):
    """predict_proba works on both the trainable and the trained operator."""
    planned = KNeighborsClassifier()
    iris = sklearn.datasets.load_iris()
    fitted = planned.fit(iris.data, iris.target)
    # Calling predict_proba on the still-trainable operator is deprecated but
    # must keep working; the warning assertion is intentionally disabled.
    # with self.assertWarns(DeprecationWarning):
    probs = planned.predict_proba(iris.data)
    probs = fitted.predict_proba(iris.data)
def test_with_multioutput_targets(self):
    """KNN fits and predicts with a multi-output (n_samples x 3) target matrix."""
    import numpy as np
    from sklearn.datasets import make_classification  # load_iris was imported but unused
    from sklearn.utils import shuffle

    X, y1 = make_classification(
        n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1
    )
    # Build three correlated label columns by shuffling the first one.
    y2 = shuffle(y1, random_state=1)
    y3 = shuffle(y1, random_state=2)
    Y = np.vstack((y1, y2, y3)).T
    trainable = KNeighborsClassifier()
    trained = trainable.fit(X, Y)
    predictions = trained.predict(X)
def test_trained_individual_op_freeze_trainable(self):
    """freeze_trainable pins hyperparameters but does not freeze the trained model."""
    from lale.lib.sklearn import KNeighborsClassifier
    from lale.operators import TrainedIndividualOp

    X = [[0.0], [1.0], [2.0]]
    y_old = [0.0, 0.0, 1.0]
    trained_op = KNeighborsClassifier(n_neighbors=1).fit(X, y_old)
    self.assertIsInstance(trained_op, TrainedIndividualOp)
    self.assertFalse(trained_op.is_frozen_trainable())
    self.assertIn('algorithm', trained_op.free_hyperparams())
    pinned = trained_op.freeze_trainable()
    self.assertIsInstance(pinned, TrainedIndividualOp)
    self.assertTrue(pinned.is_frozen_trainable())
    self.assertFalse(pinned.is_frozen_trained())
    self.assertEqual(len(pinned.free_hyperparams()), 0)
def test_import_from_sklearn_pipeline_feature_union(self):
    """A FeatureUnion imports as PCA & Nystroem fanning into ConcatFeatures, then KNN."""
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier

    union = FeatureUnion([
        ("pca", PCA(n_components=1)),
        ("nys", Nystroem(n_components=2, random_state=42)),
    ])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 3)

    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl

    # Expected (source, destination) impl classes per edge, in edge order.
    expected_edges = [
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for idx, (src_impl, dst_impl) in enumerate(expected_edges):
        self.assertEqual(lale_pipeline.edges()[idx][0]._impl_class(), src_impl)
        self.assertEqual(lale_pipeline.edges()[idx][1]._impl_class(), dst_impl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_with_concat_features2(self):
    """Hyperopt can search a choice between a concat-features DAG and plain KNN."""
    import warnings
    warnings.filterwarnings("ignore")
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    from lale.lib.lale import Hyperopt
    from lale.operators import make_pipeline

    data = load_iris()
    X, y = data.data, data.target
    pca = PCA(n_components=3)
    nys = Nystroem(n_components=10)
    concat = ConcatFeatures()
    lr = LogisticRegression(random_state=42, C=0.1)
    # Either impute-or-noop >> PCA alongside Nystroem, concatenated into LR,
    # or a bare KNN classifier.
    pipeline = make_pipeline(
        ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
        | KNeighborsClassifier()
    )
    optimizer = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
    trained = optimizer.fit(X, y)
    predictions = trained.predict(X)
    print(accuracy_score(y, predictions))
    warnings.resetwarnings()
def test_invalid_args(self):
    """NSGA2 rejects missing scorers, a single scorer, and pipeline estimators."""
    import jsonschema

    clf = LGBMClassifier()
    nsga2_args = {
        'estimator': clf,
        'cv': 3,
        'max_evals': 50,
        'population_size': 10,
    }
    # No scorer specified at all -> schema validation failure.
    with self.assertRaises(jsonschema.exceptions.ValidationError):
        _ = NSGA2(**nsga2_args)
    # Fewer than two scorers -> multi-objective precondition fails.
    with self.assertRaises(AssertionError):
        _ = NSGA2(scoring=['accuracy'], **nsga2_args)
    # A Lale pipeline as estimator should raise AssertionError because
    # multi-objective optimization over pipelines is not supported.
    pipeline = MinMaxScaler() >> KNeighborsClassifier()
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    with self.assertRaises(AssertionError):
        _ = NSGA2(estimator=pipeline, scoring=['accuracy', fpr_scorer])
def test_with_lale_classifiers(self):
    """VotingClassifier accepts Lale operators as its estimators."""
    voter = VotingClassifier(
        estimators=[
            ("knn", KNeighborsClassifier()),
            ("lr", LogisticRegression()),
        ]
    )
    fitted = voter.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_get_named_pipeline(self):
    """OptimizeLast + NSGA2 returns Pareto pipelines retrievable by name."""
    pipeline = MinMaxScaler() >> KNeighborsClassifier()
    trained_pipeline = pipeline.fit(self.X_train, self.y_train)
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    nsga2_args = {
        'scoring': ['accuracy', fpr_scorer],
        'best_score': [1, 0],
        'cv': 3,
        'max_evals': 20,
        'population_size': 10,
    }
    opt_last = OptimizeLast(
        estimator=trained_pipeline,
        last_optimizer=NSGA2,
        optimizer_args=nsga2_args,
    )
    res_last = opt_last.fit(self.X_train, self.y_train)
    df_summary = res_last.summary()
    # The first Pareto point 'p0' always exists; 'p1' only when the front
    # has more than one solution.
    pareto_pipeline = res_last.get_pipeline(pipeline_name='p0')
    self.assertEqual(type(trained_pipeline), type(pareto_pipeline))
    if df_summary.shape[0] > 1:
        pareto_pipeline = res_last.get_pipeline(pipeline_name='p1')
        self.assertEqual(type(trained_pipeline), type(pareto_pipeline))
def test_export_to_sklearn_pipeline2(self):
    """A nested concat DAG exports to an sklearn Pipeline containing a FeatureUnion."""
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = (
        (
            (
                (PCA(svd_solver="randomized", random_state=42) & SelectKBest(k=3))
                >> ConcatFeatures()
            )
            & Nystroem(random_state=42)
        )
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    trained = lale_pipeline.fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    self.assertIsInstance(exported.named_steps["featureunion"], FeatureUnion)
    from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
    self.assertIsInstance(exported.named_steps["kneighborsclassifier"], SklearnKNN)
    self.assert_equal_predictions(exported, trained)
def dont_test_car_hyperopt(self):
    # Disabled test (note the dont_ prefix): it depends on the Weka bridge
    # (lale.lib.weka) and the R bridge (lalegpl.lib.r), which are not
    # available in a standard environment.
    from lale.datasets.auto_weka import fetch_car
    from sklearn.metrics import accuracy_score, make_scorer
    from sklearn.preprocessing import LabelEncoder
    import pandas as pd
    from lale.lib.weka import J48
    from lalegpl.lib.r import ArulesCBAClassifier
    from lale.operators import make_pipeline
    from lale.lib.lale import HyperoptClassifier
    from lale.lib.sklearn import LogisticRegression, KNeighborsClassifier
    (X_train, y_train), (X_test, y_test) = fetch_car()
    y_name = y_train.name
    # The car dataset's labels are strings; encode them as integers,
    # then wrap back into named Series so downstream code sees the label name.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    y_train = pd.Series(y_train, name=y_name)
    y_test = pd.Series(y_test, name=y_name)
    # Planned pipeline: a three-way choice of classifiers for Hyperopt to pick from.
    planned_pipeline = make_pipeline(ArulesCBAClassifier() | LogisticRegression() | KNeighborsClassifier())
    clf = HyperoptClassifier(model=planned_pipeline, max_evals=1)
    best_pipeline = clf.fit(X_train, y_train)
    print(accuracy_score(y_test, best_pipeline.predict(X_test)))
def test_import_from_sklearn_pipeline_nested_pipeline(self):
    """A FeatureUnion containing a nested pipeline imports into a 4-edge Lale DAG."""
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier

    union = FeatureUnion([
        ("selectkbest_pca", make_pipeline(SelectKBest(k=3), PCA(n_components=1))),
        ("nys", Nystroem(n_components=2, random_state=42)),
    ])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 4)

    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl

    # These checks assume a topological order of the edges.
    expected_edges = [
        (SelectKBest, PCAImpl),
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for idx, (src_type, dst_type) in enumerate(expected_edges):
        self.assertIsInstance(lale_pipeline.edges()[idx][0]._impl, src_type)
        self.assertIsInstance(lale_pipeline.edges()[idx][1]._impl, dst_type)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_individual_op_freeze_trained(self):
    """freeze_trained pins the learned model so a later fit cannot change it."""
    from lale.lib.sklearn import KNeighborsClassifier

    op = KNeighborsClassifier(n_neighbors=1)
    X = [[0.0], [1.0], [2.0]]
    y_old = [0.0, 0.0, 1.0]
    y_new = [1.0, 0.0, 0.0]
    # An unfrozen trained op can be refit; predictions track the latest labels.
    fitted_first = op.fit(X, y_old)
    self.assertEqual(list(fitted_first.predict(X)), list(y_old))
    fitted_again = fitted_first.fit(X, y_new)
    self.assertEqual(list(fitted_again.predict(X)), list(y_new))
    # A frozen-trained op keeps its original model even after fit on new labels.
    frozen = op.fit(X, y_old).freeze_trained()
    self.assertFalse(fitted_first.is_frozen_trained())
    self.assertTrue(frozen.is_frozen_trained())
    self.assertEqual(list(frozen.predict(X)), list(y_old))
    refit_frozen = frozen.fit(X, y_new)
    self.assertEqual(list(refit_frozen.predict(X)), list(y_old))
def test_pipeline_choice_with_hyperopt(self):
    """BaggingClassifier accepts a planned pipeline (with a choice) as base estimator."""
    from lale.lib.lale import Hyperopt
    from lale.lib.sklearn import BaggingClassifier

    base = PCA() >> (LogisticRegression() | KNeighborsClassifier())
    bagging = BaggingClassifier(base_estimator=base)
    _ = bagging.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1)
def test_with_hyperopt(self):
    """auto_configure tunes a VotingClassifier over Lale sub-estimators."""
    from lale.lib.lale import Hyperopt
    from lale.lib.sklearn import VotingClassifier

    voter = VotingClassifier(
        estimators=[
            ("knn", KNeighborsClassifier()),
            ("lr", LogisticRegression()),
        ]
    )
    _ = voter.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1)
def test_with_lale_pipeline(self):
    """VotingClassifier accepts a Lale pipeline as one of its estimators."""
    from lale.lib.sklearn import VotingClassifier

    voter = VotingClassifier(
        estimators=[
            ('knn', KNeighborsClassifier()),
            ('pca_lr', PCA() >> LogisticRegression()),
        ]
    )
    fitted = voter.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_export_to_sklearn_pipeline_with_noop_4(self):
    """A NoOp prefix must survive a round trip through export_to_sklearn_pipeline.

    Removed the unused ``from sklearn.pipeline import make_pipeline`` import.
    """
    from lale.lib.sklearn import KNeighborsClassifier
    from lale.lib.lale import NoOp

    lale_pipeline = NoOp() >> KNeighborsClassifier()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    # The exported pipeline must predict identically to the Lale one.
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_trained_individual_op_freeze_trainable(self):
    """With schema validation on, freeze_trainable pins hyperparameters only."""
    from lale.lib.sklearn import KNeighborsClassifier
    from lale.operators import TrainedIndividualOp

    with EnableSchemaValidation():
        X = np.array([[0.0], [1.0], [2.0]])
        y_old = np.array([0.0, 0.0, 1.0])
        trained_op = KNeighborsClassifier(n_neighbors=1).fit(X, y_old)
        self.assertIsInstance(trained_op, TrainedIndividualOp)
        self.assertFalse(trained_op.is_frozen_trainable())
        self.assertIn("algorithm", trained_op.free_hyperparams())
        pinned = trained_op.freeze_trainable()
        self.assertIsInstance(pinned, TrainedIndividualOp)
        self.assertTrue(pinned.is_frozen_trainable())
        self.assertFalse(pinned.is_frozen_trained())
        self.assertEqual(len(pinned.free_hyperparams()), 0)
def test_multiple_estimators_predict_predict_proba(self):
    """A DAG with inner estimators still exposes predict and predict_proba."""
    # Two concat stages: LR & PCA outputs merged, then NoOp & LinearSVC merged,
    # finally classified by KNN.
    pipeline = (
        StandardScaler()
        >> (LogisticRegression() & PCA())
        >> ConcatFeatures()
        >> (NoOp() & LinearSVC())
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    _ = pipeline.predict_proba(self.X_test)
    _ = pipeline.predict(self.X_test)
def test_export_to_sklearn_pipeline_with_noop_3(self):
    """Exporting a pipeline with a trailing NoOp must not raise.

    Removed the unused ``from sklearn.pipeline import make_pipeline`` import.
    """
    from lale.lib.sklearn import PCA, KNeighborsClassifier
    from lale.lib.lale import NoOp

    # This test is probably unnecessary, but doesn't harm at this point
    lale_pipeline = PCA(n_components=3) >> KNeighborsClassifier() >> NoOp()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
def test_individual_op_freeze_trained(self):
    """With schema validation on, freeze_trained pins the learned model."""
    from lale.lib.sklearn import KNeighborsClassifier

    with EnableSchemaValidation():
        op = KNeighborsClassifier(n_neighbors=1)
        X = np.array([[0.0], [1.0], [2.0]])
        y_old = np.array([0.0, 0.0, 1.0])
        y_new = np.array([1.0, 0.0, 0.0])
        # An unfrozen trained op can be refit; predictions track the latest labels.
        fitted_first = op.fit(X, y_old)
        self.assertEqual(list(fitted_first.predict(X)), list(y_old))
        fitted_again = fitted_first.fit(X, y_new)
        self.assertEqual(list(fitted_again.predict(X)), list(y_new))
        # A frozen-trained op keeps its original model even after fit on new labels.
        frozen = op.fit(X, y_old).freeze_trained()
        self.assertFalse(fitted_first.is_frozen_trained())
        self.assertTrue(frozen.is_frozen_trained())
        self.assertEqual(list(frozen.predict(X)), list(y_old))
        refit_frozen = frozen.fit(X, y_new)
        self.assertEqual(list(refit_frozen.predict(X)), list(y_old))
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
    """Doubly-nested FeatureUnions import into an 8-edge Lale DAG."""
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier

    union = FeatureUnion([
        ("selectkbest_pca", make_pipeline(
            SelectKBest(k=3),
            FeatureUnion([
                ('pca', PCA(n_components=1)),
                ('nested_pipeline', make_pipeline(SelectKBest(k=2), Nystroem())),
            ]))),
        ("nys", Nystroem(n_components=2, random_state=42)),
    ])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 8)

    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.select_k_best import SelectKBestImpl

    # These assertions assume topological sort, which may not be unique,
    # so they are brittle.
    expected_edges = [
        (SelectKBestImpl, PCAImpl),
        (SelectKBestImpl, SelectKBestImpl),
        (SelectKBestImpl, NystroemImpl),
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for idx, (src_impl, dst_impl) in enumerate(expected_edges):
        self.assertEqual(lale_pipeline.edges()[idx][0]._impl_class(), src_impl)
        self.assertEqual(lale_pipeline.edges()[idx][1]._impl_class(), dst_impl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_with_observed_gridsearch(self):
    """GridSearchCV with a LoggingObserver tunes a hard-voting ensemble.

    Removed the unused ``from lale.lib.lale import Observing`` import.
    """
    from lale.lib.sklearn import VotingClassifier
    from lale.lib.lale import GridSearchCV
    from lale.lib.lale.observing import LoggingObserver
    from sklearn.metrics import accuracy_score, make_scorer

    clf = VotingClassifier(
        estimators=[('knn', KNeighborsClassifier()), ('rc', RidgeClassifier())],
        voting='hard',
    )
    # Keep the grid tiny (1 sample, 1 grid, 2 folds) so the test stays fast.
    trained = clf.auto_configure(
        self.X_train, self.y_train, GridSearchCV,
        lale_num_samples=1, lale_num_grids=1, cv=2,
        scoring=make_scorer(accuracy_score),
        observer=LoggingObserver,
    )
def test_with_voting_classifier1(self):
    """Lale operators plug directly into sklearn's own VotingClassifier."""
    from sklearn.ensemble import VotingClassifier

    lr = LogisticRegression()
    knn = KNeighborsClassifier()
    vclf = VotingClassifier(estimators=[("lr", lr), ("knn", knn)])
    iris = load_iris()
    X, y = iris.data, iris.target
    vclf.fit(X, y)
def test_smac2(self):
    """SMAC records the crash cost for a KNN config that cannot be trained.

    Removed the unused ``accuracy_score`` import.
    """
    from lale.lib.lale import SMAC

    # n_neighbors far exceeds the number of training samples, so the single
    # trial must fail to train.
    planned_pipeline = (PCA | NoOp) >> KNeighborsClassifier(n_neighbors=10000)
    opt = SMAC(estimator=planned_pipeline, max_evals=1)  # run optimizer
    res = opt.fit(self.X_train, self.y_train)
    # SMAC assigns cost_for_crash (MAXINT by default) to at least one trial
    # (the one corresponding to the untrainable KNN).
    trials = res._impl.get_trials()
    assert 2147483647.0 in trials.cost_per_config.values()
def test_fit_args(self):
    """TopKVotingClassifier fits over a planned search space and predicts."""
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    search_space = (PCA() | Nystroem()) >> (LogisticRegression() | KNeighborsClassifier())
    ensemble = TopKVotingClassifier(estimator=search_space, k=2)
    trained = ensemble.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_export_to_sklearn_pipeline(self):
    """Exported sklearn steps carry the same hyperparameters as the Lale steps."""
    lale_pipeline = PCA(n_components=3) >> KNeighborsClassifier()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    # Walk the exported steps in lockstep with the Lale steps and compare params.
    for step_name, lale_step in zip(
        sklearn_pipeline.named_steps, trained_lale_pipeline.steps()
    ):
        sklearn_step_params = sklearn_pipeline.named_steps[step_name].get_params()
        lale_sklearn_params = self.get_sklearn_params(lale_step)
        self.assertEqual(sklearn_step_params, lale_sklearn_params)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_nested_pipeline1(self):
    """Hyperopt searches a choice between KNN and an imputer >> LR sub-pipeline."""
    from sklearn.datasets import load_iris
    from lale.lib.lale import Hyperopt
    from sklearn.metrics import accuracy_score

    data = load_iris()
    X, y = data.data, data.target
    #pipeline = KNeighborsClassifier() | (OneHotEncoder(handle_unknown = 'ignore') >> LogisticRegression())
    pipeline = KNeighborsClassifier() | (SimpleImputer() >> LogisticRegression())
    optimizer = Hyperopt(estimator=pipeline, max_evals=1)
    best = optimizer.fit(X, y)
    preds = best.predict(X)
    print(accuracy_score(y, preds))
def test_fit_args(self):
    """TopKVotingClassifier accepts fit args and predicts on held-out data.

    Removed the unused ``load_iris`` and ``accuracy_score`` imports.
    """
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    ensemble = TopKVotingClassifier(
        estimator=(PCA() | Nystroem())
        >> (LogisticRegression() | KNeighborsClassifier()),
        k=2,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_schema_validation(self):
    """n_neighbors above the training-set size fails schema validation at fit."""
    too_many = KNeighborsClassifier(n_neighbors=16)
    with self.assertRaises(jsonschema.ValidationError):
        _ = too_many.fit(self.train_X, self.train_y)
    # One fewer neighbor is within bounds and trains/predicts normally.
    just_enough = KNeighborsClassifier(n_neighbors=15)
    trained = just_enough.fit(self.train_X, self.train_y)
    _ = trained.predict(self.test_X)
def test_fit_smaller_trials(self):
    """With k larger than max_evals, the ensemble keeps at most max_evals members."""
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    search_space = (PCA() | Nystroem()) >> (LogisticRegression() | KNeighborsClassifier())
    ensemble = TopKVotingClassifier(
        estimator=search_space,
        args_to_optimizer={"max_evals": 3},
        k=20,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    final_ensemble = trained._impl._best_estimator
    self.assertLessEqual(len(final_ensemble._impl._wrapped_model.estimators), 3)