def test_two_estimators_predict1(self):
    """Fit and predict a pipeline whose parallel layer mixes transformers with a classifier."""
    planned = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    fitted = planned.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_two_estimators_predict_proba(self):
    """predict_proba should work when an inner classifier's output is concatenated as a feature."""
    planned = (
        StandardScaler()
        >> (PCA() & Nystroem() & LogisticRegression())
        >> ConcatFeatures()
        >> NoOp()
        >> LogisticRegression()
    )
    fitted = planned.fit(self.X_train, self.y_train)
    fitted.predict_proba(self.X_test)
def test_pipeline_freeze_trainable(self):
    """Freezing a trainable pipeline should collapse its search grid to a single point."""
    from lale.lib.sklearn import PCA, LogisticRegression

    liquid = PCA() >> LogisticRegression()
    self.assertFalse(liquid.is_frozen_trainable())
    # An unfrozen pipeline exposes more than one hyperparameter grid to search.
    liquid_grid = get_grid_search_parameter_grids(liquid)
    self.assertTrue(len(liquid_grid) > 1, f'grid size {len(liquid_grid)}')
    frozen = liquid.freeze_trainable()
    self.assertTrue(frozen.is_frozen_trainable())
    # Once frozen, exactly one configuration remains.
    self.assertEqual(len(get_grid_search_parameter_grids(frozen)), 1)
def test_clone_with_scikit2(self):
    """Verify sklearn.base.clone of a lale pipeline yields identical CV scores.

    Covers both a simple linear pipeline and one nested inside another
    linear pipeline. The original duplicated the clone/score/compare logic;
    it is factored into a local helper here.
    """
    from sklearn.base import clone
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, make_scorer
    from sklearn.model_selection import cross_val_score

    iris = load_iris()
    X, y = iris.data, iris.target

    def assert_clone_scores_equal(pipeline):
        # A clone must be configured identically, so its CV scores match exactly.
        pipeline2 = clone(pipeline)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = cross_val_score(pipeline, X, y, scoring=make_scorer(accuracy_score), cv=2)
            result2 = cross_val_score(pipeline2, X, y, scoring=make_scorer(accuracy_score), cv=2)
        for r1, r2 in zip(result, result2):
            self.assertEqual(r1, r2)

    lr = LogisticRegression()
    pca = PCA()
    trainable = pca >> lr
    assert_clone_scores_equal(trainable)
    # Testing clone with nested linear pipelines
    assert_clone_scores_equal(PCA() >> trainable)
def test_pipeline_choice_with_hyperopt(self):
    """A pipeline containing a choice can serve as a BaggingClassifier base estimator under Hyperopt."""
    from lale.lib.lale import Hyperopt
    from lale.lib.sklearn import BaggingClassifier

    inner = PCA() >> (LogisticRegression() | KNeighborsClassifier())
    clf = BaggingClassifier(base_estimator=inner)
    _ = clf.auto_configure(self.X_train, self.y_train, Hyperopt, max_evals=1)
def test_with_lale_pipeline(self):
    """VotingClassifier should accept a lale pipeline among its estimators."""
    from lale.lib.sklearn import VotingClassifier

    voters = [
        ("knn", KNeighborsClassifier()),
        ("pca_lr", PCA() >> LogisticRegression()),
    ]
    clf = VotingClassifier(estimators=voters)
    fitted = clf.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_higher_order_1(self):
    """Round-trip a higher-order operator (Both) through to_json/from_json."""
    # NOTE(review): another method with this exact name appears elsewhere in the
    # file; if both live in the same class, one silently shadows the other -- confirm.
    from lale.json_operator import from_json
    from lale.lib.lale import Both
    from lale.lib.sklearn import PCA, Nystroem

    operator = Both(op1=PCA(n_components=2), op2=Nystroem)
    json_expected = {
        "class": Both.class_name(),
        "state": "trainable",
        "operator": "Both",
        "label": "Both",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.both.html",
        "hyperparams": {
            "op1": {"$ref": "../steps/pca"},
            "op2": {"$ref": "../steps/nystroem"},
        },
        "steps": {
            "pca": {
                "class": PCA.class_name(),
                "state": "trainable",
                "operator": "PCA",
                "label": "PCA",
                "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html",
                "hyperparams": {"n_components": 2},
                "is_frozen_trainable": False,
            },
            "nystroem": {
                "class": Nystroem.class_name(),
                "state": "planned",
                "operator": "Nystroem",
                "label": "Nystroem",
                "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.nystroem.html",
            },
        },
        "is_frozen_trainable": False,
    }
    json_actual = operator.to_json()
    self.assertEqual(json_actual, json_expected)
    # Deserializing and re-serializing must reproduce the same JSON.
    roundtripped = from_json(json_actual).to_json()
    self.assertEqual(json_actual, roundtripped)
def test_remove_last4(self):
    """remove_last(inplace=True) mutates the pipeline and returns the same step count."""
    pipeline = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    shortened = pipeline.remove_last(inplace=True)
    # Dropping the final classifier leaves six steps, visible through both references.
    self.assertEqual(len(shortened._steps), 6)
    self.assertEqual(len(pipeline._steps), 6)
def test_make_choice_with_instance(self):
    """A choice (|) of transformers is planned, not trainable: fit must raise.

    Also builds equivalent planned pipelines via operator syntax and
    make_choice. The unused make_union/make_pipeline imports were removed.
    """
    from lale.operators import make_choice
    from sklearn.datasets import load_iris

    iris = load_iris()
    X, y = iris.data, iris.target
    tfm = PCA() | Nystroem() | NoOp()
    with self.assertRaises(AttributeError):
        # A planned choice has no fit method; attempting it must fail.
        _ = tfm.fit(X, y)
    # Three equivalent ways of spelling the same planned pipeline.
    planned_pipeline1 = (OneHotEncoder | NoOp) >> tfm >> (LogisticRegression | KNeighborsClassifier)
    planned_pipeline2 = (OneHotEncoder | NoOp) >> (PCA | Nystroem) >> (LogisticRegression | KNeighborsClassifier)
    planned_pipeline3 = make_choice(OneHotEncoder, NoOp) >> make_choice(PCA, Nystroem) >> make_choice(LogisticRegression, KNeighborsClassifier)
def test_with_voting_classifier2(self):
    """sklearn's VotingClassifier should fit with a lale pipeline as one of its voters."""
    from sklearn.datasets import load_iris
    from sklearn.ensemble import VotingClassifier

    lr = LogisticRegression()
    trainable = PCA() >> lr
    vclf = VotingClassifier(estimators=[('lr', lr), ('pipe', trainable)])
    iris = load_iris()
    vclf.fit(iris.data, iris.target)
def test_export_to_sklearn_pipeline(self):
    """Exported sklearn pipeline must mirror the trained lale pipeline step-for-step."""
    # NOTE(review): a method with this same name appears later in the file; if both
    # are in the same class, the later definition shadows this one -- confirm.
    trained = (PCA(n_components=3) >> KNeighborsClassifier()).fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    for idx, step_name in enumerate(exported.named_steps):
        # Hyperparameters of each exported step must equal the lale step's.
        exported_params = exported.named_steps[step_name].get_params()
        lale_params = self.get_sklearn_params(trained.steps()[idx])
        self.assertEqual(exported_params, lale_params)
    self.assert_equal_predictions(exported, trained)
def test_fit_args(self):
    """TopKVotingClassifier over a planned choice-of-pipelines should fit and predict."""
    # NOTE(review): a method named test_fit_args also appears later in the file;
    # if both are in the same class the later one shadows this -- confirm.
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    planned = (PCA() | Nystroem()) >> (LogisticRegression() | KNeighborsClassifier())
    ensemble = TopKVotingClassifier(estimator=planned, k=2)
    fitted = ensemble.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_no_max_schema(self):
    """A Float schema without a max bound must raise SearchSpaceError during search."""
    from lale.search.search_space import SearchSpaceError

    # Customize PCA with an unbounded-above float hyperparameter.
    pca = PCA().customize_schema(n_components=schemas.Float(min=0.0))
    planned = (
        (pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> (MinMaxScaler | Normalizer)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    with self.assertRaises(SearchSpaceError):
        run_hyperopt_on_planned_pipeline(planned)
def test_fit_args(self):
    """TopKVotingClassifier should train on a choice-of-pipelines estimator.

    The unused load_iris and accuracy_score imports were removed.
    """
    # NOTE(review): duplicate of an earlier test_fit_args elsewhere in the file;
    # if both live in one class, Python keeps only the later definition -- confirm.
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    ensemble = TopKVotingClassifier(
        estimator=(PCA() | Nystroem()) >> (LogisticRegression() | KNeighborsClassifier()),
        k=2)
    trained = ensemble.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_multiple_estimators_predict_predict_proba(self):
    """Both predict and predict_proba should work on a DAG mixing estimators and transformers."""
    dag = (
        StandardScaler()
        >> (LogisticRegression() & PCA())
        >> ConcatFeatures()
        >> (NoOp() & LinearSVC())
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    dag.fit(self.X_train, self.y_train)
    _ = dag.predict_proba(self.X_test)
    _ = dag.predict(self.X_test)
def test_with_gridsearchcv_auto_wrapped_pipe2(self):
    """GridSearchCV should handle a choice between two renamed copies of the same operator."""
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, make_scorer

    first_pca = PCA()
    second_pca = PCA()
    # Distinct display names keep the two PCA branches distinguishable in the grid.
    first_pca._name = "PCA1"
    second_pca._name = "PCA2"
    trainable = (first_pca | second_pca) >> LogisticRegression()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from lale.lib.lale import GridSearchCV
        clf = GridSearchCV(
            estimator=trainable,
            lale_num_samples=1,
            lale_num_grids=1,
            cv=2,
            scoring=make_scorer(accuracy_score),
        )
        iris = load_iris()
        clf.fit(iris.data, iris.target)
def test_pipeline_create_trainable(self):
    """lale's sklearn Pipeline wrapper yields a TrainableIndividualOp whose fit trains every step."""
    pipeline = lale.lib.sklearn.Pipeline(
        steps=[("pca1", PCA()), ("lr1", LogisticRegression())])
    self.assertIsInstance(pipeline, lale.operators.TrainableIndividualOp)
    trained = pipeline.fit(self.X_train, self.y_train)
    # After fit, the wrapper's hyperparams expose trained versions of both steps.
    pca_trained, lr_trained = (op for _, op in trained.hyperparams()["steps"])
    self.assertIsInstance(pca_trained, lale.operators.TrainedIndividualOp)
    self.assertIsInstance(lr_trained, lale.operators.TrainedIndividualOp)
    predictions = trained.predict(self.X_test)
    accuracy_score(self.y_test, predictions)
def test_empty_schema(self):
    """An empty hyperparameter schema must raise OperatorSchemaError when building the search space."""
    from lale.search.schema2search_space import OperatorSchemaError

    # Customize PCA so that `whiten` carries an empty (uninformative) schema.
    pca = PCA().customize_schema(whiten=schemas.Schema())
    planned = (
        (pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> (MinMaxScaler | Normalizer)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    with self.assertRaises(OperatorSchemaError):
        run_hyperopt_on_planned_pipeline(planned)
def test_higher_order_1(self):
    """Serialization round trip for Both, with the expected JSON spelled via literal class paths."""
    # NOTE(review): shares its name with another test_higher_order_1 in this file;
    # if they are in the same class the earlier one is shadowed -- confirm.
    from lale.json_operator import from_json
    from lale.lib.lale import Both
    from lale.lib.sklearn import PCA, Nystroem

    operator = Both(op1=PCA(n_components=2), op2=Nystroem)
    expected = {
        'class': 'lale.lib.lale.both.BothImpl',
        'state': 'trainable',
        'operator': 'Both',
        'label': 'Both',
        'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.both.html',
        'hyperparams': {
            'op1': {'$ref': '../steps/pca'},
            'op2': {'$ref': '../steps/nystroem'},
        },
        'steps': {
            'pca': {
                'class': 'lale.lib.sklearn.pca.PCAImpl',
                'state': 'trainable',
                'operator': 'PCA',
                'label': 'PCA',
                'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html',
                'hyperparams': {'n_components': 2},
                'is_frozen_trainable': False,
            },
            'nystroem': {
                'class': 'lale.lib.sklearn.nystroem.NystroemImpl',
                'state': 'planned',
                'operator': 'Nystroem',
                'label': 'Nystroem',
                'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.nystroem.html',
            },
        },
        'is_frozen_trainable': False,
    }
    actual = operator.to_json()
    self.assertEqual(actual, expected)
    # from_json followed by to_json must reproduce the same document.
    rebuilt = from_json(actual)
    self.assertEqual(actual, rebuilt.to_json())
def test_pipeline_clone(self):
    """Cloning a lale Pipeline with sklearn's clone must preserve test accuracy exactly."""
    from sklearn.base import clone
    from lale.operators import Pipeline

    pipeline = Pipeline([('pca1', PCA()), ('lr1', LogisticRegression())])
    trained = pipeline.fit(self.X_train, self.y_train)
    orig_acc = accuracy_score(self.y_test, trained.predict(self.X_test))
    cloned_pipeline = clone(pipeline)
    trained_clone = cloned_pipeline.fit(self.X_train, self.y_train)
    cloned_acc = accuracy_score(self.y_test, trained_clone.predict(self.X_test))
    self.assertEqual(orig_acc, cloned_acc)
def test_pipeline_create_trained(self):
    """Wrapping already-trained steps in a Pipeline yields a TrainedIndividualOp directly."""
    import lale.lib.sklearn
    import lale.operators

    trained_pipeline = (PCA() >> LogisticRegression()).fit(self.X_train, self.y_train)
    self.assertIsInstance(trained_pipeline, lale.operators.TrainedPipeline)
    pca_trained, lr_trained = trained_pipeline.steps()
    # Building a Pipeline from trained steps should require no further fit.
    wrapped = lale.lib.sklearn.Pipeline(
        steps=[("pca1", pca_trained), ("lr1", lr_trained)])
    self.assertIsInstance(wrapped, lale.operators.TrainedIndividualOp)
    predictions = wrapped.predict(self.X_test)
    accuracy_score(self.y_test, predictions)
def test_with_pandas(self):
    """The pandas-flavored iris dataset should flow through a feature-union pipeline."""
    import warnings
    from lale.datasets import load_iris_df

    warnings.filterwarnings("ignore")
    trainable = (
        (PCA(n_components=3) & Nystroem(n_components=10))
        >> ConcatFeatures()
        >> LogisticRegression(random_state=42, C=0.1)
    )
    (X_train, y_train), (X_test, y_test) = load_iris_df()
    trained = trainable.fit(X_train, y_train)
    _ = trained.predict(X_test)
def test_concat_with_hyperopt(self):
    """Hyperopt should tune a feature-union pipeline end to end on iris."""
    from lale.lib.lale import Hyperopt
    from sklearn.datasets import load_iris

    trainable = (
        (PCA(n_components=3) & Nystroem(n_components=10))
        >> ConcatFeatures()
        >> LogisticRegression(random_state=42, C=0.1)
    )
    clf = Hyperopt(estimator=trainable, max_evals=2)
    iris_data = load_iris()
    clf.fit(iris_data.data, iris_data.target)
    clf.predict(iris_data.data)
def test_string_labels(self):
    """imblearn resampling should accept string-valued class labels."""
    from lale.lib.imblearn import CondensedNearestNeighbour
    from lale.operators import make_pipeline

    print(type(CondensedNearestNeighbour))
    # Map the numeric labels onto two string classes.
    string_labels = ["low" if label == 0 else "high" for label in self.y_train]
    pipeline = CondensedNearestNeighbour(
        operator=make_pipeline(PCA(), LogisticRegression()),
        sampling_strategy=["high"],
    )
    trained = pipeline.fit(self.X_train, string_labels)
    _ = trained.predict(self.X_test)
def test_fit_smaller_trials(self):
    """When max_evals is smaller than k, the final ensemble keeps at most max_evals voters."""
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    ensemble = TopKVotingClassifier(
        estimator=(PCA() | Nystroem()) >> (LogisticRegression() | KNeighborsClassifier()),
        args_to_optimizer={"max_evals": 3},
        k=20,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    best = trained._impl._best_estimator
    # Only 3 trials ran, so no more than 3 estimators can be in the ensemble.
    self.assertLessEqual(len(best._impl._wrapped_model.estimators), 3)
def test_feature_preprocessor(self):
    """Generic battery of checks for the preprocessor class named by fproc_name."""
    import importlib

    X_train, y_train = self.X_train, self.y_train
    # fproc_name is a dotted path: everything before the last dot is the module.
    module_name, _, class_name = fproc_name.rpartition(".")
    module = importlib.import_module(module_name)
    fproc = getattr(module, class_name)()
    from lale.lib.sklearn.one_hot_encoder import OneHotEncoder
    if isinstance(fproc, OneHotEncoder):  # type: ignore
        # fproc = OneHotEncoder(handle_unknown = 'ignore')
        # remove the hack when this is fixed
        fproc = PCA()
    # test_schemas_are_schemas
    for schema in (
        fproc.input_schema_fit(),
        fproc.input_schema_transform(),
        fproc.output_schema_transform(),
        fproc.hyperparam_schema(),
    ):
        lale.type_checking.validate_is_schema(schema)
    # test_init_fit_transform
    trained = fproc.fit(self.X_train, self.y_train)
    _ = trained.transform(self.X_test)
    # test_predict_on_trainable
    trained = fproc.fit(X_train, y_train)
    fproc.transform(X_train)
    # test_to_json
    fproc.to_json()
    # test_in_a_pipeline
    # This test assumes that the output of feature processing is compatible with LogisticRegression
    from lale.lib.sklearn import LogisticRegression
    pipeline = fproc >> LogisticRegression()
    trained = pipeline.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)
    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt
    hyperopt = Hyperopt(estimator=pipeline, max_evals=1, verbose=True, cv=3)
    trained = hyperopt.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
    """Importing a nested FeatureUnion pipeline should produce the expected lale DAG edges."""
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import FeatureUnion, make_pipeline

    union = FeatureUnion([
        ("selectkbest_pca", make_pipeline(
            SelectKBest(k=3),
            FeatureUnion([
                ('pca', PCA(n_components=1)),
                ('nested_pipeline', make_pipeline(SelectKBest(k=2), Nystroem()))]))),
        ("nys", Nystroem(n_components=2, random_state=42)),
    ])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    print(lale_pipeline.to_json())
    self.assertEqual(len(lale_pipeline.edges()), 8)
    # These assertions assume topological sort, which may not be unique. So the assertions are brittle.
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.sklearn.pca import PCAImpl

    # Expected (source impl, destination impl) class for each edge, in order.
    expected_edges = [
        (SelectKBest, PCAImpl),
        (SelectKBest, SelectKBest),
        (SelectKBest, NystroemImpl),
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for edge, (src_cls, dst_cls) in zip(lale_pipeline.edges(), expected_edges):
        self.assertIsInstance(edge[0]._impl, src_cls)
        self.assertIsInstance(edge[1]._impl, dst_cls)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_compose4(self):
    """Composing choices with >> yields a planned three-stage model.

    The unused make_choice import and the unused digits dataset load were
    removed; neither contributed to the test.
    """
    ohe = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
    ohe.get_params()
    no_op = NoOp()
    pca = PCA()
    nys = Nystroem()
    lr = LogisticRegression()
    knn = KNeighborsClassifier()
    # Each stage is a choice; composing them yields a planned pipeline.
    step1 = ohe | no_op
    step2 = pca | nys
    step3 = lr | knn
    model_plan = step1 >> step2 >> step3
def test_import_from_sklearn_pipeline1(self):
    """Parameters of each imported step must equal the original sklearn step's."""
    from sklearn.decomposition import PCA
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline

    sklearn_pipeline = make_pipeline(PCA(n_components=3), KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    for idx, step_name in enumerate(sklearn_pipeline.named_steps):
        expected_params = sklearn_pipeline.named_steps[step_name].get_params()
        imported_params = lale_pipeline.steps()[idx]._impl._wrapped_model.get_params()
        self.assertEqual(expected_params, imported_params)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_export_to_sklearn_pipeline(self):
    """Exported sklearn pipeline steps must carry the same params as the trained lale steps."""
    # NOTE(review): a method with this same name appears earlier in the file; if
    # both are in the same class, this definition shadows the earlier one -- confirm.
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline

    trained = (PCA(n_components=3) >> KNeighborsClassifier()).fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    for idx, step_name in enumerate(exported.named_steps):
        exported_params = exported.named_steps[step_name].get_params()
        lale_params = trained.steps()[idx]._impl._wrapped_model.get_params()
        self.assertEqual(exported_params, lale_params)
    self.assert_equal_predictions(exported, trained)