def test_runtime_limit_hoc(self):
    """Check that a classification Hyperopt run roughly honors ``max_opt_time``.

    Fits a planned scaler-then-classifier pipeline on iris with a 2 second
    optimization budget and asserts that the measured fit duration exceeds
    the budget by less than 70 %.
    """
    import time

    from sklearn.datasets import load_iris

    planned_pipeline = (MinMaxScaler | Normalizer) >> (
        LogisticRegression | KNeighborsClassifier
    )
    X, y = load_iris(return_X_y=True)

    max_opt_time = 2.0
    hoc = Hyperopt(
        estimator=planned_pipeline,
        max_evals=1,
        cv=3,
        scoring="accuracy",
        max_opt_time=max_opt_time,
    )

    # perf_counter() is monotonic, so a system clock adjustment during the
    # fit cannot distort (or even negate) the measured duration.
    start = time.perf_counter()
    _ = hoc.fit(X, y)
    opt_time = time.perf_counter() - start

    rel_diff = (opt_time - max_opt_time) / max_opt_time
    assert (
        rel_diff < 0.7
    ), "Max time: {}, Actual time: {}, relative diff: {}".format(
        max_opt_time, opt_time, rel_diff
    )
def test_using_pipeline(self):
    """Tune a planned pipeline with Hyperopt, then re-tune only its last step.

    The second stage wraps the best pipeline from the first stage in
    ``OptimizeLast`` with Hyperopt as the inner optimizer. The test checks
    that the trained and the trainable optimizer predict identically and
    that both stages return pipelines of the same type.
    """
    from sklearn.metrics import f1_score, make_scorer

    from lale.lib.lale import Hyperopt, OptimizeLast

    # Stage 1: search over the whole planned pipeline.
    planned = (PCA | NoOp) >> LogisticRegression
    first_stage = Hyperopt(estimator=planned, max_evals=1)
    best_pipeline = first_stage.fit(self.X_train, self.y_train).get_pipeline()

    # Stage 2: keep the prefix fixed and optimize only the final step
    # (i.e., the classifier) of the best pipeline.
    macro_f1 = make_scorer(f1_score, average="macro")
    opt_last = OptimizeLast(
        estimator=best_pipeline,
        last_optimizer=Hyperopt,
        optimizer_args={
            "scoring": macro_f1,
            "cv": 3,
            "max_evals": 2,
        },
    )
    res_last = opt_last.fit(self.X_train, self.y_train)

    from_trained = res_last.predict(self.X_test)
    from_trainable = opt_last.predict(self.X_test)
    refined_pipeline = res_last.get_pipeline()

    self.assertEqual(type(best_pipeline), type(refined_pipeline))
    assert np.array_equal(from_trainable, from_trained)
def test_using_pipeline(self):
    """Tune a pipeline with Hyperopt, then refine its last step with NSGA2.

    Uses the unpreprocessed ``credit-g`` classification dataset. After the
    multi-objective run, the summary must be non-empty and every ``loss1``
    and ``loss2`` value must lie in ``[0, 1]``. Finally the refined pipeline
    is scored on the test split and the two scorer values are printed.
    """
    import lale.datasets.openml

    (X_train, y_train), (X_test, y_test) = lale.datasets.openml.fetch(
        "credit-g", "classification", preprocess=False
    )

    project_nums = Project(columns={"type": "number"})
    project_cats = Project(columns={"type": "string"})
    planned_pipeline = (
        (project_nums >> (Normalizer | NoOp) & project_cats >> OneHotEncoder)
        >> ConcatFeatures
        >> (LGBMClassifier | GradientBoostingClassifier)
    )

    # Let's first use Hyperopt to find the best pipeline
    opt = Hyperopt(estimator=planned_pipeline, max_evals=3)
    # run optimizer
    res = opt.fit(X_train, y_train)
    best_pipeline = res.get_pipeline()

    # Now let's use NSGA2 to perform multi-objective
    # optimization on the last step (i.e., classifier)
    # in the best pipeline returned by Hyperopt
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    nsga2_args = {
        "scoring": ["roc_auc", fpr_scorer],
        "best_score": [1, 0],
        "cv": 3,
        "max_evals": 20,
        "population_size": 10,
    }
    opt_last = OptimizeLast(
        estimator=best_pipeline,
        last_optimizer=NSGA2,
        optimizer_args=nsga2_args,
    )
    res_last = opt_last.fit(X_train, y_train)

    df_summary = res_last.summary()
    print(df_summary)
    self.assertTrue(df_summary.shape[0] > 0)

    # check if summary contains valid loss values
    valid_objs = all(
        0 <= record["loss1"] <= 1 and 0 <= record["loss2"] <= 1
        for _, record in df_summary.iterrows()
    )
    self.assertTrue(valid_objs, msg="Invalid loss values in summary")

    _ = res_last.predict(X_test)
    best_pipeline2 = res_last.get_pipeline()
    self.assertEqual(type(best_pipeline), type(best_pipeline2))

    auc_scorer = get_scorer("roc_auc")
    print(
        "test_using_pipeline: \n"
        "AUC, FPR scorer values on test split - %.3f %.3f"
        % (
            auc_scorer(best_pipeline2, X_test, y_test),
            fpr_scorer(best_pipeline2, X_test, y_test),
        )
    )
def test_runtime_limit_hor(self):
    """Check that a regression Hyperopt run roughly honors ``max_opt_time``.

    Fits a planned scaler-then-LinearRegression pipeline on the first 500
    Boston rows with a 3 second optimization budget and asserts that the
    measured fit duration exceeds the budget by less than 20 %.
    """
    import time

    # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and
    # removed in 1.2 -- confirm the pinned scikit-learn version or switch
    # to another regression dataset.
    from sklearn.datasets import load_boston

    planned_pipeline = (MinMaxScaler | Normalizer) >> LinearRegression
    X, y = load_boston(return_X_y=True)

    max_opt_time = 3.0
    hor = Hyperopt(
        estimator=planned_pipeline,
        max_evals=1,
        cv=3,
        max_opt_time=max_opt_time,
        scoring="r2",
    )

    # perf_counter() is monotonic, so a system clock adjustment during the
    # fit cannot distort (or even negate) the measured duration.
    start = time.perf_counter()
    _ = hor.fit(X[:500, :], y[:500])
    opt_time = time.perf_counter() - start

    rel_diff = (opt_time - max_opt_time) / max_opt_time
    assert (
        rel_diff < 0.2
    ), "Max time: {}, Actual time: {}, relative diff: {}".format(
        max_opt_time, opt_time, rel_diff
    )
def test_with_concat_features2(self):
    """Run Hyperopt over a choice between a feature-union pipeline and KNN.

    The first alternative imputes (or not), applies PCA, concatenates the
    result with Nystroem features and feeds LogisticRegression; the second
    is a plain KNeighborsClassifier. Prints the training-set accuracy of
    the pipeline Hyperopt selects.
    """
    import warnings

    # catch_warnings() restores the previous filter state on exit, even if
    # the body raises, and leaves filters installed by other code intact.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline

        data = load_iris()
        X, y = data.data, data.target

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        pipeline = make_pipeline(
            ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
            | KNeighborsClassifier()
        )

        clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
def test_08_refine_model_with_lale(self):
    """Append a classifier choice to the stored prefix model and tune it.

    The best pipeline found by Hyperopt is stored on
    ``TestAutoAIOutputConsumption.refined_model``.

    Raises:
        AssertionError: if any step of the refinement raises.
    """
    from lale import wrap_imported_operators
    from lale.lib.lale import Hyperopt

    wrap_imported_operators()
    try:
        println_pos(
            f"type(prefix_model) {type(TestAutoAIOutputConsumption.prefix_model)}"
        )
        println_pos(f"type(LR) {type(LR)}")
        # This is for classifiers, regressors needs to have different
        # operators & different scoring metrics (e.g 'r2')
        new_model = TestAutoAIOutputConsumption.prefix_model >> (LR | Tree | KNN)
        train_X = TestAutoAIOutputConsumption.training_df.drop(
            ["Risk"], axis=1
        ).values
        train_y = TestAutoAIOutputConsumption.training_df["Risk"].values
        hyperopt = Hyperopt(
            estimator=new_model, cv=2, max_evals=3, scoring="roc_auc"
        )
        hyperopt_pipelines = hyperopt.fit(train_X, train_y)
        TestAutoAIOutputConsumption.refined_model = (
            hyperopt_pipelines.get_pipeline()
        )
    except Exception as e:
        # An explicit raise still fails the test under ``python -O`` (where
        # assert statements are removed) and keeps the original traceback
        # attached as the cause.
        raise AssertionError(
            f"Exception was thrown during model refinery: {e}"
        ) from e
def test_using_scoring(self):
    """Tune LogisticRegression with the ``'accuracy'`` scorer name.

    Checks that predicting through the object returned by ``fit`` and
    through the Hyperopt object itself gives identical results.
    """
    lr = LogisticRegression()
    clf = Hyperopt(estimator=lr, scoring="accuracy", cv=5, max_evals=1)
    trained = clf.fit(self.X_train, self.y_train)

    predictions = trained.predict(self.X_test)
    predictions_1 = clf.predict(self.X_test)
    assert np.array_equal(predictions_1, predictions)
def test_using_scoring(self):
    """Hyperopt with a scorer given by name must predict consistently.

    The predictions of the object returned by ``fit`` are compared with
    those obtained by calling ``predict`` on the Hyperopt object itself.
    """
    optimizer = Hyperopt(
        estimator=LogisticRegression(),
        scoring="accuracy",
        cv=5,
        max_evals=1,
    )
    fitted = optimizer.fit(self.X_train, self.y_train)

    from_fitted = fitted.predict(self.X_test)
    from_optimizer = optimizer.predict(self.X_test)
    assert np.array_equal(from_optimizer, from_fitted)
def test_custom_scoring(self):
    """Hyperopt with a ``make_scorer`` object must predict consistently.

    Uses a macro-averaged F1 scorer and compares the predictions of the
    object returned by ``fit`` with those of the Hyperopt object itself.
    """
    from sklearn.metrics import f1_score, make_scorer

    estimator = LogisticRegression()
    macro_f1 = make_scorer(f1_score, average="macro")
    optimizer = Hyperopt(estimator=estimator, scoring=macro_f1, cv=5, max_evals=1)
    fitted = optimizer.fit(self.X_train, self.y_train)

    from_fitted = fitted.predict(self.X_test)
    from_optimizer = optimizer.predict(self.X_test)
    assert np.array_equal(from_optimizer, from_fitted)
def dont_test_planned_pipe_right(self):
    """Fit Hyperopt on ``PCA >> LogisticRegression`` using the iris data.

    Here ``PCA`` is the class from ``sklearn.decomposition``, not a lale
    operator. NOTE(review): the ``dont_`` prefix presumably keeps test
    discovery from collecting this method -- confirm that is intended.
    """
    from sklearn.decomposition import PCA

    from lale.lib.lale import Hyperopt, NoOp
    from lale.lib.sklearn import LogisticRegression

    iris = sklearn.datasets.load_iris()
    optimizer = Hyperopt(estimator=PCA >> LogisticRegression, max_evals=1)
    optimizer.fit(iris.data, iris.target)
def test_classifier(self):
    """Exercise one classifier operator, named by ``clf_name``, end to end.

    Covers schema validation, fit/predict, Hyperopt, cross-validation,
    GridSearchCV, predict on the trainable operator, JSON export, and use
    inside a pipeline. ``clf_name`` is a dotted ``module.Class`` path taken
    from the enclosing scope.
    """
    import importlib

    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test

    # Split "pkg.module.Class" at the last dot and instantiate the class.
    module_name, _, class_name = clf_name.rpartition(".")
    class_ = getattr(importlib.import_module(module_name), class_name)
    clf = class_()

    # test_schemas_are_schemas
    validate = lale.type_checking.validate_is_schema
    validate(clf.input_schema_fit())
    validate(clf.input_schema_predict())
    validate(clf.output_schema_predict())
    validate(clf.hyperparam_schema())

    # test_init_fit_predict
    trained = clf.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)

    # test_with_hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=clf, max_evals=1)
    trained = optimizer.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)

    # test_cross_validation
    from lale.helpers import cross_val_score

    fold_scores = cross_val_score(clf, X_train, y_train, cv=2)
    self.assertEqual(len(fold_scores), 2)

    # test_with_gridsearchcv_auto_wrapped
    from sklearn.metrics import accuracy_score, make_scorer

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from lale.lib.sklearn.gradient_boosting_classifier import (
            GradientBoostingClassifierImpl,
        )

        if clf._impl_class() == GradientBoostingClassifierImpl:
            # because exponential loss does not work with iris dataset as
            # it is not binary classification
            import lale.schemas as schemas

            deviance_only = schemas.Enum(default="deviance", values=["deviance"])
            clf = clf.customize_schema(loss=deviance_only)
        grid_search = lale.lib.lale.GridSearchCV(
            estimator=clf,
            lale_num_samples=1,
            lale_num_grids=1,
            cv=2,
            scoring=make_scorer(accuracy_score),
        )
        grid_search.fit(X_train, y_train)

    # test_predict_on_trainable
    trained = clf.fit(X_train, y_train)
    clf.predict(X_train)

    # test_to_json
    clf.to_json()

    # test_in_a_pipeline
    pipeline = NoOp() >> clf
    trained = pipeline.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)
def run_hyperopt_on_planned_pipeline(planned_pipeline, max_iters=1):
    """Fit Hyperopt on the iris data for the given planned pipeline.

    Args:
        planned_pipeline: estimator handed to ``Hyperopt``.
        max_iters: passed to ``Hyperopt`` as ``max_evals``.

    The fit result is discarded; the function returns ``None``.
    """
    from sklearn.datasets import load_iris

    from lale.lib.lale.hyperopt import Hyperopt

    iris_X, iris_y = load_iris(return_X_y=True)
    optimizer = Hyperopt(estimator=planned_pipeline, max_evals=max_iters)
    optimizer.fit(iris_X, iris_y)
def test_other_algorithms(self):
    """Run Hyperopt once per search algorithm name and compare predictions.

    For each ``algo`` value, the predictions of the object returned by
    ``fit`` must equal those of the Hyperopt object itself; the algorithm
    name is used as the assertion message.
    """
    for alg in ("rand", "tpe", "atpe", "anneal"):
        optimizer = Hyperopt(
            estimator=LogisticRegression, algo=alg, cv=3, max_evals=3
        )
        fitted = optimizer.fit(self.X_train, self.y_train)

        from_fitted = fitted.predict(self.X_test)
        from_optimizer = optimizer.predict(self.X_test)
        same = np.array_equal(from_optimizer, from_fitted)
        self.assertTrue(same, alg)
def test_trained_get_pipeline_success(self):
    """A fitted Hyperopt must expose a ``get_pipeline`` attribute."""
    from sklearn.datasets import load_iris

    from lale.lib.lale import Hyperopt

    iris = load_iris()
    optimizer = Hyperopt(estimator=LogisticRegression(), max_evals=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fitted = optimizer.fit(iris.data[10:], iris.target[10:])
        # Only the attribute lookup is exercised; the method is not called.
        _ = fitted.get_pipeline
def test_lr_run(self):
    """Fit Hyperopt on LogisticRegression with a PGO object loaded from file.

    The PGO data is read from ``example_pgo_fp`` and passed to Hyperopt
    through its ``pgo`` argument; the test only checks that ``fit`` runs.
    """
    pgo = PGO.load_pgo_file(example_pgo_fp)

    from sklearn.datasets import load_iris

    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=LogisticRegression(), max_evals=5, pgo=pgo)
    iris = load_iris()
    optimizer.fit(iris.data, iris.target)
def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
    """Check that a transformer accepts 1-D input, alone and under Hyperopt.

    Only the first column of ``train_X`` / ``test_X`` is used. The
    transformer runs in parallel with ``NoOp``, the outputs are
    concatenated, passed through ``float32_transform`` and fed to ``LR``.
    """
    # Test for 1-D array as input to the transformers
    first_col_train = train_X[:, 0]
    first_col_test = test_X[:, 0]

    union = trainable & NoOp()
    trainable_pipeline = union >> ConcatFeatures() >> float32_transform() >> LR()

    # Plain fit/predict first ...
    fitted_pipeline = trainable_pipeline.fit(first_col_train, train_y)
    fitted_pipeline.predict(first_col_test)

    # ... then the same pipeline tuned by Hyperopt.
    optimizer = Hyperopt(estimator=trainable_pipeline, max_evals=1)
    fitted_optimizer = optimizer.fit(first_col_train, train_y)
    fitted_optimizer.predict(first_col_test)
def _fit_hyperopt(self, X, y):
    """Search over a planned pipeline with Hyperopt and record the results.

    The planned pipeline is ``auto_prep(X)`` followed by a choice between a
    tree-based branch (dimensionality reduction, then a tree estimator) and
    a non-tree branch (scaling, dimensionality reduction, then SGD or
    k-nearest neighbours). Regressors are used when
    ``self.prediction_type == "regression"``, classifiers otherwise.

    Hyperopt gets the evaluations and optimization time left over from
    ``self.max_evals`` and ``self.max_opt_time``. Afterwards the trial
    summary is appended to ``self._summary``, every trial whose status is
    ``hyperopt.STATUS_OK`` is stored in ``self._pipelines``, and
    ``self._name_of_best`` is updated when the best trial has a lower loss
    than the current best entry.
    """
    from lale.lib.lale import Hyperopt, NoOp
    from lale.lib.sklearn import (
        PCA,
        DecisionTreeClassifier,
        DecisionTreeRegressor,
        KNeighborsClassifier,
        KNeighborsRegressor,
        MinMaxScaler,
        RandomForestClassifier,
        RandomForestRegressor,
        RobustScaler,
        SelectKBest,
        SGDClassifier,
        SGDRegressor,
        StandardScaler,
    )

    prep = auto_prep(X)
    scale = MinMaxScaler | StandardScaler | RobustScaler | NoOp
    reduce_dims = PCA | SelectKBest | NoOp
    gbt = auto_gbt(self.prediction_type)

    if self.prediction_type == "regression":
        tree_estimators = gbt | DecisionTreeRegressor | RandomForestRegressor
        other_estimators = SGDRegressor | KNeighborsRegressor
    else:
        tree_estimators = gbt | DecisionTreeClassifier | RandomForestClassifier
        other_estimators = SGDClassifier | KNeighborsClassifier

    tree_branch = reduce_dims >> tree_estimators
    other_branch = scale >> reduce_dims >> other_estimators
    planned = prep >> (tree_branch | other_branch)

    # Budget still available after the trials already in the summary and
    # the time elapsed since self._start_fit.
    evals_left = self.max_evals - self._summary.shape[0]
    time_left = self.max_opt_time - (time.time() - self._start_fit)
    optimizer = Hyperopt(
        estimator=planned,
        max_evals=evals_left,
        scoring=self.scoring,
        best_score=self.best_score,
        max_opt_time=time_left,
        max_eval_time=self.max_eval_time,
        verbose=self.verbose,
        show_progressbar=False,
    )
    trained = optimizer.fit(X, y)

    best_trial = trained._impl._trials.best_trial
    best_result = best_trial["result"]
    if (
        "loss" in best_result
        and best_result["loss"] < self._summary.at[self._name_of_best, "loss"]
    ):
        self._name_of_best = f'p{best_trial["tid"]}'

    new_rows = trained.summary()
    self._summary = pd.concat([self._summary, new_rows])
    for trial_name in new_rows.index:
        assert trial_name not in self._pipelines
        if new_rows.at[trial_name, "status"] == hyperopt.STATUS_OK:
            self._pipelines[trial_name] = trained.get_pipeline(trial_name)
def test_trained_summary_success(self):
    """A fitted Hyperopt must expose a ``summary`` attribute."""
    from lale.lib.lale import Hyperopt

    iris = load_iris()
    optimizer = Hyperopt(
        estimator=LogisticRegression(),
        max_evals=1,
        show_progressbar=False,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fitted = optimizer.fit(iris.data[10:], iris.target[10:])
        # Only the attribute lookup is exercised; the method is not called.
        _ = fitted.summary
def test(i):
    """Fit and predict with Hyperopt using ``i`` evaluations, retrying on failure.

    Reads ``max_evals``, ``data_loader``, ``pipeline`` and ``scoring`` from
    the enclosing scope. When the fit-and-predict round raises, the call is
    retried with three times as many evaluations.

    Args:
        i: number of evaluations (``max_evals``) for this attempt.

    Returns:
        True once a fit-and-predict round succeeds, including after retries.

    Raises:
        AssertionError: if ``i`` exceeds the enclosing ``max_evals``.
    """
    if i > max_evals:
        # Raised explicitly so the recursion still terminates under
        # ``python -O``, where assert statements are removed.
        raise AssertionError(
            f"no successful Hyperopt run within max_evals={max_evals}"
        )
    try:
        X, y = data_loader()
        clf = Hyperopt(estimator=pipeline, max_evals=i, scoring=scoring)
        trained_pipeline = clf.fit(X, y)
        trained_pipeline.predict(X)
    except Exception:
        # Best-effort retry: a larger evaluation budget may find a working
        # configuration. Propagate the retry's result to the caller.
        return test(3 * i)
    return True
def doTest(self, trainable, train_X, train_y, test_X, test_y):
    """Exercise a transformer alone, in a pipeline, and under Hyperopt.

    Steps: fit and transform; check that ``transform`` on the trainable
    operator emits a ``DeprecationWarning``; export to JSON; fit and predict
    the transformer followed by ``float32_transform`` and ``LR``; then tune
    that pipeline with Hyperopt and predict.
    """
    fitted = trainable.fit(train_X, train_y)
    _ = fitted.transform(test_X)

    with self.assertWarns(DeprecationWarning):
        trainable.transform(train_X)

    trainable.to_json()

    trainable_pipeline = trainable >> float32_transform() >> LR()
    fitted_pipeline = trainable_pipeline.fit(train_X, train_y)
    fitted_pipeline.predict(test_X)

    optimizer = Hyperopt(estimator=trainable_pipeline, max_evals=1)
    fitted_optimizer = optimizer.fit(train_X, train_y)
    fitted_optimizer.predict(test_X)
def test_with_hyperopt(self):
    """Tune ``KMeans(n_clusters=3)`` with Hyperopt using a constant scorer.

    ``fit`` is called without labels, and the scorer returns 1 for every
    candidate.
    """
    from lale.lib.lale import Hyperopt

    def my_scorer(estimator, X, y=None):
        return 1

    optimizer = Hyperopt(
        estimator=KMeans(n_clusters=3),
        max_evals=5,
        verbose=True,
        scoring=my_scorer,
    )
    fitted = optimizer.fit(self.X_train)
    _ = fitted.predict(self.X_test)
def test_nested_pipeline1(self):
    """Tune a choice between KNN and an imputer-then-LogisticRegression pipeline.

    Prints the training-set accuracy of the pipeline Hyperopt selects.
    """
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score

    from lale.lib.lale import Hyperopt

    iris = load_iris()
    X, y = iris.data, iris.target

    # pipeline = KNeighborsClassifier() | (OneHotEncoder(handle_unknown = 'ignore') >> LogisticRegression())
    knn_branch = KNeighborsClassifier()
    lr_branch = SimpleImputer() >> LogisticRegression()
    optimizer = Hyperopt(estimator=knn_branch | lr_branch, max_evals=1)

    fitted = optimizer.fit(X, y)
    print(accuracy_score(y, fitted.predict(X)))
def test_runtime_limit_zero_time_hor(self):
    """With ``max_opt_time=0.0`` a regression search must yield no pipeline.

    After fitting, ``get_pipeline()`` is expected to return ``None``.
    """
    # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and
    # removed in 1.2 -- confirm the pinned scikit-learn version.
    from sklearn.datasets import load_boston

    planned = (MinMaxScaler | Normalizer) >> LinearRegression
    X, y = load_boston(return_X_y=True)

    optimizer = Hyperopt(
        estimator=planned,
        max_evals=1,
        cv=3,
        max_opt_time=0.0,
        scoring="r2",
    )
    fitted = optimizer.fit(X, y)
    assert fitted.get_pipeline() is None
def test_runtime_limit_zero_time_hoc(self):
    """With ``max_opt_time=0.0`` a classification search must yield no pipeline.

    After fitting, ``get_pipeline()`` is expected to return ``None``.
    """
    from sklearn.datasets import load_iris

    scalers = MinMaxScaler | Normalizer
    classifiers = LogisticRegression | KNeighborsClassifier
    planned = scalers >> classifiers
    X, y = load_iris(return_X_y=True)

    optimizer = Hyperopt(
        estimator=planned,
        max_evals=1,
        cv=3,
        scoring="accuracy",
        max_opt_time=0.0,
    )
    fitted = optimizer.fit(X, y)
    assert fitted.get_pipeline() is None
def test_feature_preprocessor(self):
    """Exercise one feature preprocessor, named by ``fproc_name``, end to end.

    Covers schema validation, fit/transform, transform on the trainable
    operator, JSON export, use in a pipeline before LogisticRegression, and
    tuning that pipeline with Hyperopt. ``fproc_name`` is a dotted
    ``module.Class`` path taken from the enclosing scope.
    """
    import importlib

    X_train, y_train = self.X_train, self.y_train

    # Split "pkg.module.Class" at the last dot and instantiate the class.
    module_name, _, class_name = fproc_name.rpartition(".")
    class_ = getattr(importlib.import_module(module_name), class_name)
    fproc = class_()

    from lale.lib.sklearn.one_hot_encoder import OneHotEncoder

    if isinstance(fproc, OneHotEncoder):  # type: ignore
        # fproc = OneHotEncoder(handle_unknown = 'ignore')
        # remove the hack when this is fixed
        fproc = PCA()

    # test_schemas_are_schemas
    validate = lale.type_checking.validate_is_schema
    validate(fproc.input_schema_fit())
    validate(fproc.input_schema_transform())
    validate(fproc.output_schema_transform())
    validate(fproc.hyperparam_schema())

    # test_init_fit_transform
    fitted = fproc.fit(self.X_train, self.y_train)
    _ = fitted.transform(self.X_test)

    # test_predict_on_trainable
    fitted = fproc.fit(X_train, y_train)
    fproc.transform(X_train)

    # test_to_json
    fproc.to_json()

    # test_in_a_pipeline
    # This test assumes that the output of feature processing is compatible
    # with LogisticRegression
    from lale.lib.sklearn import LogisticRegression

    pipeline = fproc >> LogisticRegression()
    fitted = pipeline.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)

    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=pipeline, max_evals=1, verbose=True, cv=3)
    fitted = optimizer.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)
def test_with_hyperopt(self):
    """Tune ``OrdinalEncoder() >> LogisticRegression()`` with Hyperopt.

    Fits on the fixture's training split and predicts on its test split;
    the test only checks that both calls run.
    """
    from lale.lib.lale import Hyperopt
    from lale.lib.sklearn import LogisticRegression, OrdinalEncoder

    fproc = OrdinalEncoder()
    pipeline = fproc >> LogisticRegression()

    # Tune the pipeline with LR using Hyperopt
    hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
    trained = hyperopt.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)
def test_preprocessing_union(self):
    """Tune a pipeline that preprocesses numeric and other columns separately.

    Columns of type ``number`` are normalized, all other columns are
    one-hot encoded, the two results are concatenated and fed to a random
    forest. Uses the unpreprocessed ``credit-g`` dataset; the test only
    checks that the Hyperopt fit runs.
    """
    from lale.datasets import openml

    (train_X, train_y), (_test_X, _test_y) = openml.fetch(
        "credit-g", "classification", preprocess=False
    )

    from lale.lib.lale import ConcatFeatures as Concat
    from lale.lib.lale import Hyperopt, Project
    from lale.lib.sklearn import Normalizer, OneHotEncoder
    from lale.lib.sklearn import RandomForestClassifier as Forest

    numeric_columns = Project(columns={"type": "number"})
    prep_num = numeric_columns >> Normalizer
    other_columns = Project(columns={"not": {"type": "number"}})
    prep_cat = other_columns >> OneHotEncoder(sparse=False)
    planned = (prep_num & prep_cat) >> Concat >> Forest

    optimizer = Hyperopt(estimator=planned, max_evals=1)
    _ = optimizer.fit(train_X, train_y)
def test_with_hyperopt(self):
    """Tune an ``IsolationForest`` with Hyperopt using a constant scorer.

    ``fit`` is called without labels, and the scorer returns 1 for every
    candidate.
    """
    from lale.lib.lale import Hyperopt

    def my_scorer(estimator, X, y=None):
        return 1

    forest = IsolationForest(max_features=1.0, max_samples=1.0)
    optimizer = Hyperopt(
        estimator=forest, max_evals=5, verbose=True, scoring=my_scorer
    )
    fitted = optimizer.fit(self.X_train)
    _ = fitted.predict(self.X_test)
def test_with_hyperopt(self):
    """Tune an OrdinalEncoder-then-LogisticRegression pipeline with Hyperopt.

    The encoder is created with ``handle_unknown="ignore"``. The test only
    checks that fit and predict run.
    """
    from lale.lib.lale import Hyperopt
    from lale.lib.sklearn import LogisticRegression, OrdinalEncoder

    encoder = OrdinalEncoder(handle_unknown="ignore")
    pipeline = encoder >> LogisticRegression()

    # Tune the pipeline with LR using Hyperopt
    optimizer = Hyperopt(estimator=pipeline, max_evals=1)
    fitted = optimizer.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)
def test_regressor(self):
    """Exercise one regressor operator, named by ``clf_name``, end to end.

    Covers schema validation, fit/predict/score, predict on the trainable
    operator, JSON export, and use inside a pipeline. Hyperopt tuning is
    run only when the operator is a ``Ridge``. ``clf_name`` is a dotted
    ``module.Class`` path taken from the enclosing scope.
    """
    import importlib

    X_train, y_train = self.X_train, self.y_train

    # Split "pkg.module.Class" at the last dot and look the class up.
    module_name, _, class_name = clf_name.rpartition(".")
    class_ = getattr(importlib.import_module(module_name), class_name)

    # These two ensembles are constructed with an explicit base estimator.
    if class_name in ("StackingRegressor", "VotingRegressor"):
        regr = class_(estimators=[("base", SGDRegressor())])
    else:
        regr = class_()

    # test_schemas_are_schemas
    validate = lale.type_checking.validate_is_schema
    validate(regr.input_schema_fit())
    validate(regr.input_schema_predict())
    validate(regr.output_schema_predict())
    validate(regr.hyperparam_schema())

    # test_init_fit_predict
    fitted = regr.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)

    # test score
    _ = fitted.score(self.X_test, self.y_test)

    # test_predict_on_trainable
    fitted = regr.fit(X_train, y_train)
    regr.predict(X_train)

    # test_to_json
    regr.to_json()

    # test_in_a_pipeline
    pipeline = NoOp() >> regr
    fitted = pipeline.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)

    # test_with_hyperopt
    from lale.lib.sklearn.ridge import Ridge

    if not isinstance(regr, Ridge):  # type: ignore
        return

    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=pipeline, max_evals=1)
    fitted = optimizer.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)