def test_using_pipeline(self):
    """Hyperopt finds a pipeline; OptimizeLast then re-tunes only its final step.

    Verifies that the re-optimized pipeline has the same type as the input
    pipeline and that predictions via the trained result and via the
    operator itself agree.
    """
    from sklearn.metrics import f1_score, make_scorer

    from lale.lib.lale import Hyperopt, OptimizeLast

    planned_pipeline = (PCA | NoOp) >> LogisticRegression

    # Step 1: search the planned space with Hyperopt for a concrete pipeline.
    optimizer = Hyperopt(estimator=planned_pipeline, max_evals=1)
    trained_opt = optimizer.fit(self.X_train, self.y_train)
    best_pipeline = trained_opt.get_pipeline()

    # Step 2: re-optimize only the last step (the classifier) of that pipeline.
    last_step_args = {
        "scoring": make_scorer(f1_score, average="macro"),
        "cv": 3,
        "max_evals": 2,
    }
    opt_last = OptimizeLast(
        estimator=best_pipeline,
        last_optimizer=Hyperopt,
        optimizer_args=last_step_args,
    )
    trained_last = opt_last.fit(self.X_train, self.y_train)

    preds_from_result = trained_last.predict(self.X_test)
    preds_from_operator = opt_last.predict(self.X_test)
    refined_pipeline = trained_last.get_pipeline()
    self.assertEqual(type(best_pipeline), type(refined_pipeline))
    assert np.array_equal(preds_from_operator, preds_from_result)
def test_get_named_pipeline(self):
    """After multi-objective NSGA2 on the last step, Pareto-front pipelines
    can be retrieved by name ('p0', 'p1', ...) and keep the original type."""
    base = MinMaxScaler() >> KNeighborsClassifier()
    trained_pipeline = base.fit(self.X_train, self.y_train)

    # Two objectives: accuracy (maximize) and false-positive rate (minimize).
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    nsga2_args = {
        'scoring': ['accuracy', fpr_scorer],
        'best_score': [1, 0],
        'cv': 3,
        'max_evals': 20,
        'population_size': 10,
    }
    opt_last = OptimizeLast(
        estimator=trained_pipeline,
        last_optimizer=NSGA2,
        optimizer_args=nsga2_args,
    )
    fitted = opt_last.fit(self.X_train, self.y_train)
    df_summary = fitted.summary()

    # The first Pareto pipeline must always exist.
    pareto_pipeline = fitted.get_pipeline(pipeline_name='p0')
    self.assertEqual(type(trained_pipeline), type(pareto_pipeline))

    # If the front has more than one point, 'p1' must be retrievable too.
    if (df_summary.shape[0] > 1):
        pareto_pipeline = fitted.get_pipeline(pipeline_name='p1')
        self.assertEqual(type(trained_pipeline), type(pareto_pipeline))
def test_using_pipeline(self):
    """Hyperopt picks a pipeline on credit-g; NSGA2 then multi-objective
    tunes its last step (AUC vs. false-positive rate).

    Fixes over the previous version (behavior-preserving):
    - dropped the unused local ``import pandas as pd``;
    - replaced ``valid_objs and all([...])`` accumulation with chained
      comparisons inside a single ``all(...)``;
    - removed the pointless ``f`` prefix on a placeholder-free string that
      was being %-formatted.
    """
    import lale.datasets.openml

    (X_train, y_train), (X_test, y_test) = lale.datasets.openml.fetch(
        'credit-g', 'classification', preprocess=False)

    # Route numeric and categorical columns through separate preprocessors.
    project_nums = Project(columns={'type': 'number'})
    project_cats = Project(columns={'type': 'string'})
    planned_pipeline = (
        (project_nums >> (Normalizer | NoOp)
         & project_cats >> OneHotEncoder)
        >> ConcatFeatures
        >> (LGBMClassifier | GradientBoostingClassifier))

    # Let's first use Hyperopt to find the best pipeline
    opt = Hyperopt(estimator=planned_pipeline, max_evals=3)
    res = opt.fit(X_train, y_train)
    best_pipeline = res.get_pipeline()

    # Now let's use NSGA2 to perform multi-objective optimization on the
    # last step (i.e., classifier) in the best pipeline returned by Hyperopt.
    fpr_scorer = make_scorer(compute_fpr, greater_is_better=False)
    nsga2_args = {
        'scoring': ['roc_auc', fpr_scorer],
        'best_score': [1, 0],
        'cv': 3,
        'max_evals': 20,
        'population_size': 10,
    }
    opt_last = OptimizeLast(
        estimator=best_pipeline,
        last_optimizer=NSGA2,
        optimizer_args=nsga2_args,
    )
    res_last = opt_last.fit(X_train, y_train)
    df_summary = res_last.summary()
    print(df_summary)
    self.assertTrue(df_summary.shape[0] > 0)

    # Every reported loss must be a valid value in [0, 1].
    valid_objs = all(
        0 <= df_summary.iloc[i]['loss1'] <= 1
        and 0 <= df_summary.iloc[i]['loss2'] <= 1
        for i in range(df_summary.shape[0])
    )
    self.assertTrue(valid_objs, msg="Invalid loss values in summary")

    _ = res_last.predict(X_test)
    best_pipeline2 = res_last.get_pipeline()
    self.assertEqual(type(best_pipeline), type(best_pipeline2))

    auc_scorer = get_scorer('roc_auc')
    print('test_using_pipeline: \n'
          'AUC, FPR scorer values on test split - %.3f %.3f'
          % (auc_scorer(best_pipeline2, X_test, y_test),
             fpr_scorer(best_pipeline2, X_test, y_test)))
def test_unspecified_arguments(self):
    """OptimizeLast with no estimator still fits, predicts consistently,
    and yields a TrainedIndividualOp as its best pipeline."""
    from lale.lib.lale import OptimizeLast
    from lale.operators import TrainedIndividualOp

    # No estimator / last_optimizer supplied — only optimizer args.
    opt = OptimizeLast(optimizer_args={"max_evals": 1})
    fitted = opt.fit(self.X_train, self.y_train)

    preds_via_result = fitted.predict(self.X_test)
    preds_via_operator = opt.predict(self.X_test)
    best_pipeline = fitted.get_pipeline()

    assert np.array_equal(preds_via_operator, preds_via_result)
    self.assertEqual(type(best_pipeline), TrainedIndividualOp)
def test_get_named_pipeline(self):
    """A named pipeline ('p1') fetched after OptimizeLast+Hyperopt can be
    refit and used for prediction, keeping the original pipeline's type."""
    from lale.lib.lale import Hyperopt, OptimizeLast

    trained = (MinMaxScaler() >> KNeighborsClassifier()).fit(
        self.X_train, self.y_train)

    opt_last = OptimizeLast(
        estimator=trained,
        last_optimizer=Hyperopt,
        optimizer_args={"cv": 3, "max_evals": 2},
    )
    fitted = opt_last.fit(self.X_train, self.y_train)

    candidate = fitted.get_pipeline(pipeline_name="p1")
    # 'p1' may not exist when fewer than two candidates were evaluated.
    if candidate is not None:
        retrained = candidate.fit(self.X_train, self.y_train)
        _ = retrained.predict(self.X_test)
        self.assertEqual(type(trained), type(retrained))
def test_using_individual_operator(self):
    """OptimizeLast also works when the estimator is a single trained
    operator (not a pipeline): the optimized result keeps its type and
    predictions from result and operator agree."""
    from lale.lib.lale import Hyperopt, OptimizeLast

    # Train an individual operator rather than a pipeline.
    trained_operator = LogisticRegression().fit(self.X_train, self.y_train)

    # Tune the (only) step with Hyperopt.
    opt_last = OptimizeLast(
        estimator=trained_operator,
        last_optimizer=Hyperopt,
        optimizer_args={"scoring": "accuracy", "cv": 3, "max_evals": 2},
    )
    fitted = opt_last.fit(self.X_train, self.y_train)

    preds_via_result = fitted.predict(self.X_test)
    preds_via_operator = opt_last.predict(self.X_test)
    best_pipeline = fitted.get_pipeline()

    self.assertEqual(type(trained_operator), type(best_pipeline))
    assert np.array_equal(preds_via_operator, preds_via_result)