def test_model_selection_with_pipeline_works_as_expected(
    self,
    pipeline_logistic: Pipeline,
    pipeline_dummy_classifier: Pipeline,
    train_iris_dataset,
):
    """The logistic pipeline should beat the dummy pipeline on accuracy."""
    candidates = [pipeline_logistic, pipeline_dummy_classifier]
    best_estimator, _results = Model.test_estimators(
        data=train_iris_dataset,
        estimators=candidates,
        metrics="accuracy",
    )
    # The first candidate (logistic) is expected to win the comparison.
    assert best_estimator.estimator == candidates[0]
def test_model_selection_with_nonstandard_metric_works_as_expected(
    self, train_iris_dataset
):
    """Requesting roc_auc should record that metric on every result."""
    candidates = [
        LogisticRegression(solver="liblinear"),
        RandomForestClassifier(n_estimators=10),
    ]
    _best, results = Model.test_estimators(
        train_iris_dataset, candidates, metrics="roc_auc"
    )
    # Each candidate's result must carry the requested metric.
    assert all("roc_auc" in result.metrics for result in results)
def test_model_selection_refits_final_model(self, train_iris_dataset):
    """With refit=True the winner is retrained on the full training set."""
    # Reference: the same estimator fit directly on all training data.
    reference = LogisticRegression(solver="liblinear").fit(
        train_iris_dataset.train_x, train_iris_dataset.train_y
    )
    refitted, _results = Model.test_estimators(
        train_iris_dataset,
        [LogisticRegression(solver="liblinear")],
        cv=2,
        refit=True,
        metrics="accuracy",
    )
    # Identical coefficients prove the refit used the full training data.
    assert np.all(reference.coef_ == refitted.estimator.coef_)
def test_model_selection_works_with_default_metric(self, train_iris_dataset):
    """Omitting `metrics` should fall back to accuracy for every result."""
    candidates = [
        LogisticRegression(solver="liblinear"),
        RandomForestClassifier(n_estimators=2),
    ]
    best_model, results = Model.test_estimators(train_iris_dataset, candidates)
    # The random forest is expected to win; one result per candidate.
    assert best_model.estimator is candidates[1]
    assert len(results) == 2
    # Default metric is accuracy for both results.
    assert results[0].metrics[0].name == "accuracy"
    assert results[1].metrics[0].name == "accuracy"
def test_test_models_logs_when_given_dir(self, tmp_path: pathlib.Path,
                                         train_iris_dataset):
    """Passing log_dir makes test_estimators write a YAML log per candidate.

    Each log file must name the dataset/estimator combination it covers.
    """
    test_models_log = tmp_path / "test_estimators"
    Model.test_estimators(
        train_iris_dataset,
        [
            RandomForestClassifier(n_estimators=10),
            DummyClassifier(strategy="prior"),
        ],
        log_dir=str(test_models_log),
        metrics="accuracy",
    )
    log_files = list(test_models_log.rglob("*.yaml"))
    # Bug fix: the original looped over rglob() and asserted inside the
    # loop, so the test passed vacuously when no log files were written.
    assert log_files, "expected at least one YAML log file to be written"
    for file in log_files:
        with file.open() as f:
            result = yaml.safe_load(f)
        model_name = result["model_name"]
        assert model_name in {
            "IrisData_RandomForestClassifier",
            "IrisData_DummyClassifier",
        }
def test_model_selection_works_with_multiple_metrics(
    self, train_iris_dataset
):
    """Both requested metrics should be scored for every candidate."""
    candidates = [
        LogisticRegression(solver="liblinear"),
        RandomForestClassifier(n_estimators=2),
    ]
    best_model, results = Model.test_estimators(
        train_iris_dataset, candidates, metrics=["accuracy", "roc_auc"]
    )
    # Random forest wins; one result per candidate, two metrics each.
    assert best_model.estimator is candidates[1]
    assert len(results) == 2
    assert len(results[0].metrics) == 2
    assert len(results[1].metrics) == 2
def test_model_selection_works_as_expected(self, train_iris_dataset):
    """Results come back sorted best-first and wrapped as Result objects."""
    candidates = [
        LogisticRegression(solver="liblinear"),
        RandomForestClassifier(n_estimators=10),
    ]
    best_model, results = Model.test_estimators(
        train_iris_dataset, candidates, metrics="accuracy"
    )
    # Random forest wins and one result is produced per candidate.
    assert best_model.estimator is candidates[1]
    assert len(results) == 2
    # Results are ordered from highest to lowest score.
    assert results[0].metrics[0].score >= results[1].metrics[0].score
    assert all(isinstance(result, Result) for result in results)
def test_model_selection_works_with_feature_pipeline(
    self, train_iris_dataset: Dataset
):
    """The winner is combined with the supplied feature pipeline."""
    candidates = [
        RandomForestClassifier(),
        DummyClassifier(strategy="stratified"),
    ]
    preprocessing = Pipeline([("scale", DFStandardScaler())])
    best_estimator, _results = Model.test_estimators(
        data=train_iris_dataset,
        estimators=candidates,
        feature_pipeline=preprocessing,
    )
    # The returned estimator should be features + winning estimator.
    expected = Pipeline(
        [("features", preprocessing), ("estimator", candidates[0])]
    )
    assert best_estimator.estimator.get_params() == expected.get_params()