예제 #1
0
    def test_train_model_followed_by_score_model_returns_correctly(
            self, pipeline_logistic: Pipeline, train_iris_dataset):
        """Train, then score, a pipeline-backed Model and expect ``result`` to be a Result."""
        pipeline_model = Model(pipeline_logistic)

        # Both steps run against the same dataset fixture.
        pipeline_model.train_estimator(train_iris_dataset)
        pipeline_model.score_estimator(train_iris_dataset)

        assert isinstance(pipeline_model.result, Result)
예제 #2
0
    def test_score_estimator_creates_train_test_data_classification(
            self, iris_dataset, train_iris_dataset):
        """
        After scoring a LogisticRegression on a freshly constructed iris dataset,
        its train/test attributes must equal those of ``train_iris_dataset``.
        """
        expected = train_iris_dataset

        classifier = Model(LogisticRegression())
        scored = iris_dataset()
        classifier.score_estimator(scored)

        # Test partitions first, then train partitions.
        pd.testing.assert_frame_equal(scored.test_x, expected.test_x)
        assert np.array_equal(scored.test_y, expected.test_y)
        pd.testing.assert_frame_equal(scored.train_x, expected.train_x)
        assert np.array_equal(scored.train_y, expected.train_y)
예제 #3
0
    def test_score_estimator_creates_train_test_data(self, boston_dataset,
                                                     train_boston_dataset):
        """
        After scoring a LinearRegression on a freshly constructed boston dataset,
        its train/test attributes must equal those of ``train_boston_dataset``.
        """
        expected = train_boston_dataset

        regressor = Model(LinearRegression())
        scored = boston_dataset()
        regressor.score_estimator(scored)

        # Test partitions first, then train partitions.
        pd.testing.assert_frame_equal(scored.test_x, expected.test_x)
        assert np.array_equal(scored.test_y, expected.test_y)
        pd.testing.assert_frame_equal(scored.train_x, expected.train_x)
        assert np.array_equal(scored.train_y, expected.train_y)
예제 #4
0
    def test_regression_model_can_be_saved(self, classifier: Model,
                                           tmp_path: pathlib.Path,
                                           train_iris_dataset):
        """
        Score the ``classifier`` fixture, save it through a FileStorage on
        ``tmp_path``, load it back through a second FileStorage on the same
        path and compare the estimator parameters.
        """
        classifier.score_estimator(train_iris_dataset)

        # Separate storage objects for loading and saving, both on tmp_path.
        reader = FileStorage(tmp_path)
        writer = FileStorage(tmp_path)

        model_path = classifier.save_estimator(writer)
        assert model_path.exists()

        restored = classifier.load_estimator(model_path, storage=reader)
        restored_params = restored.estimator.get_params()
        assert restored_params == classifier.estimator.get_params()
예제 #5
0
 def test_rare_feature_encoder_works_gridsearch(self,
                                                train_iris_dataset: Dataset,
                                                rare: RareFeatureEncoder):
     """Score a gridsearch built around the ``rare`` encoder and expect a Result."""
     search_model = Model(create_gridsearch(rare))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #6
0
    def test_has_correct_title_when_using_trees(self, dataset: Dataset):
        """The feature importance title of a RandomForest must not contain "Class 10"."""
        forest = Model(RandomForestClassifier())
        scored = forest.score_estimator(dataset)

        # class_index is passed explicitly, yet its label must stay out of the title.
        axes = scored.plot.feature_importance(class_index=10)
        title = axes.title.get_text()
        assert "Class 10" not in title
예제 #7
0
    def test_score_estimator_creates_train_test_data_with_changed_config_and_classification_data(
            self, iris_dataset):
        """
        With RANDOM_STATE=1 and TEST_SIZE=0.50 set on the model config, scoring
        a freshly constructed iris dataset must yield the same train/test data
        as calling ``create_train_test(stratify=True, seed=1, test_size=0.50)``
        on another fresh dataset.
        """
        model = Model(LogisticRegression())
        try:
            model.config.RANDOM_STATE = 1
            model.config.TEST_SIZE = 0.50
            data = iris_dataset()
            model.score_estimator(data)

            # Reference split created by hand with the same seed and test size.
            test = iris_dataset()
            test.create_train_test(stratify=True, seed=1, test_size=0.50)

            pd.testing.assert_frame_equal(data.test_x, test.test_x)
            assert np.array_equal(data.test_y, test.test_y)
            pd.testing.assert_frame_equal(data.train_x, test.train_x)
            assert np.array_equal(data.train_y, test.train_y)
        finally:
            # Reset even when scoring or an assertion fails, so the overridden
            # values cannot carry over into later tests.
            # NOTE(review): presumably the config is shared state - confirm.
            model.config.reset_config()
예제 #8
0
 def test_roc_curve_fails_correctly_without_predict_proba(self):
     """Plotting a ROC curve for an SVC scored on the iris demo data must raise VizError."""
     iris = load_demo_dataset("iris")
     scored = Model(SVC(gamma="scale")).score_estimator(iris)
     with pytest.raises(VizError):
         scored.plot.roc_curve()
     plt.close()
예제 #9
0
 def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
     """The feature importance plot of a RandomForest must be labelled
     "Feature Importances" on the x-axis"""
     forest = Model(RandomForestClassifier())
     axes = forest.score_estimator(dataset).plot.feature_importance()
     xlabel = axes.get_xlabel()
     assert xlabel == "Feature Importances"
     plt.close()
예제 #10
0
    def test_can_score_estimator_with_multiple_metrics(self,
                                                       train_iris_dataset):
        """Scoring with a list of two metric names must yield both metrics in the result."""
        classifier = Model(LogisticRegression(solver="liblinear"))
        scored = classifier.score_estimator(
            train_iris_dataset,
            metrics=["accuracy", "roc_auc"],
        )

        assert len(scored.metrics) == 2
        for metric_name in ("accuracy", "roc_auc"):
            assert metric_name in scored.metrics
예제 #11
0
 def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
     """Expect the permutation importance plot of a RandomForest to carry the
     permuted feature importance xlabel"""
     forest = Model(RandomForestClassifier())
     scored = forest.score_estimator(dataset)
     axes = scored.plot.permutation_importance()
     expected_label = "Permuted Feature Importance (Accuracy) Relative to Baseline"
     assert axes.get_xlabel() == expected_label
     plt.close()
예제 #12
0
    def test_score_estimator_creates_train_test_data_with_changed_config(
            self, boston_dataset):
        """
        With RANDOM_STATE=1, TEST_SIZE=0.5 and TRAIN_TEST_SHUFFLE=False set on
        the model config, scoring a freshly constructed boston dataset must
        yield the same train/test data as calling ``create_train_test`` by hand
        with the matching arguments on another fresh dataset.
        """
        model = Model(LinearRegression())
        try:
            model.config.RANDOM_STATE = 1
            model.config.TEST_SIZE = 0.5
            model.config.TRAIN_TEST_SHUFFLE = False
            data = boston_dataset()
            model.score_estimator(data)

            # Reference split created by hand with the same settings.
            test = boston_dataset()
            test.create_train_test(stratify=False,
                                   shuffle=False,
                                   seed=1,
                                   test_size=0.5)

            pd.testing.assert_frame_equal(data.test_x, test.test_x)
            assert np.array_equal(data.test_y, test.test_y)
            pd.testing.assert_frame_equal(data.train_x, test.train_x)
            assert np.array_equal(data.train_y, test.train_y)
        finally:
            # Reset even when scoring or an assertion fails, so the overridden
            # values cannot carry over into later tests.
            # NOTE(review): presumably the config is shared state - confirm.
            model.config.reset_config()
예제 #13
0
    def test_raises_if_passed_model_without_feature_importance_or_coefs(
            self, dataset: Dataset):
        """
        Plotting feature importance for a scored KNeighborsClassifier, an
        estimator without coefficients or feature_importance, must raise
        VizError
        """
        knn = Model(KNeighborsClassifier())
        scored = knn.score_estimator(dataset)

        with pytest.raises(VizError):
            scored.plot.feature_importance()
 def test_pr_curve_fails_correctly_without_predict_proba(self):
     """
     The precision-recall plot must raise VizError when the estimator
     has no predict_proba method
     """
     iris = load_demo_dataset("iris")
     scored = Model(SVC(gamma="scale")).score_estimator(iris)
     with pytest.raises(VizError):
         scored.plot.precision_recall_curve()
     plt.close()
예제 #15
0
    def test_log_context_manager_logs_when_scoring_model(
            self, tmp_path: pathlib.Path, train_iris_dataset):
        """
        Scoring inside ``model.log(...)`` must write at least one
        ``LinearRegression_*`` file under the log directory, and the logged r2
        metric and estimator name must match the returned result.
        """
        model = Model(LinearRegression())

        runs = tmp_path / "runs"
        with model.log(str(runs)):
            result = model.score_estimator(train_iris_dataset)

        log_files = list(runs.rglob("LinearRegression_*"))
        # Without this guard the loop below is skipped when nothing was
        # written, and the test would pass without checking anything.
        assert log_files, "expected at least one LinearRegression_* log file"

        for file in log_files:
            with file.open() as f:
                log_result = yaml.safe_load(f)

            assert result.metrics.score == log_result["metrics"]["r2"]
            assert result.model.estimator_name == log_result["estimator_name"]
예제 #16
0
    def test_plots_have_correct_title_when_using_pipeline(
            self, dataset: Dataset):
        """
        The permutation importance plot of a scored Pipeline must be titled
        after the final estimator rather than Pipeline, and show 4 y tick labels
        """
        steps = [
            ("scale", DFStandardScaler()),
            ("clf", RandomForestClassifier(n_estimators=10)),
        ]
        scored = Model(Pipeline(steps)).score_estimator(dataset)
        axes = scored.plot.permutation_importance()

        expected_title = "Permutation Importances (Accuracy) - RandomForestClassifier"
        assert axes.title.get_text() == expected_title

        tick_labels = list(axes.get_yticklabels())
        assert 4 == len(tick_labels)
        plt.close()
예제 #17
0
def train_model(year, month, day, graphs=True, clf=None):
    """Score an estimator on the AirBnB dataset for one date, save it and optionally plot.

    Args:
        year: Forwarded to ``AirBnBDataset``.
        month: Forwarded to ``AirBnBDataset``.
        day: Forwarded to ``AirBnBDataset``.
        graphs: When truthy, save feature importance, residuals and prediction
            error plots under ``VISUALIZATIONS``.
        clf: Estimator to wrap in ``Model``. When ``None`` (the default) a new
            ``RandomForestRegressor`` is created for this call, so calls never
            share one estimator instance.

    Returns:
        The result returned by ``model.score_estimator``.
    """
    if clf is None:
        # Build the default here, not in the signature: a default argument is
        # evaluated once at definition time and would be reused by every call.
        clf = RandomForestRegressor()

    dataset = AirBnBDataset(year=year, month=month, day=day)
    dataset.create_train_test()

    model = Model(clf, feature_pipeline=features)
    result = model.score_estimator(dataset)
    # NOTE(review): N_JOBS is set after scoring, so it cannot have influenced
    # the score_estimator call above - confirm this ordering is intended.
    model.config.N_JOBS = 6
    with model.log("randomforest"):
        model.save_estimator()

    if graphs:
        # NOTE(review): the feature importance plot is written to a file named
        # confusion_matrix.png - confirm the file name is intended.
        result.plot.feature_importance()
        plt.savefig(VISUALIZATIONS / "confusion_matrix.png")

        result.plot.residuals()
        plt.savefig(VISUALIZATIONS / "residuals.png")

        result.plot.prediction_error()
        plt.savefig(VISUALIZATIONS / "prediction_error.png")

    return result
예제 #18
0
    def test_can_score_estimator_with_default_metric(self, train_iris_dataset):
        """Scoring a LogisticRegression without naming a metric must report "accuracy"."""
        classifier = Model(LogisticRegression(solver="liblinear"))
        scored = classifier.score_estimator(train_iris_dataset)

        metric_name = scored.metrics.name
        assert metric_name == "accuracy"
 def classifier_result(self) -> Result:
     """Set up a classifier Result by scoring LogisticRegression on the iris demo data"""
     iris = load_demo_dataset("iris")
     return Model(LogisticRegression()).score_estimator(iris)
예제 #20
0
 def result_cv(self, model: Model) -> Result:
     """Set up a Result by scoring ``model`` on the boston demo data with cv=2"""
     boston = load_demo_dataset("boston")
     cv_result = model.score_estimator(boston, cv=2)
     return cv_result
예제 #21
0
 def result(self, model: Model) -> Result:
     """Set up a Result by scoring ``model`` on the boston demo data without cv"""
     boston = load_demo_dataset("boston")
     plain_result = model.score_estimator(boston)
     return plain_result
예제 #22
0
 def test_df_selector_works_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around a Select on one iris column and expect a Result."""
     selector = Select("sepal length (cm)")
     search_model = Model(create_gridsearch(selector))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #23
0
 def test_binarize_works_in_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around Binarize(value=2) and expect a Result."""
     binarizer = Binarize(value=2)
     search_model = Model(create_gridsearch(binarizer))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #24
0
 def test_func_transformer_works_in_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around FuncTransformer(np.mean) and expect a Result."""
     transformer = FuncTransformer(np.mean)
     search_model = Model(create_gridsearch(transformer))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #25
0
 def test_dfrowfunc_works_in_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around DFRowFunc(strategy="mean") and expect a Result."""
     row_func = DFRowFunc(strategy="mean")
     search_model = Model(create_gridsearch(row_func))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #26
0
 def test_standard_scaler_works_in_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around DFStandardScaler and expect a Result."""
     scaler = DFStandardScaler()
     search_model = Model(create_gridsearch(scaler))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #27
0
 def test_renamer_works_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around a four-name Renamer and expect a Result."""
     renamer = Renamer(["1", "2", "3", "4"])
     search_model = Model(create_gridsearch(renamer))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #28
0
 def test_to_categorical_works_gridsearch(self, train_iris_dataset):
     """Score a gridsearch built around ToCategorical and expect a Result."""
     to_categorical = ToCategorical()
     search_model = Model(create_gridsearch(to_categorical))
     scored = search_model.score_estimator(train_iris_dataset)
     assert isinstance(scored, Result)
예제 #29
0
 def regression_result(self) -> Result:
     """Set up a regression Result by scoring LinearRegression on the boston demo data"""
     boston = load_demo_dataset("boston")
     return Model(LinearRegression()).score_estimator(boston)
예제 #30
0
    def test_can_score_estimator_with_specified_metric(self,
                                                       train_iris_dataset):
        """Scoring with ``metrics="roc_auc"`` must report "roc_auc" as the metric name."""
        classifier = Model(LogisticRegression(solver="liblinear"))
        scored = classifier.score_estimator(train_iris_dataset, metrics="roc_auc")

        metric_name = scored.metrics.name
        assert metric_name == "roc_auc"