Example #1
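Tests _forward_selection directly: _find_next_best_model is mocked to greedily pick the first remaining candidate, so each step should add exactly one predictor, with the forced predictors selected first. The max_predictors argument is presumably supplied via pytest.mark.parametrize; the decorator is not shown in this excerpt.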
    def test_forward_selection(self, mocker, max_predictors: int):

        # create list of elements [var1_enc, var2_enc, ..., var10_enc]
        predictors_list = [f"var{i+1}_enc" for i in range(10)]

        # extract sublist [var1_enc, var5_enc, var9_enc]:
        forced_predictors = predictors_list[::4]
        # remove these from the predictors list so the two sets are disjoint
        predictors = [
            pred for pred in predictors_list if pred not in forced_predictors
        ]

        ordered_output_list = forced_predictors + predictors

        def mock_find_next_best_model(self, train_data, target_column_name,
                                      candidate_predictors,
                                      current_predictors):
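            # greedily extend the current model with the first remaining candidate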
            return mock_model(current_predictors + candidate_predictors[0:1])

        mocker.patch(("cobra.model_building.ForwardFeatureSelection."
                      "_find_next_best_model"), mock_find_next_best_model)

        fw_selection = ForwardFeatureSelection(max_predictors=max_predictors)

        fitted_models = fw_selection._forward_selection(
            pd.DataFrame(), "target", predictors, forced_predictors)

        actual = [sorted(model.predictors) for model in fitted_models]

        expected = [
            sorted(ordered_output_list[:i + 1])
            for i in range(min(max_predictors, len(predictors_list)))
        ]

        assert actual == expected
Example #2
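Tests the public fit method with _train_model and _forward_selection mocked out. The expectation argument is a context manager presumably supplied via parametrize (e.g. does_not_raise() or pytest.raises(...)), so the same test body covers both valid and invalid max_predictors values.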
    def test_fit(self, mocker, max_predictors: int, expectation):

        # create list of elements [var1_enc, var2_enc, ..., var10_enc]
        predictors_list = [f"var{i+1}_enc" for i in range(10)]
        # extract sublist [var1_enc, var5_enc, var9_enc]:
        forced_predictors_list = predictors_list[::4]

        ordered_output_list = (forced_predictors_list + [
            pred
            for pred in predictors_list if pred not in forced_predictors_list
        ])

        fw_selection = ForwardFeatureSelection(max_predictors=max_predictors)

        def mock_train_model(self, train_data, target_column_name, predictors):
            return mock_model(predictors)

        def mock_forward_selection(self, train_data, target_column_name,
                                   predictors, forced_predictors):
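            # one mock model per step, each extending the ordered predictor list by one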
            n_models = min(max_predictors,
                           len(predictors) + len(forced_predictors))

            return [
                mock_model(ordered_output_list[:i + 1])
                for i in range(n_models)
            ]

        mocker.patch(
            "cobra.model_building.ForwardFeatureSelection._train_model",
            mock_train_model)

        mocker.patch(("cobra.model_building.ForwardFeatureSelection"
                      "._forward_selection"), mock_forward_selection)

        with expectation:
            fw_selection.fit(pd.DataFrame(),
                             "target",
                             predictors=predictors_list,
                             forced_predictors=forced_predictors_list,
                             excluded_predictors=[])

            # for each fitted model, check the list of selected predictors
            actual = [
                model.predictors for model in fw_selection._fitted_models
            ]

            expected = [
                ordered_output_list[:i + 1]
                for i in range(min(max_predictors, len(predictors_list)))
            ]

            if max_predictors == len(forced_predictors_list):
                expected = [forced_predictors_list]

            assert actual == expected
Example #3
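Tests compute_model_performances for both classification and regression: evaluate is mocked to return fixed scores, so only the shape and bookkeeping of the resulting DataFrame are checked.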
    def test_compute_model_performances(self, mocker, model_type):

        data = mock_data(add_split_col=True, model_type=model_type)

        fw_selection = ForwardFeatureSelection(model_type=model_type)
        fw_selection._fitted_models = [
            mock_model_num_pred(1, model_type=model_type),
            mock_model_num_pred(2, model_type=model_type),
            mock_model_num_pred(3, model_type=model_type)
        ]

        def mock_evaluate(self, X, y, split, metric):
            # scores are on an AUC scale; since this is a mock, the same
            # values stand in for RMSE in the regression case
            if split == "train":
                return 0.612
            else:
                return 0.609

        if model_type == "classification":
            patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate"
        elif model_type == "regression":
            patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate"

        mocker.patch(patch_fct, mock_evaluate)

        actual = (fw_selection
                  .compute_model_performances(data, "target",
                                              splits=["train", "selection"],
                                              metric=None))

        expected = pd.DataFrame([
            {"predictors": ["var1_enc"],
             "last_added_predictor": "var1_enc",
             "train_performance": 0.612, "selection_performance": 0.609,
             "model_type": model_type},
            {"predictors": ["var1_enc", "var2_enc"],
             "last_added_predictor": "var2_enc",
             "train_performance": 0.612, "selection_performance": 0.609,
             "model_type": model_type},
            {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
             "last_added_predictor": "var3_enc",
             "train_performance": 0.612, "selection_performance": 0.609,
             "model_type": model_type}
        ])

        pd.testing.assert_frame_equal(actual, expected)
Example #4
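Apparently an earlier variant of the same test, from before the model_type and metric arguments were introduced: MLModel.evaluate is patched instead of the model-specific classes.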
    def test_compute_model_performances(self, mocker):

        data = mock_data(add_split_col=True)

        fw_selection = ForwardFeatureSelection()
        fw_selection._fitted_models = [
            mock_model_num_pred(1),
            mock_model_num_pred(2),
            mock_model_num_pred(3)
        ]

        def mock_evaluate(self, X, y, split):
            if split == "train":
                return 0.612
            else:
                return 0.609

        mocker.patch(("cobra.model_building.forward_selection"
                      ".MLModel.evaluate"), mock_evaluate)

        actual = (fw_selection.compute_model_performances(
            data, "target", splits=["train", "selection"]))
        expected = pd.DataFrame([
            {"predictors": ["var1_enc"],
             "last_added_predictor": "var1_enc",
             "train_performance": 0.612, "selection_performance": 0.609},
            {"predictors": ["var1_enc", "var2_enc"],
             "last_added_predictor": "var2_enc",
             "train_performance": 0.612, "selection_performance": 0.609},
            {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
             "last_added_predictor": "var3_enc",
             "train_performance": 0.612, "selection_performance": 0.609}
        ])

        pd.testing.assert_frame_equal(actual, expected)
Example #5
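Checks the input validation in fit: the basetable must have a "split" column containing at least the train and selection sets.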
    def test_ffs_train_data_assertions(self, model_type):

        fw_selection = ForwardFeatureSelection(model_type=model_type)

        with pytest.raises(AssertionError):  # no split column
            fw_selection.fit(pd.DataFrame(), "target", predictors=[""])

        df = mock_data(add_split_col=True, model_type=model_type)
        with pytest.raises(AssertionError):  # missing selection set (need at least train & selection)
            fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""])
Example #6
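Checks that get_model_from_step raises a ValueError when no models have been fitted yet.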
    def test_get_model_from_step(self):

        forward_selection = ForwardFeatureSelection()

        with pytest.raises(ValueError):
            forward_selection.get_model_from_step(2)
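Taken together, these tests outline the public API of ForwardFeatureSelection. Below is a minimal usage sketch inferred from the calls above; the toy basetable is invented for illustration, and the exact signatures in cobra may differ between versions (Example #4 appears to predate the model_type and metric arguments).

    import numpy as np
    import pandas as pd

    from cobra.model_building import ForwardFeatureSelection

    # toy basetable: encoded predictors, a binary target and a "split" column
    # with at least the train and selection sets (see Example #5)
    rng = np.random.default_rng(0)
    n = 500
    basetable = pd.DataFrame({
        "var1_enc": rng.normal(size=n),
        "var2_enc": rng.normal(size=n),
        "var3_enc": rng.normal(size=n),
        "target": rng.integers(0, 2, size=n),
        "split": rng.choice(["train", "selection"], size=n),
    })

    fw_selection = ForwardFeatureSelection(model_type="classification",
                                           max_predictors=3)
    fw_selection.fit(basetable, "target",
                     predictors=["var2_enc", "var3_enc"],
                     forced_predictors=["var1_enc"],
                     excluded_predictors=[])

    # one row per step: predictors so far, last added predictor,
    # and the performance per requested split
    performances = fw_selection.compute_model_performances(
        basetable, "target", splits=["train", "selection"], metric=None)

    # model fitted at a given step of the selection
    model = fw_selection.get_model_from_step(1)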