예제 #1
0
def test_plot_results(regressor, tmp_path):
    """Check that ``plot_results`` labels the x axis with the dataset names.

    The same analysis is run once with ``use_test_set=True`` and once with
    ``use_test_set=False``; in both cases the x tick labels of the returned
    axes must be exactly the requested dataset names, in order.
    """
    expected_labels = ["adult", "cars", "pima"]
    # One method instance is shared by both runs.
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"],
                           export_model=True)

    for with_test_set in (True, False):
        analysis = Analysis(
            methods=[("dummy", regressor), ("linear", linear)],
            metric_names=["r2", "max_error"],
            datasets=list(expected_labels),
            use_test_set=with_test_set,
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        analysis.run()

        axes = analysis.plot_results()
        tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
        assert tick_texts == expected_labels
예제 #2
0
    def test_string(self, regressor, tmp_path):
        """Accept the selectors "all", "classification" and "regression".

        With ``n_datasets=5`` each selector must resolve to exactly five
        datasets.
        """
        for selector in ("all", "classification", "regression"):
            analysis = Analysis(
                methods=[("dummy", regressor)],
                metric_names=["r2", "max_error"],
                datasets=selector,
                n_datasets=5,
                random_state=SEED,
                output_dir=tmp_path,
                local_cache_dir=PMLB_CACHE,
            )
            assert len(analysis.datasets) == 5
예제 #3
0
def test_dropna(dropna, tmp_path):
    """Run an analysis on a dataset with NaNs in both features and target.

    The ``dropna`` fixture value is forwarded both to the mock method and to
    ``Analysis(drop_na=...)``; the test passes if ``run()`` completes.
    """
    features, target = make_regression(n_samples=200,
                                       n_features=5,
                                       random_state=SEED)
    # Plant one missing value in the feature matrix and one in the target.
    features[0, 2] = np.nan
    target[3] = np.nan

    mock_method = MockMethodNA(dropna=dropna)
    analysis = Analysis(
        methods=[("mock", mock_method)],
        metric_names=["r2", "max_error"],
        datasets=[("data_with_na", (features, target))],
        random_state=SEED,
        drop_na=dropna,
        use_test_set=False,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    analysis.run()
예제 #4
0
    def test_mixed(self, regressor, tmp_path):
        """Accept a dataset list mixing several input formats.

        The list holds two pre-split ``(name, (X_train, y_train),
        (X_test, y_test))`` entries, two ``(name, X, y)`` entries and three
        dataset names given as strings, seven datasets in total.
        """
        small_X, small_y = make_regression(
            n_samples=200, n_features=5, random_state=SEED)
        large_X, large_y = make_regression(
            n_samples=1000, n_features=50, random_state=SEED)
        unsplit = [
            ("data_1", small_X, small_y),
            ("data_2", large_X, large_y),
        ]

        # Build the pre-split variant of every in-memory dataset.
        presplit = []
        for name, X, y in unsplit:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, random_state=SEED)
            presplit.append((name, (X_train, y_train), (X_test, y_test)))

        mixed = presplit + unsplit + ["adult", "cars", "pima"]

        analysis = Analysis(
            methods=[("dummy", regressor)],
            metric_names=["r2", "max_error"],
            datasets=mixed,
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        assert len(analysis.datasets) == 7
예제 #5
0
def test_result_test_split(tmp_path):
    """Compare train-split ``max_error`` results with a manual computation.

    An analysis over three regression datasets is run with a test set. For
    every dataset the data is fetched again, split with the same seed and a
    0.25 test size, and fresh linear/tree methods are trained on the train
    part. Their max_error must match the stored "train" value to 4 decimals.
    """
    analysis = Analysis(
        methods=[
            ("linear",
             SklearnMethod(LinearRegression(), ["r2", "max_error"])),
            ("tree",
             SklearnMethod(DecisionTreeRegressor(random_state=SEED),
                           ["r2", "max_error"])),
            ("dummy", SklearnMethod(DummyRegressor(), ["r2", "max_error"])),
        ],
        metric_names=["r2", "max_error"],
        datasets="regression",
        n_datasets=3,
        random_state=SEED,
        use_test_set=True,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    analysis.run()

    assert analysis.results.shape == (3, 3, 2, 2)
    assert not np.isnan(analysis.results.values).any()

    # Fresh, untrained methods used to recompute the scores by hand.
    reference = {
        "linear": SklearnMethod(LinearRegression(), ["r2", "max_error"]),
        "tree": SklearnMethod(DecisionTreeRegressor(random_state=SEED),
                              ["r2", "max_error"]),
    }

    for dataset in analysis.datasets:
        X, y = pmlb.fetch_data(dataset, return_X_y=True)
        X_train, _X_test, y_train, _y_test = train_test_split(
            X, y, test_size=0.25, shuffle=True, random_state=SEED)

        # Train every reference method first, then compare.
        expected_max = {}
        for label, method in reference.items():
            _r2, max_err = method.train(X_train, y_train)
            expected_max[label] = max_err

        for label, max_err in expected_max.items():
            stored = float(
                analysis.results.loc[dataset, label, "max_error",
                                     "train"].values)
            assert round(stored, 4) == round(max_err, 4)
예제 #6
0
def test_result_cv(tmp_path):
    """Compare per-fold ``max_error`` results with a manual computation.

    An analysis over three regression datasets is run with
    ``use_test_set=False``. For every dataset, fresh linear/tree methods
    (with ``set_test_set(False)``) are trained on the full data and their
    max_error values must match the stored ones across all folds.
    """
    analysis = Analysis(
        methods=[
            ("linear",
             SklearnMethod(LinearRegression(), ["r2", "max_error"], cv=5)),
            ("tree",
             SklearnMethod(
                 DecisionTreeRegressor(random_state=SEED),
                 ["r2", "max_error"],
                 cv=5,
             )),
            ("dummy", SklearnMethod(DummyRegressor(), ["r2", "max_error"])),
        ],
        metric_names=["r2", "max_error"],
        datasets="regression",
        n_datasets=3,
        random_state=SEED,
        use_test_set=False,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    analysis.run()

    assert analysis.results.shape == (3, 3, 2, 5)
    assert not np.isnan(analysis.results.values).any()

    # Fresh methods used to recompute the scores by hand.
    reference = {
        "linear": SklearnMethod(LinearRegression(), ["r2", "max_error"]),
        "tree": SklearnMethod(DecisionTreeRegressor(random_state=SEED),
                              ["r2", "max_error"]),
    }
    for method in reference.values():
        method.set_test_set(False)

    for dataset in analysis.datasets:
        X, y = pmlb.fetch_data(dataset, return_X_y=True)

        # Train every reference method first, then compare.
        expected_max = {}
        for label, method in reference.items():
            _r2, max_err = method.train(X, y)
            expected_max[label] = max_err

        for label, max_err in expected_max.items():
            # No index on the last axis: all folds are checked at once.
            stored = analysis.results.loc[dataset, label, "max_error"].values
            np.testing.assert_allclose(stored, max_err, rtol=1e-13)
예제 #7
0
def test_output_dir(tmp_path):
    """Check where exported estimators are written under ``output_dir``.

    Without ``use_test_set`` being passed, one ``estimator.joblib`` per
    dataset is expected. With ``use_test_set=False``, one
    ``estimator_fold_<k>.joblib`` per dataset and fold (k = 1..n_folds) is
    expected. Both live in ``<output_dir>/Analysis_1/<dataset>/dummy``.
    """
    n_folds = 3
    exporting_method = SklearnMethod(
        LinearRegression(),
        ["neg_mean_squared_error", "r2"],
        export_model=True,
        cv=n_folds,
    )

    # Run with use_test_set left at its default.
    single_root = os.path.join(tmp_path, "test_output")
    analysis = Analysis(
        methods=[("dummy", exporting_method)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        random_state=SEED,
        output_dir=single_root,
        local_cache_dir=PMLB_CACHE,
    )
    analysis.run()

    run_dir = os.path.join(single_root, "Analysis_1")
    for dataset in ["adult", "cars", "pima"]:
        exported = os.path.join(run_dir, dataset, "dummy",
                                "estimator.joblib")
        assert os.path.exists(exported)

    # Cross-validation run, written to a separate output root.
    cv_root = os.path.join(tmp_path, "cv_output")
    analysis = Analysis(
        methods=[("dummy", exporting_method)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        random_state=SEED,
        output_dir=cv_root,
        local_cache_dir=PMLB_CACHE,
        use_test_set=False,
    )
    analysis.run()

    run_dir = os.path.join(cv_root, "Analysis_1")
    for dataset in ["adult", "cars", "pima"]:
        for fold in range(1, n_folds + 1):
            exported = os.path.join(run_dir, dataset, "dummy",
                                    f"estimator_fold_{fold}.joblib")
            assert os.path.exists(exported)
예제 #8
0
    def test_list(self, regressor, tmp_path):
        """Accept a list of dataset names and reject an unknown name.

        Five valid names must give five datasets; a list containing an
        invalid name must make the constructor raise ``ValueError``.
        """

        def build(names):
            # Construct an Analysis that differs only in its dataset list.
            return Analysis(
                methods=[("dummy", regressor)],
                metric_names=["r2", "max_error"],
                datasets=names,
                random_state=SEED,
                output_dir=tmp_path,
                local_cache_dir=PMLB_CACHE,
            )

        analysis = build(
            ["503_wind", "581_fri_c3_500_25", "adult", "cars", "pima"])
        assert len(analysis.datasets) == 5

        # An invalid dataset name must be rejected at construction time.
        with pytest.raises(ValueError):
            build(["adult", "invalid"])
예제 #9
0
 def test_tuple(self, regressor, tmp_path):
     """Accept in-memory datasets given as ``(name, X, y)`` tuples."""
     small_X, small_y = make_regression(
         n_samples=200, n_features=5, random_state=SEED)
     large_X, large_y = make_regression(
         n_samples=1000, n_features=50, random_state=SEED)
     in_memory = [
         ("data_1", small_X, small_y),
         ("data_2", large_X, large_y),
     ]

     analysis = Analysis(
         methods=[("dummy", regressor)],
         metric_names=["r2", "max_error"],
         datasets=in_memory,
         random_state=SEED,
         output_dir=tmp_path,
         local_cache_dir=PMLB_CACHE,
     )
     # Both tuples must be registered as datasets.
     assert len(analysis.datasets) == 2
예제 #10
0
    (
        "lightgbm",
        SklearnMethod(LGBMClassifier(n_jobs=-1, verbose=0), ["accuracy", "f1_micro"]),
    ),
    (
        "catboost",
        SklearnMethod(
            CatBoostClassifier(thread_count=-1, verbose=0), ["accuracy", "f1_micro"]
        ),
    ),
    (
        "gbm",
        SklearnMethod(GradientBoostingClassifier(verbose=0), ["accuracy", "f1_micro"]),
    ),
]


# Configure an analysis of every entry in `methods` on 10 datasets picked by
# the "classification" selector, seeded with SEED.
an = Analysis(
    methods=methods,
    # NOTE(review): the methods above are scored with ["accuracy", "f1_micro"];
    # these look like display names for those two scores — confirm how
    # Analysis matches them (presumably by position).
    metric_names=["accuracy", "f1 score"],
    datasets="classification",
    n_datasets=10,
    random_state=SEED,
    # use_test_set=False   # to use cross-validation
)
an.run()

# Print the "f1 score" results as a table, then plot the same metric.
print(an.get_result_as_df("f1 score"))
an.plot_results("f1 score")
plt.show()
예제 #11
0
def test_results_none(tmp_path):
    """Check that no results are stored when ``metric_names`` is ``None``.

    The analysis is run once with ``use_test_set=True`` and once with
    ``use_test_set=False``. In both cases ``results`` must be ``None`` after
    ``run()``, and ``get_result_as_df()`` and ``plot_results()`` must each
    raise ``AttributeError``.
    """
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"])
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
    )
    dummy = SklearnMethod(DummyRegressor(), ["r2", "max_error"])

    # The same method instances are reused for both configurations.
    for use_test_set in (True, False):
        an = Analysis(
            methods=[("linear", linear), ("tree", tree), ("dummy", dummy)],
            metric_names=None,
            datasets="regression",
            n_datasets=3,
            random_state=SEED,
            use_test_set=use_test_set,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        an.run()

        # Identity check: `== None` would call the result object's own
        # __eq__ if results were ever populated, which need not return a
        # plain bool.
        assert an.results is None

        with pytest.raises(AttributeError):
            an.get_result_as_df()

        with pytest.raises(AttributeError):
            an.plot_results()
예제 #12
0
def test_get_results_as_df(regressor, tmp_path):
    """Check ``get_result_as_df`` against the raw ``results`` container.

    With a test set: the default frame holds the "test" scores, and
    ``train=True`` adds a (method, "train") column level. Without a test
    set: the default frame holds per-method "mean"/"std" over the folds, and
    ``mean_folds=False`` exposes the individual fold scores.
    """
    skl = SklearnMethod(LinearRegression(), ["r2", "max_error"],
                        export_model=True)

    def run_analysis(use_test_set):
        # Build and run an analysis that differs only in `use_test_set`.
        analysis = Analysis(
            methods=[("dummy", regressor), ("linear", skl)],
            metric_names=["r2", "max_error"],
            datasets=["adult", "cars", "pima"],
            use_test_set=use_test_set,
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        analysis.run()
        return analysis

    # ---- using a test set ----
    an = run_analysis(True)

    # default frame: test scores only
    frame = an.get_result_as_df("r2")
    observed = frame.loc["adult", "dummy"]
    expected = an.results.loc["adult", "dummy", "r2", "test"].values.item()
    assert observed == expected

    frame = an.get_result_as_df("max_error")
    observed = frame.loc["cars", "linear"]
    expected = an.results.loc["cars", "linear", "max_error",
                              "test"].values.item()
    assert observed == expected

    # train and test scores side by side
    frame = an.get_result_as_df("r2", train=True)
    observed = frame.loc["adult", ("linear", "train")]
    expected = an.results.loc["adult", "linear", "r2", "train"].values.item()
    assert observed == expected

    # ---- using cross-validation (5 folds) ----
    an = run_analysis(False)

    # mean and std over the folds
    aggregate_checks = (
        ("r2", "adult", "dummy"),
        ("max_error", "cars", "linear"),
    )
    for metric, dataset, method in aggregate_checks:
        frame = an.get_result_as_df(metric)
        fold_scores = an.results.loc[dataset, method, metric, :]
        assert (frame.loc[dataset, (method, "mean")] ==
                fold_scores.mean().item())
        assert (frame.loc[dataset, (method, "std")] ==
                fold_scores.std().item())

    # individual folds
    frame = an.get_result_as_df("r2", mean_folds=False)
    every_fold = ("dummy", slice(None))
    assert frame.loc["adult", every_fold].shape == (5, )
    np.testing.assert_array_equal(frame.loc["adult", every_fold],
                                  an.results.loc["adult", "dummy", "r2"])

    frame = an.get_result_as_df("max_error", mean_folds=False)
    every_fold = ("linear", slice(None))
    np.testing.assert_array_equal(
        frame.loc["cars", every_fold],
        an.results.loc["cars", "linear", "max_error"],
    )
예제 #13
0
        self.gbmestimator.fit(X_train, y_train)

        fig, ax = plt.subplots()
        self.plot_imp(
            self.gbmestimator,
            ax=ax,
            xlabel="Feature importance",
            importance_type="gain",
        )
        # self.output_dir is made available through the Analysis class
        fig.savefig(os.path.join(self.output_dir, "importance.png"),
                    bbox_inches="tight")
        plt.close(fig)


# (name, method) pairs handed to Analysis: each wraps a gradient-boosting
# classifier in GBImportancePlot.
methods = [
    ("xgb", GBImportancePlot(XGBClassifier(n_jobs=-1, verbose=0))),
    ("lgb", GBImportancePlot(LGBMClassifier(n_jobs=-1, verbose=0))),
]

# Run both methods on 3 datasets picked by the "classification" selector.
# No metric_names are passed here.
an = Analysis(
    methods=methods,
    datasets="classification",
    n_datasets=3,
    random_state=SEED,
    use_test_set=False,  # our method does not implement testing
    output_dir="importance_output",
)

an.run()