def test_plot_results(regressor, tmp_path):
    skl = SklearnMethod(LinearRegression(), ["r2", "max_error"],
                        export_model=True)
    # identical expectations with and without a test set
    for use_test_set in (True, False):
        an = Analysis(
            methods=[("dummy", regressor), ("linear", skl)],
            metric_names=["r2", "max_error"],
            datasets=["adult", "cars", "pima"],
            use_test_set=use_test_set,
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        an.run()
        ax = an.plot_results()
        xticklabels = [lab.get_text() for lab in ax.get_xticklabels()]
        assert xticklabels == ["adult", "cars", "pima"]
def test_string(self, regressor, tmp_path):
    # should work with "all", "classification", and "regression"
    for task in ("all", "classification", "regression"):
        an = Analysis(
            methods=[("dummy", regressor)],
            metric_names=["r2", "max_error"],
            datasets=task,
            n_datasets=5,
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        assert len(an.datasets) == 5
def test_dropna(dropna, tmp_path):
    X, y = make_regression(n_samples=200, n_features=5, random_state=SEED)
    # inject missing values into both the features and the target
    X[0, 2] = np.nan
    y[3] = np.nan
    an = Analysis(
        methods=[("mock", MockMethodNA(dropna=dropna))],
        metric_names=["r2", "max_error"],
        datasets=[("data_with_na", (X, y))],
        random_state=SEED,
        drop_na=dropna,
        use_test_set=False,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()
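
# MockMethodNA is defined elsewhere in the test suite; for reference, a
# minimal sketch of what it could look like, assuming the library's method
# interface is a train(X, y) hook that returns one value per metric (as
# SklearnMethod does in the tests above). The base class and return shape
# are assumptions, not the actual implementation:
class MockMethodNA:
    """Hypothetical stand-in: checks whether NaN rows were dropped."""

    def __init__(self, dropna):
        self.dropna = dropna

    def train(self, X, y):
        has_na = np.isnan(X).any() or np.isnan(y).any()
        # with drop_na=True, Analysis should have removed the NaN rows
        # before calling train(); with drop_na=False they must still be there
        assert has_na != self.dropna
        return 0.0, 0.0  # placeholder "r2" and "max_error" values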
def test_mixed(self, regressor, tmp_path):
    # should work with a mix of pmlb names, (name, X, y) tuples, and
    # pre-split (name, (X_train, y_train), (X_test, y_test)) tuples
    datasets = [
        (
            "data_1",
            *make_regression(n_samples=200, n_features=5, random_state=SEED),
        ),
        (
            "data_2",
            *make_regression(n_samples=1000, n_features=50,
                             random_state=SEED),
        ),
    ]

    def test_split(data):
        name, X, y = data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=SEED)
        return name, (X_train, y_train), (X_test, y_test)

    datasets = (list(map(test_split, datasets)) + datasets +
                ["adult", "cars", "pima"])
    an = Analysis(
        methods=[("dummy", regressor)],
        metric_names=["r2", "max_error"],
        datasets=datasets,
        random_state=SEED,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    assert len(an.datasets) == 7
def test_result_test_split(tmp_path):
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"])
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
    )
    dummy = SklearnMethod(DummyRegressor(), ["r2", "max_error"])
    an = Analysis(
        methods=[("linear", linear), ("tree", tree), ("dummy", dummy)],
        metric_names=["r2", "max_error"],
        datasets="regression",
        n_datasets=3,
        random_state=SEED,
        use_test_set=True,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()
    assert an.results.shape == (3, 3, 2, 2)
    assert not np.isnan(an.results.values).any()

    # retrain fresh copies manually and check that the results match
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"])
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
    )
    for data in an.datasets:
        X, y = pmlb.fetch_data(data, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, shuffle=True, random_state=SEED)
        linear_r2, linear_max = linear.train(X_train, y_train)
        tree_r2, tree_max = tree.train(X_train, y_train)
        assert round(
            float(an.results.loc[data, "linear", "max_error",
                                 "train"].values), 4) == round(linear_max, 4)
        assert round(
            float(an.results.loc[data, "tree", "max_error",
                                 "train"].values), 4) == round(tree_max, 4)
def test_result_cv(tmp_path):
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"], cv=5)
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
        cv=5,
    )
    dummy = SklearnMethod(DummyRegressor(), ["r2", "max_error"])
    an = Analysis(
        methods=[("linear", linear), ("tree", tree), ("dummy", dummy)],
        metric_names=["r2", "max_error"],
        datasets="regression",
        n_datasets=3,
        random_state=SEED,
        use_test_set=False,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()
    assert an.results.shape == (3, 3, 2, 5)
    assert not np.isnan(an.results.values).any()

    # retrain fresh copies manually and check that the results match
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"])
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
    )
    linear.set_test_set(False)
    tree.set_test_set(False)
    for data in an.datasets:
        X, y = pmlb.fetch_data(data, return_X_y=True)
        linear_r2, linear_max = linear.train(X, y)
        tree_r2, tree_max = tree.train(X, y)
        # check all folds
        an_linear_max = an.results.loc[data, "linear", "max_error"].values
        np.testing.assert_allclose(an_linear_max, linear_max, rtol=1e-13)
        an_tree_max = an.results.loc[data, "tree", "max_error"].values
        np.testing.assert_allclose(an_tree_max, tree_max, rtol=1e-13)
def test_output_dir(tmp_path):
    n_folds = 3
    skl = SklearnMethod(
        LinearRegression(),
        ["neg_mean_squared_error", "r2"],
        export_model=True,
        cv=n_folds,
    )

    # using a test set: one exported estimator per dataset
    test_path = os.path.join(tmp_path, "test_output")
    an = Analysis(
        methods=[("dummy", skl)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        random_state=SEED,
        output_dir=test_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()
    out_dir = os.path.join(test_path, "Analysis_1")
    exports = (os.path.join(out_dir, name, "dummy", "estimator.joblib")
               for name in ["adult", "cars", "pima"])
    for export in exports:
        assert os.path.exists(export)

    # cross-validation: one exported estimator per dataset and fold
    test_path = os.path.join(tmp_path, "cv_output")
    an = Analysis(
        methods=[("dummy", skl)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        random_state=SEED,
        output_dir=test_path,
        local_cache_dir=PMLB_CACHE,
        use_test_set=False,
    )
    an.run()
    out_dir = os.path.join(test_path, "Analysis_1")
    exports = (os.path.join(out_dir, name, "dummy",
                            f"estimator_fold_{fold}.joblib")
               for name, fold in itertools.product(
                   ["adult", "cars", "pima"], range(1, n_folds + 1)))
    for export in exports:
        assert os.path.exists(export)
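
# Taken together, the assertions above pin down the export layout:
#   <output_dir>/Analysis_1/<dataset>/<method>/estimator.joblib           (test split)
#   <output_dir>/Analysis_1/<dataset>/<method>/estimator_fold_<k>.joblib  (k-fold CV)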
def test_list(self, regressor, tmp_path):
    # should work with a list of valid pmlb dataset names
    an = Analysis(
        methods=[("dummy", regressor)],
        metric_names=["r2", "max_error"],
        datasets=[
            "503_wind", "581_fri_c3_500_25", "adult", "cars", "pima"
        ],
        random_state=SEED,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    assert len(an.datasets) == 5

    # should raise if an invalid dataset name is passed
    with pytest.raises(ValueError):
        an = Analysis(
            methods=[("dummy", regressor)],
            metric_names=["r2", "max_error"],
            datasets=["adult", "invalid"],
            random_state=SEED,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
def test_tuple(self, regressor, tmp_path):
    # should work with a list of (name, X, y) tuples
    datasets = [
        (
            "data_1",
            *make_regression(n_samples=200, n_features=5, random_state=SEED),
        ),
        (
            "data_2",
            *make_regression(n_samples=1000, n_features=50,
                             random_state=SEED),
        ),
    ]
    an = Analysis(
        methods=[("dummy", regressor)],
        metric_names=["r2", "max_error"],
        datasets=datasets,
        random_state=SEED,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    assert len(an.datasets) == 2
( "lightgbm", SklearnMethod(LGBMClassifier(n_jobs=-1, verbose=0), ["accuracy", "f1_micro"]), ), ( "catboost", SklearnMethod( CatBoostClassifier(thread_count=-1, verbose=0), ["accuracy", "f1_micro"] ), ), ( "gbm", SklearnMethod(GradientBoostingClassifier(verbose=0), ["accuracy", "f1_micro"]), ), ] an = Analysis( methods=methods, metric_names=["accuracy", "f1 score"], datasets="classification", n_datasets=10, random_state=SEED, # use_test_set=False # to use cross-validation ) an.run() print(an.get_result_as_df("f1 score")) an.plot_results("f1 score") plt.show()
def test_results_none(tmp_path):
    linear = SklearnMethod(LinearRegression(), ["r2", "max_error"])
    tree = SklearnMethod(
        DecisionTreeRegressor(random_state=SEED),
        ["r2", "max_error"],
    )
    dummy = SklearnMethod(DummyRegressor(), ["r2", "max_error"])
    # with metric_names=None there are no results to collect, with or
    # without a test set
    for use_test_set in (True, False):
        an = Analysis(
            methods=[("linear", linear), ("tree", tree), ("dummy", dummy)],
            metric_names=None,
            datasets="regression",
            n_datasets=3,
            random_state=SEED,
            use_test_set=use_test_set,
            output_dir=tmp_path,
            local_cache_dir=PMLB_CACHE,
        )
        an.run()
        assert an.results is None
        with pytest.raises(AttributeError):
            an.get_result_as_df()
        with pytest.raises(AttributeError):
            an.plot_results()
def test_get_results_as_df(regressor, tmp_path):
    skl = SklearnMethod(LinearRegression(), ["r2", "max_error"],
                        export_model=True)
    # using a test set
    an = Analysis(
        methods=[("dummy", regressor), ("linear", skl)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        use_test_set=True,
        random_state=SEED,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()

    # test set only
    df = an.get_result_as_df("r2")
    assert (df.loc["adult", "dummy"] ==
            an.results.loc["adult", "dummy", "r2", "test"].values.item())
    df = an.get_result_as_df("max_error")
    assert (df.loc["cars", "linear"] ==
            an.results.loc["cars", "linear", "max_error",
                           "test"].values.item())

    # train and test sets
    df = an.get_result_as_df("r2", train=True)
    assert (df.loc["adult", ("linear", "train")] ==
            an.results.loc["adult", "linear", "r2", "train"].values.item())

    # using 5-fold cross-validation
    an = Analysis(
        methods=[("dummy", regressor), ("linear", skl)],
        metric_names=["r2", "max_error"],
        datasets=["adult", "cars", "pima"],
        use_test_set=False,
        random_state=SEED,
        output_dir=tmp_path,
        local_cache_dir=PMLB_CACHE,
    )
    an.run()

    # check the mean and std over the folds
    df = an.get_result_as_df("r2")
    assert (df.loc["adult", ("dummy", "mean")] ==
            an.results.loc["adult", "dummy", "r2", :].mean().item())
    assert (df.loc["adult", ("dummy", "std")] ==
            an.results.loc["adult", "dummy", "r2", :].std().item())
    df = an.get_result_as_df("max_error")
    assert (df.loc["cars", ("linear", "mean")] ==
            an.results.loc["cars", "linear", "max_error", :].mean().item())
    assert (df.loc["cars", ("linear", "std")] ==
            an.results.loc["cars", "linear", "max_error", :].std().item())

    # check the per-fold values
    df = an.get_result_as_df("r2", mean_folds=False)
    assert df.loc["adult", ("dummy", slice(None))].shape == (5, )
    np.testing.assert_array_equal(df.loc["adult", ("dummy", slice(None))],
                                  an.results.loc["adult", "dummy", "r2"])
    df = an.get_result_as_df("max_error", mean_folds=False)
    np.testing.assert_array_equal(
        df.loc["cars", ("linear", slice(None))],
        an.results.loc["cars", "linear", "max_error"],
    )
# Body of GBImportancePlot's training hook (the surrounding class definition
# and the plot_imp helper are not shown in this excerpt):
self.gbmestimator.fit(X_train, y_train)
fig, ax = plt.subplots()
self.plot_imp(
    self.gbmestimator,
    ax=ax,
    xlabel="Feature importance",
    importance_type="gain",
)
# self.output_dir is made available through the Analysis class
fig.savefig(os.path.join(self.output_dir, "importance.png"),
            bbox_inches="tight")
plt.close(fig)

methods = [
    ("xgb", GBImportancePlot(XGBClassifier(n_jobs=-1, verbose=0))),
    ("lgb", GBImportancePlot(LGBMClassifier(n_jobs=-1, verbose=0))),
]
an = Analysis(
    methods=methods,
    datasets="classification",
    n_datasets=3,
    random_state=SEED,
    use_test_set=False,  # our method does not implement testing
    output_dir="importance_output",
)
an.run()
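
# After an.run(), one feature-importance plot per dataset and method is
# expected on disk -- assuming the same per-dataset/per-method directory
# layout that test_output_dir asserts above, e.g.
#   importance_output/Analysis_1/<dataset>/xgb/importance.png
#   importance_output/Analysis_1/<dataset>/lgb/importance.png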