import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.exceptions import NotFittedError

from sklego.common import flatten
from sklego.meta import Thresholder
from tests.conftest import general_checks, classifier_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, classifier_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_fit2d_predict1d",
            "check_methods_subset_invariance",
            "check_dont_overwrite_parameters",
            "check_classifiers_classes",
            "check_classifiers_train",
            "check_supervised_y_2d",
        ],
    ),
)
def test_standard_checks(test_fn):
    trf = Thresholder(LogisticRegression(), threshold=0.5)
    test_fn(Thresholder.__name__, trf)


def test_same_threshold():
    mod1 = Thresholder(LogisticRegression(), threshold=0.5)
    mod2 = LogisticRegression()
    X = np.random.normal(0, 1, (100, 3))
import numpy as np
import pytest
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

from sklego.common import flatten
from sklego.meta import OutlierRemover
from sklego.mixture import GMMOutlierDetector
from tests.conftest import general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_methods_subset_invariance",
        ],
    ),
)
def test_estimator_checks(test_fn):
    gmm_remover = OutlierRemover(outlier_detector=GMMOutlierDetector(), refit=True)
    test_fn(OutlierRemover.__name__, gmm_remover)

    isolation_forest_remover = OutlierRemover(outlier_detector=IsolationForest(), refit=True)
    test_fn(OutlierRemover.__name__, isolation_forest_remover)


def test_no_outliers(mocker):
    mock_outlier_detector = mocker.Mock()
    mock_outlier_detector.fit.return_value = None
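# --- Illustrative sketch, not part of the test file above: how OutlierRemover is
# meant to be used as a transformer around any sklearn-style outlier detector.
# Assumption (hedged, not asserted by this excerpt): transform() on the training
# data drops the rows the fitted detector flags as outliers, so the result has at
# most as many rows as the input.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklego.meta import OutlierRemover

np.random.seed(0)
X_demo = np.concatenate([np.random.normal(0, 1, (100, 2)), np.array([[10.0, 10.0]])])

remover = OutlierRemover(outlier_detector=IsolationForest(), refit=True)
X_clean = remover.fit(X_demo).transform(X_demo)
assert X_clean.shape[0] <= X_demo.shape[0]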
import numpy as np
import pytest

from sklego.common import flatten
from sklego.linear_model import ProbWeightRegression
from tests.conftest import nonmeta_checks, regressor_checks, general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, regressor_checks]),
        exclude=[
            "check_sample_weights_invariance",
        ],
    ),
)
@pytest.mark.cvxpy
def test_estimator_checks(test_fn):
    regr_min_zero = ProbWeightRegression(non_negative=True)
    test_fn(ProbWeightRegression.__name__ + "_min_zero_true", regr_min_zero)
    regr_not_min_zero = ProbWeightRegression(non_negative=False)
    test_fn(ProbWeightRegression.__name__ + "_min_zero_false", regr_not_min_zero)


@pytest.mark.cvxpy
def test_shape_trained_model(random_xy_dataset_regr):
    X, y = random_xy_dataset_regr
    mod_no_intercept = ProbWeightRegression()
    assert mod_no_intercept.fit(X, y).coefs_.shape == (X.shape[1],)
    np.testing.assert_approx_equal(
        mod_no_intercept.fit(X, y).coefs_.sum(), 1.0, significant=4
    )
import numpy as np
import pytest

from sklego.common import flatten
from sklego.linear_model import ProbWeightRegression
from tests.conftest import nonmeta_checks, regressor_checks, general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, regressor_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
@pytest.mark.cvxpy
def test_estimator_checks(test_fn):
    regr_min_zero = ProbWeightRegression(non_negative=True)
    test_fn(ProbWeightRegression.__name__ + "_min_zero_true", regr_min_zero)
    regr_not_min_zero = ProbWeightRegression(non_negative=False)
    test_fn(ProbWeightRegression.__name__ + "_min_zero_false", regr_not_min_zero)


@pytest.mark.cvxpy
def test_shape_trained_model(random_xy_dataset_regr):
    X, y = random_xy_dataset_regr
    mod_no_intercept = ProbWeightRegression()
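# --- Illustrative sketch, not part of the test files above: it restates what
# test_shape_trained_model checks. ProbWeightRegression learns one coefficient per
# column, constrained to sum to (approximately) 1, and with non_negative=True the
# coefficients should also be >= 0. Assumes cvxpy is installed, as the
# @pytest.mark.cvxpy marker suggests.
import numpy as np
from sklego.linear_model import ProbWeightRegression

np.random.seed(0)
X_demo = np.random.normal(0, 1, (100, 3))
y_demo = X_demo @ np.array([0.2, 0.3, 0.5])

mod = ProbWeightRegression(non_negative=True).fit(X_demo, y_demo)
assert mod.coefs_.shape == (X_demo.shape[1],)
assert abs(mod.coefs_.sum() - 1.0) < 1e-3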
import numpy as np
import pytest
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

from sklego.common import flatten
from sklego.meta import ZeroInflatedRegressor
from sklego.testing import check_shape_remains_same_regressor
from tests.conftest import general_checks, select_tests, regressor_checks


@pytest.mark.parametrize("test_fn", [check_shape_remains_same_regressor])
def test_zir(test_fn):
    regr = ZeroInflatedRegressor(
        classifier=ExtraTreesClassifier(random_state=0),
        regressor=ExtraTreesRegressor(random_state=0),
    )
    test_fn(ZeroInflatedRegressor.__name__, regr)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, regressor_checks]),
    ),
)
def test_estimator_checks(test_fn):
    test_fn(
        ZeroInflatedRegressor.__name__,
        ZeroInflatedRegressor(
            classifier=ExtraTreesClassifier(random_state=0),
            regressor=ExtraTreesRegressor(random_state=0),
        ),
    )


def test_zero_inflated_example():
    from sklearn.model_selection import cross_val_score

    np.random.seed(0)
    X = np.random.randn(10000, 4)
    # many zeroes here, in about 75% of the cases
    y = ((X[:, 0] > 0) & (X[:, 1] > 0)) * np.abs(X[:, 2] * X[:, 3] ** 2)
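# --- Illustrative sketch, not part of the test file above: a minimal use of
# ZeroInflatedRegressor with the same classifier/regressor pairing the tests rely
# on. The assumption behind the generated data (as in test_zero_inflated_example)
# is that the classifier learns whether the target is zero while the regressor
# models the non-zero part.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklego.meta import ZeroInflatedRegressor

np.random.seed(0)
X_demo = np.random.randn(1000, 4)
y_demo = ((X_demo[:, 0] > 0) & (X_demo[:, 1] > 0)) * np.abs(X_demo[:, 2] * X_demo[:, 3] ** 2)

zir = ZeroInflatedRegressor(
    classifier=ExtraTreesClassifier(random_state=0),
    regressor=ExtraTreesRegressor(random_state=0),
).fit(X_demo, y_demo)
assert zir.predict(X_demo).shape == (X_demo.shape[0],)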
import numpy as np
import pytest
from sklearn.model_selection import train_test_split

from sklego.common import flatten
from sklego.preprocessing import RandomAdder
from tests.conftest import select_tests, transformer_checks, nonmeta_checks, general_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, transformer_checks, nonmeta_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_methods_subset_invariance",
            "check_transformer_data_not_an_array",
            "check_transformer_general",
        ],
    ),
)
def test_estimator_checks(test_fn):
    adder = RandomAdder()
    test_fn(RandomAdder.__name__, adder)


def test_dtype_regression(random_xy_dataset_regr):
    X, y = random_xy_dataset_regr
    assert RandomAdder().fit(X, y).transform(X).dtype == np.float64
import numpy as np
import pandas as pd
import pytest

from sklego.common import flatten
from sklego.mixture import GMMOutlierDetector, BayesianGMMOutlierDetector
from tests.conftest import general_checks, nonmeta_checks, select_tests, outlier_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, outlier_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_outliers_train",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    test_fn(GMMOutlierDetector.__name__ + "_quantile", clf_quantile)

    clf_stddev = GMMOutlierDetector(threshold=2, method="stddev")
    test_fn(GMMOutlierDetector.__name__ + "_stddev", clf_stddev)

    bayes_clf_quantile = BayesianGMMOutlierDetector(threshold=0.999, method="quantile")
    test_fn(BayesianGMMOutlierDetector.__name__ + "_quantile", bayes_clf_quantile)

    bayes_clf_stddev = BayesianGMMOutlierDetector(threshold=2, method="stddev")
    test_fn(BayesianGMMOutlierDetector.__name__ + "_stddev", bayes_clf_stddev)
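# --- Illustrative sketch, not part of the test file above: the detector is
# parametrised exactly as in the checks. Assumption (the usual sklearn outlier
# convention, not asserted by this excerpt): predict() returns +1 for inliers and
# -1 for outliers.
import numpy as np
from sklego.mixture import GMMOutlierDetector

np.random.seed(0)
X_demo = np.random.normal(0, 1, (500, 2))

detector = GMMOutlierDetector(threshold=0.999, method="quantile").fit(X_demo)
assert set(np.unique(detector.predict(X_demo))) <= {-1, 1}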
import numpy as np
import pandas as pd
import pytest

from sklego.common import flatten
from sklego.preprocessing import DictMapper
from tests.conftest import select_tests, transformer_checks, general_checks, nonmeta_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, transformer_checks]),
        exclude=["check_sample_weights_invariance", "check_dtype_object"],
    ),
)
def test_estimator_checks(test_fn):
    test_fn(DictMapper.__name__, DictMapper(mapper={"foo": 1}, default=-1))


@pytest.fixture()
def mapper():
    return {"foo": 1, "bar": 2, "baz": 3}


@pytest.mark.parametrize(
    "input_array,expected_array",
    [
        (["foo", "bar", "baz"], [1, 2, 3]),
        (["foo", "bar", "bar"], [1, 2, 2]),
        (["foo", "bar", "monty"], [1, 2, -1]),
        (["foo", "bar", np.nan], [1, 2, -1]),
from tests.conftest import (
    nonmeta_checks,
    general_checks,
    transformer_checks,
    select_tests,
)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([nonmeta_checks, transformer_checks, general_checks]),
        exclude=[
            "check_transformer_data_not_an_array",
            "check_estimators_nan_inf",
            "check_fit2d_predict1d",
            "check_sample_weights_invariance",
        ],
    ),
)
def test_estimator_checks_binary(test_fn):
    random_proj = BinaryRandomProjection(random_seed=42)
    test_fn(BinaryRandomProjection.__name__, random_proj)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([nonmeta_checks, transformer_checks, general_checks]),
        exclude=[
import pytest
import numpy as np

from sklego.common import flatten
from sklego.decomposition import PCAOutlierDetection
from tests.conftest import general_checks, outlier_checks, select_tests, nonmeta_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, outlier_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_outliers_fit_predict",
            "check_outliers_train",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
def test_estimator_checks(test_fn):
    outlier_mod = PCAOutlierDetection(
        n_components=2, threshold=0.05, random_state=42, variant="absolute"
    )
    test_fn(PCAOutlierDetection.__name__, outlier_mod)


@pytest.fixture
def dataset():
    np.random.seed(42)
    return np.concatenate([np.random.normal(0, 1, (2000, 10))])
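# --- Illustrative sketch, not part of the test file above: PCAOutlierDetection
# configured with the same parameters used in the check and fitted on data shaped
# like the `dataset` fixture. Assumption (sklearn outlier convention, not asserted
# here): predict() returns +1 for inliers and -1 for outliers.
import numpy as np
from sklego.decomposition import PCAOutlierDetection

np.random.seed(42)
X_demo = np.random.normal(0, 1, (2000, 10))

detector = PCAOutlierDetection(
    n_components=2, threshold=0.05, random_state=42, variant="absolute"
).fit(X_demo)
assert set(np.unique(detector.predict(X_demo))) <= {-1, 1}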
import pytest
import numpy as np

from sklego.common import flatten
from sklego.decomposition import UMAPOutlierDetection
from tests.conftest import general_checks, outlier_checks, select_tests, nonmeta_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, outlier_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_outliers_fit_predict",
            "check_outliers_train",
            "check_fit2d_predict1d",
            "check_methods_subset_invariance",
            "check_fit2d_1sample",
            "check_fit2d_1feature",
            "check_dict_unchanged",
            "check_dont_overwrite_parameters",
            "check_classifier_data_not_an_array",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
def test_estimator_checks(test_fn):
    outlier_mod = UMAPOutlierDetection(n_components=2, threshold=0.1)
    test_fn(UMAPOutlierDetection.__name__, outlier_mod)


@pytest.fixture
def dataset():
    np.random.seed(42)
    return np.concatenate([np.random.normal(0, 1, (200, 10))])
import pytest
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklego.common import flatten
from sklego.meta import RegressionOutlierDetector
from tests.conftest import general_checks, select_tests, outlier_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, outlier_checks]),
        exclude=[
            "check_fit2d_predict1d",
            "check_fit2d_1feature",
            "check_outliers_train",  # won't work because we have two thresholds
        ],
    ),
)
def test_estimator_checks(test_fn):
    mod = RegressionOutlierDetector(LinearRegression(), column=0)
    test_fn(RegressionOutlierDetector.__name__, mod)


def test_obvious_example():
    # generate random data for illustrative example
    np.random.seed(42)
    X = np.random.normal(0, 1, (100, 1))
    y = 1 + np.sum(X, axis=1).reshape(-1, 1) + np.random.normal(0, 0.2, (100, 1))
    for i in [20, 25, 50, 80]:
        y[i] += 2
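# --- Illustrative sketch, not part of the test file above: it reuses the
# "obvious example" setup, stacking X and y into one array. Assumption (hedged):
# `column` selects which column of that array plays the role of the target, the
# wrapped regressor fits the rest, and points with large residuals are reported
# as -1 under the sklearn outlier convention.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklego.meta import RegressionOutlierDetector

np.random.seed(42)
X_demo = np.random.normal(0, 1, (100, 1))
y_demo = 1 + X_demo[:, 0] + np.random.normal(0, 0.2, 100)
y_demo[[20, 25, 50, 80]] += 2

data = np.concatenate([X_demo, y_demo.reshape(-1, 1)], axis=1)
detector = RegressionOutlierDetector(LinearRegression(), column=1).fit(data)
assert set(np.unique(detector.predict(data))) <= {-1, 1}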
import numpy as np
import pytest
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge, LogisticRegression

from sklego.common import flatten
from sklego.meta import SubjectiveClassifier
from tests.conftest import general_checks, classifier_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, classifier_checks]),
        exclude=["check_sample_weights_invariance"],
    ),
)
def test_estimator_checks_classification(test_fn):
    if test_fn.__name__ == "check_classifiers_classes":
        # nonsensical prior to make the sklearn check pass
        prior = {"one": 0.1, "two": 0.1, "three": 0.1, -1: 0.1, 1: 0.6}
    else:
        prior = {0: 0.7, 1: 0.2, 2: 0.1}
import pytest
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_boston

from sklego.common import flatten
from sklego.preprocessing import InformationFilter
from tests.conftest import select_tests, transformer_checks, nonmeta_checks, general_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, transformer_checks, nonmeta_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_estimators_empty_data_messages",
        ],
    ),
)
def test_estimator_checks(test_fn):
    test_fn(InformationFilter.__name__, InformationFilter(columns=[0]))


def test_v_columns_orthogonal():
    X, y = load_boston(return_X_y=True)
    ifilter = InformationFilter(columns=[11, 12]).fit(X, y)
    v_values = ifilter._make_v_vectors(X, [11, 12])
    assert v_values.prod(axis=1).sum() == pytest.approx(0, abs=1e-5)


def test_output_orthogonal():
    X, y = load_boston(return_X_y=True)
import numpy as np
import pytest
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor

from sklego.common import flatten
from sklego.mixture import GMMOutlierDetector
from sklego.meta import OutlierClassifier
from tests.conftest import general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks]),
        exclude=[
            "check_sample_weights_invariance",
        ],
    ),
)
def test_estimator_checks(test_fn):
    mod_quantile = GMMOutlierDetector(threshold=0.999, method="quantile")
    clf_quantile = OutlierClassifier(mod_quantile)
    test_fn("OutlierClassifier", clf_quantile)


@pytest.fixture
def dataset():
    np.random.seed(42)
    return np.random.normal(0, 1, (2000, 2))


@pytest.mark.parametrize(
    "outlier_model",
import pytest
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklego.common import flatten
from sklego.meta import Thresholder
from tests.conftest import general_checks, classifier_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, classifier_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_fit2d_predict1d",
            "check_methods_subset_invariance",
            "check_dont_overwrite_parameters",
            "check_classifiers_classes",
            "check_classifiers_train",
            "check_supervised_y_2d",
            "check_classifier_data_not_an_array",  # https://github.com/koaning/scikit-lego/issues/490
        ],
    ),
)
def test_standard_checks(test_fn):
    trf = Thresholder(LogisticRegression(), threshold=0.5)
    test_fn(Thresholder.__name__, trf)


def test_same_threshold():
    mod1 = Thresholder(LogisticRegression(), threshold=0.5)
    mod2 = LogisticRegression()
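# --- Illustrative sketch, not part of the test file above: Thresholder wraps a
# probabilistic binary classifier and takes a `threshold` parameter, as in the
# checks. Assumption (hedged, not asserted by this excerpt): raising the threshold
# makes the positive class rarer, so a stricter threshold yields at most as many
# positive predictions.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklego.meta import Thresholder

np.random.seed(0)
X_demo = np.random.normal(0, 1, (200, 3))
y_demo = (X_demo[:, 0] > 0).astype(int)

low = Thresholder(LogisticRegression(), threshold=0.2).fit(X_demo, y_demo).predict(X_demo)
high = Thresholder(LogisticRegression(), threshold=0.8).fit(X_demo, y_demo).predict(X_demo)
assert high.sum() <= low.sum()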
    return np.array([1 if r > 0.0 else 0 for r in np.random.normal(0, 1, len(X))])


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, regressor_checks, nonmeta_checks]),
        exclude=[
            "check_methods_subset_invariance",
            "check_fit2d_1sample",
            "check_fit2d_1feature",
            "check_regressors_train",
            "check_fit2d_predict1d",
            "check_fit1d",
            "check_regressor_data_not_an_array",
            "check_supervised_y_2d",
            "check_supervised_y_no_nan",
            "check_dtype_object",
            "check_complex_data",
            "check_estimators_empty_data_messages",
            "check_estimators_nan_inf",
            "check_estimator_sparse_data",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = FunctionRegressor(func=predict)
    test_fn(FunctionRegressor.__name__ + "_fallback", clf)
import numpy as np
import pytest

from sklego.common import flatten
from sklego.dummy import RandomRegressor
from tests.conftest import nonmeta_checks, regressor_checks, general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, regressor_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_methods_subset_invariance",
            "check_regressors_train",
        ],
    ),
)
def test_estimator_checks(test_fn):
    # Tests that are skipped:
    # 'check_methods_subset_invariance': since we add noise, the method is not invariant on a subset
    # 'check_regressors_train': the score is not always greater than 0.5 due to randomness
    regr_normal = RandomRegressor(strategy="normal")
    test_fn(RandomRegressor.__name__ + "_normal", regr_normal)

    regr_uniform = RandomRegressor(strategy="uniform")
    test_fn(RandomRegressor.__name__ + "_uniform", regr_uniform)


def test_values_uniform(random_xy_dataset_regr):
    X, y = random_xy_dataset_regr
    mod = RandomRegressor(strategy="uniform")
    predictions = mod.fit(X, y).predict(X)
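# --- Illustrative sketch, not part of the test file above: RandomRegressor
# ignores X and samples predictions from the training target distribution, which
# is why the subset-invariance and regressors-train checks are skipped. The bounds
# assertion assumes the "uniform" strategy samples between the observed minimum
# and maximum of y (the property test_values_uniform appears to verify).
import numpy as np
from sklego.dummy import RandomRegressor

np.random.seed(0)
X_demo = np.random.normal(0, 1, (100, 2))
y_demo = np.random.uniform(10, 20, 100)

preds = RandomRegressor(strategy="uniform").fit(X_demo, y_demo).predict(X_demo)
assert (preds >= y_demo.min()).all() and (preds <= y_demo.max()).all()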
    nonmeta_checks,
)


def double(x, factor=2):
    return x * factor


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, transformer_checks, nonmeta_checks]),
        exclude=[
            "check_estimators_nan_inf",
            "check_estimators_empty_data_messages",
            "check_transformer_data_not_an_array",
            "check_dtype_object",
            "check_complex_data",
            "check_fit1d",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = PipeTransformer(func=double)
    test_fn(PipeTransformer.__name__, clf)


@pytest.mark.parametrize("factor", [1.0, 2.0, 5.0])
def test_basic_example(factor):
    np.random.seed(42)
    X = np.random.normal(0, 1, (1000, 4))
@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, classifier_checks, nonmeta_checks]),
        exclude=[
            "check_estimators_pickle",
            "check_estimator_sparse_data",
            "check_estimators_nan_inf",
            "check_pipeline_consistency",
            "check_complex_data",
            "check_fit2d_predict1d",
            "check_methods_subset_invariance",
            "check_fit1d",
            "check_dict_unchanged",
            "check_classifier_data_not_an_array",
            "check_classifiers_one_label",
            "check_classifiers_classes",
            "check_classifiers_train",
            "check_supervised_y_2d",
            "check_supervised_y_no_nan",
            "check_estimators_unfitted",
            "check_estimators_dtypes",
            "check_fit_score_takes_y",
            "check_dtype_object",
            "check_estimators_empty_data_messages",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
def test_estimator_checks(test_fn):
import pytest
import numpy as np
import pandas as pd
from sklearn.utils.validation import FLOAT_DTYPES

from sklego.common import flatten
from sklego.preprocessing import ColumnCapper
from tests.conftest import select_tests, transformer_checks, general_checks, nonmeta_checks


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks, transformer_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_estimators_nan_inf",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
def test_estimator_checks(test_fn):
    test_fn(ColumnCapper.__name__, ColumnCapper())


def test_quantile_range():
    def expect_type_error(quantile_range):
        with pytest.raises(TypeError):
            ColumnCapper(quantile_range)

    def expect_value_error(quantile_range):
        with pytest.raises(ValueError):
            ColumnCapper(quantile_range)
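# --- Illustrative sketch, not part of the test file above: ColumnCapper is
# constructed with a quantile_range, the parameter test_quantile_range validates.
# Assumption (hedged): transform clips each column to its empirical quantiles, so
# a capped column never exceeds the corresponding quantile of the original data.
import numpy as np
from sklego.preprocessing import ColumnCapper

np.random.seed(0)
X_demo = np.random.normal(0, 1, (1000, 2))
X_demo[0, 0] = 100.0  # an obvious outlier in the first column

capped = ColumnCapper(quantile_range=(5, 95)).fit_transform(X_demo)
assert capped[:, 0].max() <= np.quantile(X_demo[:, 0], 0.95) + 1e-12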
import numpy as np
import pytest

from sklego.common import flatten
from sklego.mixture import GMMClassifier, BayesianGMMClassifier
from tests.conftest import general_checks, nonmeta_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, nonmeta_checks]),
        exclude=[
            "check_sample_weights_invariance",
            "check_non_transformer_estimators_n_iter",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = GMMClassifier()
    test_fn(GMMClassifier.__name__, clf)

    clf = BayesianGMMClassifier()
    test_fn(BayesianGMMClassifier.__name__, clf)


def test_obvious_usecase():
    X = np.concatenate(
        [np.random.normal(-10, 1, (100, 2)), np.random.normal(10, 1, (100, 2))]
    )
    y = np.concatenate([np.zeros(100), np.ones(100)])
    assert (GMMClassifier().fit(X, y).predict(X) == y).all()
import pytest
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from sklego.common import flatten
from sklego.meta import GroupedPredictor
from sklego.datasets import load_chicken
from tests.conftest import general_checks, select_tests


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks]),
        exclude=[
            # Nonsense checks because we always need at least two columns (group and value)
            "check_fit1d",
            "check_fit2d_predict1d",
            "check_fit2d_1feature",
            "check_transformer_data_not_an_array",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = GroupedPredictor(estimator=LinearRegression(), groups=0, use_global_model=True)
    test_fn(GroupedPredictor.__name__ + "_fallback", clf)

    clf = GroupedPredictor(estimator=LinearRegression(), groups=0, use_global_model=False)
    test_fn(GroupedPredictor.__name__ + "_nofallback", clf)
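# --- Illustrative sketch, not part of the test file above: GroupedPredictor fits
# one estimator per group and, with use_global_model=True, falls back to a single
# global model for unseen groups (hence the "_fallback"/"_nofallback" names in the
# check). Selecting the group by column name assumes a pandas DataFrame input; the
# "diet", "time" and "weight" columns come from sklego.datasets.load_chicken.
from sklearn.linear_model import LinearRegression
from sklego.datasets import load_chicken
from sklego.meta import GroupedPredictor

df = load_chicken(as_frame=True)
mod = GroupedPredictor(estimator=LinearRegression(), groups="diet", use_global_model=True)
preds = mod.fit(df[["diet", "time"]], df["weight"]).predict(df[["diet", "time"]])
assert preds.shape[0] == df.shape[0]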
    general_checks,
    nonmeta_checks,
)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, nonmeta_checks]),
        exclude=[
            "check_estimators_pickle",
            "check_estimators_nan_inf",
            "check_estimators_empty_data_messages",
            "check_complex_data",
            "check_dtype_object",
            "check_estimators_dtypes",
            "check_dict_unchanged",
            "check_fit1d",
            "check_methods_subset_invariance",
            "check_fit2d_predict1d",
            "check_sample_weights_list",
            "check_sample_weights_pandas_series",
        ],
    ),
)
def test_estimator_checks(test_fn):
    """
    We're skipping a lot of tests here, mainly because this model is "bespoke":
    it is *not* general. Therefore a lot of assumptions are broken.
    """
    clf = InteractivePreprocessor.from_json(
def class_based(dataf, sex="male", pclass=1):
    predicate = (dataf["sex"] == sex) & (dataf["pclass"] == pclass)
    return np.array(predicate).astype(int) * 2 - 1


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, outlier_checks, nonmeta_checks]),
        exclude=[
            "check_outliers_train",
            "check_estimators_nan_inf",
            "check_estimators_empty_data_messages",
            "check_complex_data",
            "check_dtype_object",
            "check_classifier_data_not_an_array",
            "check_fit1d",
            "check_methods_subset_invariance",
            "check_fit2d_predict1d",
            "check_estimator_sparse_data",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = FunctionOutlierDetector(func=predict)
    test_fn(FunctionOutlierDetector.__name__ + "_fallback", clf)


def test_works_with_gridsearch(random_xy_dataset_clf):
    X, y = random_xy_dataset_clf
    return np.array(predicate).astype(int)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        include=flatten([general_checks, classifier_checks, nonmeta_checks]),
        exclude=[
            "check_methods_subset_invariance",
            "check_fit2d_1sample",
            "check_fit2d_1feature",
            "check_classifier_data_not_an_array",
            "check_classifiers_one_label",
            "check_classifiers_classes",
            "check_classifiers_train",
            "check_supervised_y_2d",
            "check_estimators_pickle",
            "check_pipeline_consistency",
            "check_fit2d_predict1d",
            "check_fit1d",
            "check_dtype_object",
            "check_complex_data",
            "check_estimators_empty_data_messages",
            "check_estimators_nan_inf",
            "check_estimator_sparse_data",
        ],
    ),
)
def test_estimator_checks(test_fn):
    clf = FunctionClassifier(func=predict)
    test_fn(FunctionClassifier.__name__ + "_fallback", clf)