import numbers

import numpy as np
import pandas as pd
import pytest

# Imports assumed by the test examples in this listing; DistributionStatistics,
# AutoDist and the ks/psi statistics are taken to come from probatus' public
# stat_tests module.
from probatus.stat_tests import AutoDist, DistributionStatistics, ks, psi


def test_distribution_statistics_attributes_ks():
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.normal(size=1000), 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    ks_value, p_value = ks(d1, d2)
    assert myTest.statistic == ks_value


def test_distribution_statistics_ks_no_binning():
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)


def test_distribution_statistics_tuple_output():
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)


def test_distribution_statistics_psi():
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('psi', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    psi_test, p_value_test = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(psi_test, numbers.Number)


def test_distribution_statistics_base():
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics('doesnotexist',
                                      'SimpleBucketer',
                                      bin_count=10)
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics('psi', 'doesnotexist', bin_count=10)
    myTest = DistributionStatistics('psi', 'SimpleBucketer', bin_count=10)
    assert repr(myTest).startswith('DistributionStatistics')


def test_distribution_statistics_attributes_psi():
    a = np.random.normal(size=1000)
    b = np.random.normal(size=1000)
    d1 = np.histogram(a, 10)[0]
    d2 = np.histogram(b, 10)[0]
    myTest = DistributionStatistics('psi', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    psi_value_test, p_value_test = psi(d1, d2, verbose=False)
    assert myTest.statistic == psi_value_test
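
# A minimal sketch summarizing the pattern the tests above exercise: compute()
# returns a (statistic, p_value) tuple and also stores the results as attributes
# on the DistributionStatistics instance (uses the probatus imports at the top
# of this listing).
d1 = np.histogram(np.random.normal(size=1000), 10)[0]
d2 = np.histogram(np.random.normal(size=1000), 10)[0]
stat_test = DistributionStatistics('ks', binning_strategy=None)
statistic, p_value = stat_test.compute(d1, d2)
assert stat_test.statistic == statistic  # same value exposed as an attribute
assert stat_test.fitted                  # compute() marks the instance as fitted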
Example #7
def test_distribution_statistics_base():
    """
    Test.
    """
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics("doesnotexist", "SimpleBucketer", bin_count=10)
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics("psi", "doesnotexist", bin_count=10)
    myTest = DistributionStatistics("psi", "SimpleBucketer", bin_count=10)
    assert repr(myTest).startswith("DistributionStatistics")
Example #8
def test_distribution_statistics_autodist_base():
    """
    Test.
    """
    nr_features = 2
    size = 1000
    np.random.seed(0)
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=[f"feat_{x}" for x in range(nr_features)])
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=[f"feat_{x}" for x in range(nr_features)])
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests="all", binning_strategies="all", bin_count=[10, 20])
    assert repr(myAutoDist).startswith("AutoDist")
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res["column"].values.tolist() == features.to_list()

    dist = DistributionStatistics(statistical_test="ks", binning_strategy="simplebucketer", bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_simplebucketer_10"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_simplebucketer_10"][0]

    dist = DistributionStatistics(statistical_test="ks", binning_strategy=None, bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_no_bucketing_0"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_no_bucketing_0"][0]
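
# The lookups above rely on AutoDist's result-column naming convention,
# '<statistic|p_value>_<TEST>_<strategy>_<bin_count>' (with 'no_bucketing_0'
# when binning is disabled). The pattern is inferred from the assertions above,
# not from probatus documentation. A hypothetical helper built on it:
def select_ks_columns(res):
    # Keep the feature-name column plus every KS-related statistic/p-value column.
    ks_columns = [c for c in res.columns if "_KS_" in c]
    return res[["column"] + ks_columns]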
Example #9
def test_distribution_statistics_autodist_base():
'''DistributionStatistics AutoDist base.'''
    nr_features = 2
    size = 1000
    np.random.seed(0)
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests='all',
                          binning_strategies='all',
                          bin_count=[10, 20])
    assert repr(myAutoDist).startswith('AutoDist')
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res['column'].values.tolist() == features.to_list()

    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy='simplebucketer',
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[res['column'] == 'feat_0',
                                   'p_value_KS_simplebucketer_10'][0]
    assert dist.statistic == res.loc[res['column'] == 'feat_0',
                                     'statistic_KS_simplebucketer_10'][0]

    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy=None,
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[res['column'] == 'feat_0',
                                   'p_value_KS_no_bucketing_0'][0]
    assert dist.statistic == res.loc[res['column'] == 'feat_0',
                                     'statistic_KS_no_bucketing_0'][0]
Example #10
    def __init__(
        self,
        clf,
        scoring="roc_auc",
        test_prc=0.25,
        n_jobs=1,
        stats_tests_to_apply=None,
        verbose=0,
        random_state=None,
    ):
        """
        Initializes the class.

        Args:
            clf (model object):
                Binary classification model or pipeline.

            scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers, optional):
                Metrics for which the score is calculated. It can be either a metric name or a list of metric names,
                and needs to be aligned with the predefined classification scorer names in sklearn
                ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)).
                Another option is using probatus.utils.Scorer to define a custom metric.

            test_prc (float, optional):
                Fraction of the input data used as the test set. By default 0.25.

            n_jobs (int, optional):
                Number of parallel executions. If -1, all available cores are used. By default 1.

            stats_tests_to_apply (None, string or list of strings, optional):
                List of tests to apply. Available options:

                - `'ES'`: Epps-Singleton,
                - `'KS'`: Kolmogorov-Smirnov statistic,
                - `'PSI'`: Population Stability Index,
                - `'SW'`: Shapiro-Wilk based difference statistic,
                - `'AD'`: Anderson-Darling test statistic.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown,
                - 1 - 50 - only the most important warnings and an indication of fitting progress are shown,
                - 51 - 100 - other warnings and prints are also shown,
                - above 100 - all prints and all warnings (including SHAP warnings) are shown.

            random_state (int, optional):
                Random state set at each iteration. If it is None, the results will not be reproducible and, in
                random search, different hyperparameters might be tested at each iteration. For reproducible
                results set it to an integer.
        """
        self.clf = clf
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.test_prc = test_prc
        self.iterations_results = None
        self.report = None
        self.verbose = verbose
        self.allowed_stats_tests = list(DistributionStatistics.statistical_test_dict.keys())

        # TODO: set a reasonable default value for the parameter, to choose the statistical test for the user for
        #  different ways of computing volatility.
        if stats_tests_to_apply is not None:
            self.stats_tests_to_apply = assure_list_of_strings(stats_tests_to_apply, "stats_tests_to_apply")
            assure_list_values_allowed(
                variable=self.stats_tests_to_apply,
                variable_name="stats_tests_to_apply",
                allowed_values=self.allowed_stats_tests,
            )
        else:
            self.stats_tests_to_apply = []

        self.stats_tests_objects = []
        if len(self.stats_tests_to_apply) > 0:
            if self.verbose > 0:
                warnings.warn(
                    "Computing statistics for distributions is an experimental feature. While using it, keep "
                    "in mind that the samples of metrics might be correlated."
                )
            for test_name in self.stats_tests_to_apply:
                self.stats_tests_objects.append(DistributionStatistics(statistical_test=test_name))

        self.scorers = get_scorers(scoring)
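
# The __init__ above matches the signature of probatus' metric-volatility
# estimators. A minimal usage sketch, assuming TrainTestVolatility as the
# concrete class and its fit_compute() API (both are assumptions based on
# probatus' metric_volatility module, not confirmed by the snippet above):
from probatus.metric_volatility import TrainTestVolatility
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
volatility = TrainTestVolatility(
    clf=RandomForestClassifier(random_state=42),
    scoring="roc_auc",
    test_prc=0.25,
    stats_tests_to_apply=["KS", "PSI"],  # subset of the allowed tests from the docstring
    random_state=42,
)
report = volatility.fit_compute(X, y)  # assumed to fit and return the volatility report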