def test_distribution_statistics_attributes_ks():
    """Check that the KS statistic stored on the object matches the raw ks() output."""
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.normal(size=1000), 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    ks_value, p_value = ks(d1, d2)
    assert myTest.statistic == ks_value
def test_distribution_statistics_ks_no_binning():
    """KS without binning: compute() marks the object as fitted and returns a tuple."""
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)
def test_distribution_statistics_tuple_output():
    """KS with a SimpleBucketer binning strategy: compute() returns a tuple."""
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)
def test_distribution_statistics_psi():
    """PSI with a SimpleBucketer binning strategy: the returned statistic is numeric."""
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('psi', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    psi_test, p_value_test = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(psi_test, numbers.Number)
def test_distribution_statistics_attributes_psi():
    """Check that the PSI statistic stored on the object matches the raw psi() output."""
    a = np.random.normal(size=1000)
    b = np.random.normal(size=1000)
    d1 = np.histogram(a, 10)[0]
    d2 = np.histogram(b, 10)[0]
    myTest = DistributionStatistics('psi', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    psi_value_test, p_value_test = psi(d1, d2, verbose=False)
    assert myTest.statistic == psi_value_test
def test_distribution_statistics_base():
    """Unknown statistical test or binning strategy names raise NotImplementedError; repr() is informative."""
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics("doesnotexist", "SimpleBucketer", bin_count=10)
    with pytest.raises(NotImplementedError):
        assert DistributionStatistics("psi", "doesnotexist", bin_count=10)
    myTest = DistributionStatistics("psi", "SimpleBucketer", bin_count=10)
    assert repr(myTest).startswith("DistributionStatistics")
def test_distribution_statistics_autodist_base():
    """AutoDist over two dataframes: the result frame agrees with per-feature DistributionStatistics."""
    nr_features = 2
    size = 1000
    np.random.seed(0)
    df1 = pd.DataFrame(
        np.random.normal(size=(size, nr_features)),
        columns=[f"feat_{x}" for x in range(nr_features)],
    )
    df2 = pd.DataFrame(
        np.random.normal(size=(size, nr_features)),
        columns=[f"feat_{x}" for x in range(nr_features)],
    )
    features = df1.columns

    myAutoDist = AutoDist(statistical_tests="all", binning_strategies="all", bin_count=[10, 20])
    assert repr(myAutoDist).startswith("AutoDist")
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res["column"].values.tolist() == features.to_list()

    # The AutoDist results for feat_0 should match a standalone DistributionStatistics run.
    dist = DistributionStatistics(statistical_test="ks", binning_strategy="simplebucketer", bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_simplebucketer_10"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_simplebucketer_10"][0]

    dist = DistributionStatistics(statistical_test="ks", binning_strategy=None, bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_no_bucketing_0"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_no_bucketing_0"][0]
def __init__(
    self,
    clf,
    scoring="roc_auc",
    test_prc=0.25,
    n_jobs=1,
    stats_tests_to_apply=None,
    verbose=0,
    random_state=None,
):
    """
    Initializes the class.

    Args:
        clf (model object): Binary classification model or pipeline.

        scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers, optional):
            Metrics for which the score is calculated. It can be either a metric name or a list of metric names,
            and it needs to be aligned with the predefined classification scorer names in sklearn
            ([link](https://scikit-learn.org/stable/modules/model_evaluation.html)).
            Another option is using probatus.utils.Scorer to define a custom metric.

        test_prc (float, optional): Percentage of input data used as the test set. By default 0.25.

        n_jobs (int, optional): Number of parallel executions. If -1, use all available cores. By default 1.

        stats_tests_to_apply (None, string or list of strings, optional): List of tests to apply. Available options:

            - `'ES'`: Epps-Singleton,
            - `'KS'`: Kolmogorov-Smirnov statistic,
            - `'PSI'`: Population Stability Index,
            - `'SW'`: Shapiro-Wilk based difference statistic,
            - `'AD'`: Anderson-Darling TS.

        verbose (int, optional): Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown,
            - 1 - 50 - only the most important warnings and an indication of progress in fitting the object,
            - 51 - 100 - shows other warnings and prints,
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        random_state (int, optional): Random state set at each round of feature elimination. If it is None, the
            results will not be reproducible and, in random search, different hyperparameters might be tested at
            each iteration. For reproducible results set it to an integer.
    """
    self.clf = clf
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.test_prc = test_prc
    self.iterations_results = None
    self.report = None
    self.verbose = verbose
    self.allowed_stats_tests = list(DistributionStatistics.statistical_test_dict.keys())

    # TODO: set a reasonable default value for the parameter, so that the statistical test is chosen for the user
    #  for the different ways to compute volatility.
    if stats_tests_to_apply is not None:
        self.stats_tests_to_apply = assure_list_of_strings(stats_tests_to_apply, "stats_tests_to_apply")
        assure_list_values_allowed(
            variable=self.stats_tests_to_apply,
            variable_name="stats_tests_to_apply",
            allowed_values=self.allowed_stats_tests,
        )
    else:
        self.stats_tests_to_apply = []

    self.stats_tests_objects = []
    if len(self.stats_tests_to_apply) > 0:
        if self.verbose > 0:
            warnings.warn(
                "Computing statistics for distributions is an experimental feature. While using it, keep "
                "in mind that the samples of metrics might be correlated."
            )
        for test_name in self.stats_tests_to_apply:
            self.stats_tests_objects.append(DistributionStatistics(statistical_test=test_name))

    self.scorers = get_scorers(scoring)
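# Usage sketch (illustrative only): how the constructor documented above might be called.
# The class name `TrainTestVolatility` and the import path below are assumptions and are not
# confirmed by this module; substitute the concrete volatility estimator exposed by your
# probatus version. The keyword arguments mirror the ones documented in the __init__ above.
#
#     from sklearn.ensemble import RandomForestClassifier
#     from probatus.metric_volatility import TrainTestVolatility  # assumed import path
#
#     clf = RandomForestClassifier(random_state=0)
#     volatility = TrainTestVolatility(
#         clf,
#         scoring="roc_auc",                    # sklearn scorer name or a probatus.utils.Scorer
#         test_prc=0.25,                        # fraction of the input data used as the test set
#         stats_tests_to_apply=["KS", "PSI"],   # optional distribution tests on the metric samples
#         random_state=42,                      # set for reproducible splits
#     )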