예제 #1
0
def test_distribution_statistics_autodist_return_failed_tests():
    """
    Test.
    """
    df1 = pd.DataFrame({"feat_0": [1, 2, 3, 4, 5], "feat_1": [5, 6, 7, 8, 9]})
    df2 = df1
    features = df1.columns.values.tolist()
    myAutoDist = AutoDist(binning_strategies="all")
    res = myAutoDist.compute(df1, df2, column_names=features, return_failed_tests=True)
    assert res.isin(["an error occurred"]).any().any()
    res = myAutoDist.compute(df1, df2, column_names=features, return_failed_tests=False)
    assert not res.isin(["an error occurred"]).any().any()
def test_distribution_statistics_autodist_column_names_error():
    df1 = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    df2 = df1
    features = df1.columns.values.tolist() + ['missing_feature']
    myAutoDist = AutoDist()
    with pytest.raises(Exception):
        assert myAutoDist.compute(df1, df2, column_names=features)

    df1 = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    df2 = df1.copy()
    df1['feat_2'] = 0
    features = df2.columns.values.tolist() + ['missing_feature']
    myAutoDist = AutoDist()
    with pytest.raises(Exception):
        assert myAutoDist.compute(df1, df2, column_names=features)
예제 #3
0
def test_distribution_statistics_autodist_base():
    """
    Test.
    """
    nr_features = 2
    size = 1000
    np.random.seed(0)
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=[f"feat_{x}" for x in range(nr_features)])
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=[f"feat_{x}" for x in range(nr_features)])
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests="all", binning_strategies="all", bin_count=[10, 20])
    assert repr(myAutoDist).startswith("AutoDist")
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res["column"].values.tolist() == features.to_list()

    dist = DistributionStatistics(statistical_test="ks", binning_strategy="simplebucketer", bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_simplebucketer_10"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_simplebucketer_10"][0]

    dist = DistributionStatistics(statistical_test="ks", binning_strategy=None, bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[res["column"] == "feat_0", "p_value_KS_no_bucketing_0"][0]
    assert dist.statistic == res.loc[res["column"] == "feat_0", "statistic_KS_no_bucketing_0"][0]
def test_distribution_statistics_autodist_default():
    df1 = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    df2 = df1
    features = df1.columns.values.tolist()
    myAutoDist = AutoDist(binning_strategies="default", bin_count=10)
    res = myAutoDist.compute(df1, df2, column_names=features)
    for stat_test, stat_info in DistributionStatistics.statistical_test_dict.items(
    ):
        if stat_info['default_binning']:
            assert f"p_value_{stat_test}_{stat_info['default_binning']}_10" in res.columns
        else:
            assert f"p_value_{stat_test}_no_bucketing_0" in res.columns

    assert "p_value_agglomerativebucketer_10" not in res.columns
    assert res.shape == (len(df1.columns), 1 +
                         2 * len(DistributionStatistics.statistical_test_dict))
예제 #5
0
def test_distribution_statistics_autodist_base():
    '''DistributionStatiistics autodist base'''
    nr_features = 2
    size = 1000
    np.random.seed(0)
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests='all',
                          binning_strategies='all',
                          bin_count=[10, 20])
    assert repr(myAutoDist).startswith('AutoDist')
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res['column'].values.tolist() == features.to_list()

    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy='simplebucketer',
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[res['column'] == 'feat_0',
                                   'p_value_KS_simplebucketer_10'][0]
    assert dist.statistic == res.loc[res['column'] == 'feat_0',
                                     'statistic_KS_simplebucketer_10'][0]

    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy=None,
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[res['column'] == 'feat_0',
                                   'p_value_KS_no_bucketing_0'][0]
    assert dist.statistic == res.loc[res['column'] == 'feat_0',
                                     'statistic_KS_no_bucketing_0'][0]