Exemplo n.º 1
0
def test_warnings_are_issued_for_missing():
    """Check that AutoDist.compute warns when an input contains missing values.

    Each scenario runs a PSI computation with a 10-bin SimpleBucketer and counts
    every warning raised during the call. Two warnings are expected whenever at
    least one input contains NaNs, and none when both inputs are complete.
    """
    # ``pytest.warns(None)`` was deprecated in pytest 7 and is rejected by pytest 8,
    # so the warnings are recorded with the standard library instead.
    import warnings

    def count_warnings(first, second):
        """Run AutoDist on the two frames and return how many warnings were raised."""
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, including repeats the default filters would drop.
            warnings.simplefilter("always")
            AutoDist(statistical_tests=["PSI"], binning_strategies="SimpleBucketer", bin_count=10).compute(
                first, second
            )
        return len(caught)

    # Five identical integer columns (A-E) without missing values.
    X = pd.DataFrame({"A": list(range(50))})
    X = X.assign(B=X["A"], C=X["A"], D=X["A"], E=X["A"])

    # Copy of X with NaNs written into columns B and C for five sampled rows.
    X_na = X.copy()
    X_na.iloc[X.sample(5, random_state=1).index, 1:3] = np.nan

    # Missing values only in the first input.
    assert count_warnings(X_na, X) == 2

    # Missing values only in the second input.
    assert count_warnings(X, X_na) == 2

    # Missing values in both inputs.
    assert count_warnings(X_na, X_na) == 2

    # No missing values at all: nothing should be reported.
    assert count_warnings(X, X) == 0
Exemplo n.º 2
0
def test_distribution_statistics_autodist_base():
    """
    Check AutoDist's fitted flag and results against standalone DistributionStatistics.

    AutoDist is run with all statistical tests and all binning strategies for bin
    counts 10 and 20 on two seeded random-normal frames. The KS values it reports
    for ``feat_0`` must equal those of a DistributionStatistics instance, both with
    the simple bucketer and without bucketing.
    """
    nr_features = 2
    size = 1000
    np.random.seed(0)  # global seed; df1 is drawn before df2
    column_labels = [f"feat_{x}" for x in range(nr_features)]
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=column_labels)
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=column_labels)
    features = df1.columns

    myAutoDist = AutoDist(statistical_tests="all", binning_strategies="all", bin_count=[10, 20])
    assert repr(myAutoDist).startswith("AutoDist")
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res["column"].values.tolist() == features.to_list()

    def autodist_value(result_column):
        """Return the AutoDist entry for ``feat_0`` in the given result column."""
        # Positional access: the filtered Series holds the ``feat_0`` row, so the
        # lookup must not depend on that row carrying the index label 0.
        return res.loc[res["column"] == "feat_0", result_column].iloc[0]

    # KS with the simple bucketer and 10 bins.
    dist = DistributionStatistics(statistical_test="ks", binning_strategy="simplebucketer", bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == autodist_value("p_value_KS_simplebucketer_10")
    assert dist.statistic == autodist_value("statistic_KS_simplebucketer_10")

    # KS without any bucketing.
    dist = DistributionStatistics(statistical_test="ks", binning_strategy=None, bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == autodist_value("p_value_KS_no_bucketing_0")
    assert dist.statistic == autodist_value("statistic_KS_no_bucketing_0")
Exemplo n.º 3
0
def test_distribution_statistics_autodist_return_failed_tests():
    """
    Check that ``return_failed_tests`` controls whether failure markers are reported.

    With the flag on, the result is expected to contain at least one
    "an error occurred" cell; with the flag off, it must contain none.
    """
    frame = pd.DataFrame({"feat_0": [1, 2, 3, 4, 5], "feat_1": [5, 6, 7, 8, 9]})
    feature_names = frame.columns.values.tolist()
    auto_dist = AutoDist(binning_strategies="all")

    # The same frame object is used as both inputs.
    with_failures = auto_dist.compute(frame, frame, column_names=feature_names, return_failed_tests=True)
    failure_cells = with_failures.isin(["an error occurred"])
    assert failure_cells.any().any()

    without_failures = auto_dist.compute(frame, frame, column_names=feature_names, return_failed_tests=False)
    failure_cells = without_failures.isin(["an error occurred"])
    assert not failure_cells.any().any()
def test_distribution_statistics_autodist_default():
    """Check the result columns produced with ``binning_strategies="default"``.

    For every entry of ``DistributionStatistics.statistical_test_dict`` the result
    must hold a p-value column named after the test's default binning with 10
    bins, or after ``no_bucketing_0`` when the entry has no default binning.
    """
    frame = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    feature_names = frame.columns.values.tolist()
    auto_dist = AutoDist(binning_strategies="default", bin_count=10)
    # The same frame object is used as both inputs.
    res = auto_dist.compute(frame, frame, column_names=feature_names)

    known_tests = DistributionStatistics.statistical_test_dict
    for test_name in known_tests:
        default_binning = known_tests[test_name]['default_binning']
        if not default_binning:
            expected_column = f"p_value_{test_name}_no_bucketing_0"
        else:
            expected_column = f"p_value_{test_name}_{default_binning}_10"
        assert expected_column in res.columns

    assert "p_value_agglomerativebucketer_10" not in res.columns

    # One row per feature; one "column" field plus two result columns per test.
    expected_rows = len(frame.columns)
    expected_cols = 1 + 2 * len(known_tests)
    assert res.shape == (expected_rows, expected_cols)
Exemplo n.º 5
0
def test_missing_values_in_autodist():
    """Check that appending all-NaN rows to both inputs leaves AutoDist results unchanged."""

    def run_autodist(first, second):
        """Compute PSI and KS with a 10-bin SimpleBucketer on the two frames."""
        engine = AutoDist(statistical_tests=["PSI", "KS"], binning_strategies="SimpleBucketer", bin_count=10)
        return engine.compute(first, second)

    # Dummy classification data: 50 samples, 5 features.
    features, target = make_classification(50, 5, random_state=0)
    features = pd.DataFrame(features)

    # Hold out 20% of the rows as the second input.
    train_part, test_part, _, _ = train_test_split(features, target, test_size=0.2, random_state=1)

    # Square block made only of missing values, one column per feature.
    n_features = features.shape[1]
    nan_block = pd.DataFrame(np.full((n_features, n_features), np.nan))

    # Statistics with the NaN block appended to both inputs.
    with_missings = run_autodist(pd.concat([train_part, nan_block]), pd.concat([test_part, nan_block]))

    # Statistics on the untouched inputs.
    no_missing = run_autodist(train_part, test_part)

    # Both runs must produce identical result frames.
    pd.testing.assert_frame_equal(with_missings, no_missing)
Exemplo n.º 6
0
def test_distribution_statistics_autodist_base():
    '''Check AutoDist's fitted flag and results against DistributionStatistics.

    AutoDist is run with all statistical tests and all binning strategies for bin
    counts 10 and 20 on two seeded random-normal frames. The KS values it reports
    for ``feat_0`` must equal those of a standalone DistributionStatistics, both
    with the simple bucketer and without bucketing.
    '''
    nr_features = 2
    size = 1000
    np.random.seed(0)  # global seed; df1 is drawn before df2
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=[f'feat_{x}' for x in range(nr_features)])
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests='all',
                          binning_strategies='all',
                          bin_count=[10, 20])
    assert repr(myAutoDist).startswith('AutoDist')
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res['column'].values.tolist() == features.to_list()

    # Result rows belonging to feat_0. Values are read with ``.iloc[0]`` so the
    # lookup does not depend on that row carrying the index label 0.
    feat_0_rows = res['column'] == 'feat_0'

    # KS with the simple bucketer and 10 bins.
    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy='simplebucketer',
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[
        feat_0_rows, 'p_value_KS_simplebucketer_10'].iloc[0]
    assert dist.statistic == res.loc[
        feat_0_rows, 'statistic_KS_simplebucketer_10'].iloc[0]

    # KS without any bucketing.
    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy=None,
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == res.loc[
        feat_0_rows, 'p_value_KS_no_bucketing_0'].iloc[0]
    assert dist.statistic == res.loc[
        feat_0_rows, 'statistic_KS_no_bucketing_0'].iloc[0]
def test_distribution_statistics_autodist_column_names_error():
    """Check that AutoDist.compute raises when ``column_names`` lists an unknown column.

    Two scenarios are covered: both inputs share the same columns, and the first
    input has an extra column. In each, ``column_names`` additionally contains
    'missing_feature', which neither frame has.
    """
    # The calls are deliberately not wrapped in ``assert``: asserting on a returned
    # DataFrame raises ValueError (ambiguous truth value), which would satisfy
    # ``pytest.raises(Exception)`` even if ``compute`` never raised.

    # Scenario 1: identical inputs, one requested column does not exist.
    df1 = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    df2 = df1
    features = df1.columns.values.tolist() + ['missing_feature']
    myAutoDist = AutoDist()
    with pytest.raises(Exception):
        myAutoDist.compute(df1, df2, column_names=features)

    # Scenario 2: df1 has an extra column, and one requested column does not exist.
    df1 = pd.DataFrame({'feat_0': [1, 2, 3, 4, 5], 'feat_1': [5, 6, 7, 8, 9]})
    df2 = df1.copy()
    df1['feat_2'] = 0
    features = df2.columns.values.tolist() + ['missing_feature']
    myAutoDist = AutoDist()
    with pytest.raises(Exception):
        myAutoDist.compute(df1, df2, column_names=features)
Exemplo n.º 8
0
def test_distribution_statistics_autodist_init():
    """
    Check how AutoDist exposes its ``statistical_tests`` and ``binning_strategies`` arguments.

    Each attribute is checked for three kinds of input: "all" must yield a list,
    a single name must yield a one-element list, and a list must compare equal
    to the list that was passed.
    """
    # statistical_tests, with binning_strategies fixed to "all".
    all_tests = AutoDist(statistical_tests="all", binning_strategies="all")
    assert isinstance(all_tests.statistical_tests, list)
    for given, stored in (("ks", ["ks"]), (["ks", "psi"], ["ks", "psi"])):
        instance = AutoDist(statistical_tests=given, binning_strategies="all")
        assert instance.statistical_tests == stored

    # binning_strategies, with statistical_tests fixed to "all".
    all_binnings = AutoDist(statistical_tests="all", binning_strategies="all")
    assert isinstance(all_binnings.binning_strategies, list)
    binning_cases = (
        ("quantilebucketer", ["quantilebucketer"]),
        (["quantilebucketer", "simplebucketer"], ["quantilebucketer", "simplebucketer"]),
    )
    for given, stored in binning_cases:
        instance = AutoDist(statistical_tests="all", binning_strategies=given)
        assert instance.binning_strategies == stored
def test_distribution_statistics_autodist_init():
    """Check the attributes AutoDist exposes for its two constructor arguments.

    'all' must yield a list for both ``statistical_tests`` and
    ``binning_strategies``; a single name must yield a one-element list; a list
    must compare equal to the list that was passed.
    """

    def stored_tests(requested):
        """Build AutoDist with all binning strategies; return its statistical_tests."""
        return AutoDist(statistical_tests=requested,
                        binning_strategies='all').statistical_tests

    def stored_binnings(requested):
        """Build AutoDist with all statistical tests; return its binning_strategies."""
        return AutoDist(statistical_tests='all',
                        binning_strategies=requested).binning_strategies

    assert isinstance(stored_tests('all'), list)
    assert stored_tests('ks') == ['ks']
    assert stored_tests(['ks', 'psi']) == ['ks', 'psi']

    assert isinstance(stored_binnings('all'), list)
    assert stored_binnings('quantilebucketer') == ['quantilebucketer']
    both_bucketers = ['quantilebucketer', 'simplebucketer']
    assert stored_binnings(both_bucketers) == [
        'quantilebucketer', 'simplebucketer'
    ]