示例#1
0
def test_json_reader():
    """Read example.json through JsonReader and check a few of the stored values."""
    reader = JsonReader(
        file_path=resources.data("example.json"),
        store_key="example",
    )
    datastore = reader.transform(datastore={})

    # Everything the reader produced lives under the configured store key.
    example = datastore["example"]
    assert example["boolean"]
    assert len(example["array"]) == 3
    assert example["category"]["a"] == 0
示例#2
0
def test_hists_stability_metrics():
    """Run stability_metrics with a rolling reference and check the result keys."""
    # Load the example histograms into a fresh datastore.
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    datastore = Pipeline(modules=[reader]).transform(datastore={})
    hists = datastore["hists"]

    # Features to evaluate; "date:bankrupt" is listed twice on purpose here,
    # exactly as the feature list under test specifies it.
    hist_list = [
        "date:bankrupt",
        "date:country",
        "date:bankrupt",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    ds = stability_metrics(
        hists, reference_type="rolling", window=5, features=hist_list
    )

    # Every expected section must be present in the returned mapping.
    for expected_key in ("profiles", "comparisons", "traffic_lights", "alerts"):
        assert expected_key in list(ds.keys())
示例#3
0
def test_normalized_hist_mean_cov():
    """Apply normalized_hist_mean_cov to the split histograms and check A_score.

    For every row of the A_score frame the histogram mean, binning and
    covariance are compared against fixed reference values.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            assign_to_key="output_hist",
            apply_funcs=[dict(func=normalized_hist_mean_cov, suffix="")],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in (
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ):
        assert feature in datastore["output_hist"]

    a_score = datastore["output_hist"]["A_score"]

    expected_mean = [0.3125, 0.03125, 0.1875, 0.40625, 0.0625]
    expected_binning = [1.5, 2.5, 3.5, 4.5, 5.5]
    expected_cov = np.array(
        [
            [0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
            [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
            [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
            [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
            [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625],
        ]
    )

    rows = zip(
        a_score["histogram_mean"].values,
        a_score["histogram_cov"].values,
        a_score["histogram_binning"].values,
    )
    for mean, cov, binning in rows:
        np.testing.assert_array_almost_equal(mean, expected_mean)
        np.testing.assert_array_almost_equal(binning, expected_binning)
        np.testing.assert_array_almost_equal(cov, expected_cov)
示例#4
0
def test_expanding_hist():
    """Apply expanding_hist and check the bin entries of the last summed histogram."""
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    # expanding_hist writes its result to a column suffixed with "sum".
    expanding = dict(
        func=expanding_hist,
        shift=1,
        suffix="sum",
        entire=True,
        hist_name="histogram",
    )
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(apply_to_key="output_hist", apply_funcs=[expanding]),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    # Take the final entry of the summed-histogram column for num_employees.
    last_hist = datastore["output_hist"]["num_employees"]["histogram_sum"].values[-1]
    bin_entries = last_hist.hist.bin_entries()

    check = np.array([11., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
                      0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0., 0., 0., 0., 1.])

    np.testing.assert_array_almost_equal(bin_entries, check)
示例#5
0
def test_expand_norm_hist_mean_cov():
    """Apply expand_norm_hist_mean_cov and check one histogram mean of num_employees."""
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    expand_norm = dict(
        func=expand_norm_hist_mean_cov,
        hist_name="histogram",
        shift=1,
        suffix="",
        entire=True,
    )
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(apply_to_key="output_hist", apply_funcs=[expand_norm]),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in (
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ):
        assert feature in datastore["output_hist"]

    # The second-to-last entry of the mean column is the one being checked.
    mean = datastore["output_hist"]["num_employees"]["histogram_mean"].values[-2]

    check = np.array([0.56666667, 0.03333333, 0.03333333, 0., 0.,
                      0., 0., 0., 0., 0.,
                      0.06666667, 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0., 0.,
                      0., 0.06666667, 0., 0., 0.,
                      0., 0., 0., 0., 0.06666667,
                      0., 0., 0., 0., 0.,
                      0., 0., 0., 0.03333333, 0.06666667, 0.06666667])

    np.testing.assert_array_almost_equal(mean, check)
示例#6
0
def test_chi_squared2():
    """Check relative_chi_squared on top of an expanding normalized reference.

    The pipeline splits the example histograms per feature, applies
    expand_norm_hist_mean_cov and then relative_chi_squared, after which one
    chi2 value per feature is compared against a fixed reference number.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[
            dict(func=expand_norm_hist_mean_cov, hist_name='histogram', shift=1, suffix='', entire=True)]),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[dict(func=relative_chi_squared, suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    assert 'output_hist' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['output_hist']

    # (feature, row position, expected chi2)
    expected = [
        ('A_score', -1, 4.066666666666674),
        ('A_score:num_employees', -2, 3.217532467532462),
        ('bankrupt', -1, 0.11718750000000011),
        ('country', -1, 0.6093749999999999),
        ('num_employees', -1, 1.1858766233766194),
    ]
    for feature, position, chi2 in expected:
        series = datastore['output_hist'][feature]['chi2']
        # Use .iloc for positional access: a plain integer key such as
        # series[-1] relies on a deprecated pandas fallback on label indexes.
        # NOTE(review): assumes these frames are not integer-indexed — confirm.
        np.testing.assert_almost_equal(series.iloc[position], chi2)
示例#7
0
def test_chi_squared1():
    """Check relative_chi_squared on top of a rolling normalized reference.

    The pipeline splits the example histograms per feature, applies
    roll_norm_hist_mean_cov with window=5 and then relative_chi_squared,
    after which one chi2 value per feature is compared against a fixed
    reference number.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[
            dict(func=roll_norm_hist_mean_cov, hist_name='histogram', window=5, shift=1, suffix='', entire=True)]),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[dict(func=relative_chi_squared, suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    assert 'output_hist' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['output_hist']

    # (feature, row position, expected chi2)
    expected = [
        ('A_score', 6, 3.275000000000001),
        ('A_score:num_employees', -2, 2.1333333333333315),
        ('bankrupt', 6, 0.19687500000000002),
        ('country', 5, 0.8999999999999994),
        ('num_employees', 5, 0.849999999999999),
    ]
    for feature, position, chi2 in expected:
        series = datastore['output_hist'][feature]['chi2']
        # Use .iloc for positional access: a plain integer key such as
        # series[6] relies on a deprecated pandas fallback on label indexes.
        # NOTE(review): assumes these frames are not integer-indexed — confirm.
        np.testing.assert_almost_equal(series.iloc[position], chi2)
示例#8
0
def test_chi_RollingNormHistComparer():
    """Run RollingNormHistComparer (window=10) and check the last chi2 of A_score."""
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(modules=[
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(read_key="example_hist",
                     store_key="output_hist",
                     features=hist_list),
        RollingNormHistComparer(
            read_key="output_hist", store_key="comparisons", window=10),
    ])
    datastore = pipeline.transform(datastore={})

    assert "comparisons" in datastore
    for f in [
            "A_score",
            "A_score:num_employees",
            "bankrupt",
            "country",
            "num_employees",
    ]:
        assert f in datastore["comparisons"]

    df = datastore["comparisons"]["A_score"]
    # Use .iloc for positional access: df["chi2"][-1] relies on a deprecated
    # pandas fallback that treats integer keys as positions on label indexes.
    # NOTE(review): assumes the frame is not integer-indexed — confirm.
    np.testing.assert_almost_equal(df["chi2"].iloc[-1], 37.61910112359518)
示例#9
0
def test_normalized_hist_mean_cov():
    """Apply normalized_hist_mean_cov to the split histograms and check A_score.

    Each row's histogram mean, binning and covariance in the A_score frame
    is compared against fixed reference values.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    normalizer = ApplyFunc(
        apply_to_key="output_hist",
        assign_to_key="output_hist",
        apply_funcs=[dict(func=normalized_hist_mean_cov, suffix="")],
    )
    pipeline = Pipeline(modules=[reader, splitter, normalizer])
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    expected_features = (
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    )
    for feature in expected_features:
        assert feature in datastore["output_hist"]

    frame = datastore["output_hist"]["A_score"]

    expected_mean = [0.3125, 0.03125, 0.1875, 0.40625, 0.0625]
    expected_binning = [1.5, 2.5, 3.5, 4.5, 5.5]
    expected_cov = np.array(
        [
            [0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
            [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
            [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
            [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
            [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625],
        ]
    )

    means = frame["histogram_mean"].values
    covs = frame["histogram_cov"].values
    binnings = frame["histogram_binning"].values
    for mean, cov, binning in zip(means, covs, binnings):
        np.testing.assert_array_almost_equal(mean, expected_mean)
        np.testing.assert_array_almost_equal(binning, expected_binning)
        np.testing.assert_array_almost_equal(cov, expected_cov)
示例#10
0
def test_chi_squared1():
    """Check relative_chi_squared on top of a rolling normalized reference.

    The pipeline splits the example histograms per feature, applies
    roll_norm_hist_mean_cov with window=5 and then relative_chi_squared,
    after which one chi2 value per feature is compared against a fixed
    reference number.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="example_hist",
            ),
            HistSplitter(
                read_key="example_hist", store_key="output_hist", features=hist_list
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[
                    dict(
                        func=roll_norm_hist_mean_cov,
                        hist_name="histogram",
                        window=5,
                        shift=1,
                        suffix="",
                        entire=True,
                    )
                ],
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
            ),
        ]
    )
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    for f in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert f in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expected = [
        ("A_score", 6, 4.25),
        ("A_score:num_employees", -2, 2.1333333333333315),
        ("bankrupt", 6, 0.40000000000000024),
        ("country", 5, 0.8999999999999994),
        ("num_employees", 5, 0.849999999999999),
    ]
    for feature, position, chi2 in expected:
        series = datastore["output_hist"][feature]["chi2"]
        # Use .iloc for positional access: a plain integer key such as
        # series[6] relies on a deprecated pandas fallback on label indexes.
        # NOTE(review): assumes these frames are not integer-indexed — confirm.
        np.testing.assert_almost_equal(series.iloc[position], chi2)
示例#11
0
def test_chi_squared2():
    """Check relative_chi_squared on top of an expanding normalized reference.

    The pipeline splits the example histograms per feature, applies
    expand_norm_hist_mean_cov and then relative_chi_squared, after which one
    chi2 value per feature is compared against a fixed reference number.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="example_hist",
            ),
            HistSplitter(
                read_key="example_hist", store_key="output_hist", features=hist_list
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[
                    dict(
                        func=expand_norm_hist_mean_cov,
                        hist_name="histogram",
                        shift=1,
                        suffix="",
                        entire=True,
                    )
                ],
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
            ),
        ]
    )
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    for f in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert f in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expected = [
        ("A_score", -1, 9.891821919006366),
        ("A_score:num_employees", -2, 3.217532467532462),
        ("bankrupt", -1, 0.23767605633802757),
        ("country", -1, 1.3717532467532458),
        ("num_employees", -1, 1.1858766233766194),
    ]
    for feature, position, chi2 in expected:
        series = datastore["output_hist"][feature]["chi2"]
        # Use .iloc for positional access: a plain integer key such as
        # series[-1] relies on a deprecated pandas fallback on label indexes.
        # NOTE(review): assumes these frames are not integer-indexed — confirm.
        np.testing.assert_almost_equal(series.iloc[position], chi2)
示例#12
0
def test_self_reference():
    """Smoke test: a pipeline ending in self_reference runs without raising."""
    hist_list = ["date:A_score", "date:A_score:num_employees"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="hists",
        ),
        self_reference(hists_key="hists", features=hist_list),
    ]
    # No assertions on the output; the transform only has to complete.
    Pipeline(modules=modules).transform(datastore={})
示例#13
0
def test_rolling_reference():
    """Smoke test: a pipeline ending in rolling_reference (window=5) runs without raising."""
    hist_list = ["date:country", "date:A_score:num_employees"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="hists",
        ),
        rolling_reference(hists_key="hists", window=5, features=hist_list),
    ]
    # No assertions on the output; the transform only has to complete.
    Pipeline(modules=modules).transform(datastore={})
示例#14
0
def test_expanding_reference():
    """Smoke test: a pipeline ending in expanding_reference runs without raising."""
    hist_list = ["date:bankrupt", "date:num_employees"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="hists",
        ),
        expanding_reference(hists_key="hists", features=hist_list),
    ]
    # No assertions on the output; the transform only has to complete.
    Pipeline(modules=modules).transform(datastore={})
示例#15
0
def test_external_reference():
    """Smoke test: a pipeline ending in external_reference runs without raising.

    The same datastore key is passed as both the histograms and the
    reference histograms.
    """
    hist_list = ["date:country", "date:bankrupt"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="hists",
        ),
        external_reference(
            hists_key="hists",
            ref_hists_key="hists",
            features=hist_list,
        ),
    ]
    # No assertions on the output; the transform only has to complete.
    Pipeline(modules=modules).transform(datastore={})
示例#16
0
def test_hists_stability_report():
    """Smoke test: stability_report with a rolling reference runs without raising."""
    # Load the example histograms into a fresh datastore.
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    datastore = Pipeline(modules=[reader]).transform(datastore={})
    hists = datastore["hists"]

    # Build the report; "date:bankrupt" appears twice in the feature list,
    # exactly as the list under test specifies it.
    hist_list = [
        "date:bankrupt",
        "date:country",
        "date:bankrupt",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    stability_report(hists, reference_type="rolling", window=5, features=hist_list)
示例#17
0
def test_hist_compare():
    """Compare each histogram with its expanding sum and check the last chi2.

    The pipeline first adds a 'histogram_sum' column via expanding_hist and
    then applies hist_compare between 'histogram' and 'histogram_sum',
    storing the result under the 'comparison' key.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist',
                  apply_funcs=[dict(func=expanding_hist, shift=1, suffix='sum', entire=True, hist_name='histogram')]),
        ApplyFunc(apply_to_key='output_hist', assign_to_key='comparison', apply_funcs=[
            dict(func=hist_compare, hist_name1='histogram', hist_name2='histogram_sum', suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    df = datastore['comparison']['num_employees']
    # chi2 is a computed float, so compare with a tolerance rather than
    # bit-for-bit equality, which can break on rounding differences.
    np.testing.assert_almost_equal(df['chi2'].values[-1], 0.7017543859649122)
示例#18
0
def test_chi_RollingNormHistComparer():
    """Run RollingNormHistComparer (window=10) and check the last chi2 of A_score."""
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        RollingNormHistComparer(read_key='output_hist', store_key='comparisons', window=10)
    ])
    datastore = pipeline.transform(datastore={})

    assert 'comparisons' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['comparisons']

    df = datastore['comparisons']['A_score']
    # Use .iloc for positional access: df['chi2'][-1] relies on a deprecated
    # pandas fallback that treats integer keys as positions on label indexes.
    # NOTE(review): assumes the frame is not integer-indexed — confirm.
    np.testing.assert_almost_equal(df['chi2'].iloc[-1], 45.200000)
示例#19
0
def test_expanding_hist_comparer():
    """Run ExpandingHistComparer and check its per-feature comparison frames.

    For every feature the frame length, the set of column names and the mean
    of 'expanding_chi2' are compared against fixed reference values.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score']
    features = ['country', 'bankrupt', 'num_employees', 'A_score']

    expected_columns = sorted([
        'expanding_pearson',
        'expanding_chi2',
        'expanding_chi2_zscore',
        'expanding_chi2_norm',
        'expanding_chi2_pvalue',
        'expanding_chi2_max_residual',
        'expanding_chi2_spike_count',
        'expanding_ks',
        'expanding_ks_zscore',
        'expanding_ks_pvalue',
        'expanding_max_prob_diff',
        'expanding_unknown_labels',
    ])

    modules = [
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ExpandingHistComparer(read_key='output_hist', store_key='comparison'),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert 'comparison' in datastore and isinstance(datastore['comparison'], dict)
    comparison = datastore['comparison']
    assert len(comparison) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of expanding_chi2)
    expectations = [
        ('A_score', 16, 2.8366236044275257),
        ('country', 17, 1.1224348056645368),
        ('bankrupt', 17, 0.6901425387043608),
        ('num_employees', 17, 4.243731870738727),
    ]
    for name, n_rows, mean_chi2 in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), expected_columns)
        np.testing.assert_almost_equal(frame['expanding_chi2'].mean(), mean_chi2)
示例#20
0
def test_rolling_hist_comparer():
    """Run RollingHistComparer (window=5) and check its per-feature comparison frames.

    For every feature the frame length, the set of column names and the mean
    of 'roll_chi2' are compared against fixed reference values.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score']
    features = ['country', 'bankrupt', 'num_employees', 'A_score']

    expected_columns = sorted([
        'roll_pearson',
        'roll_chi2',
        'roll_chi2_zscore',
        'roll_chi2_norm',
        'roll_chi2_pvalue',
        'roll_chi2_max_residual',
        'roll_chi2_spike_count',
        'roll_ks',
        'roll_ks_zscore',
        'roll_ks_pvalue',
        'roll_max_prob_diff',
        'roll_unknown_labels',
    ])

    modules = [
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        RollingHistComparer(read_key='output_hist', store_key='comparison', window=5),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert 'comparison' in datastore and isinstance(datastore['comparison'], dict)
    comparison = datastore['comparison']
    assert len(comparison) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of roll_chi2)
    expectations = [
        ('A_score', 16, 2.927272727272727),
        ('country', 17, 1.3022619047619046),
        ('bankrupt', 17, 0.7251681783824641),
        ('num_employees', 17, 4.0995701058201055),
    ]
    for name, n_rows, mean_chi2 in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), expected_columns)
        np.testing.assert_almost_equal(frame['roll_chi2'].mean(), mean_chi2)
示例#21
0
def test_reference_hist_comparer():
    """Run ReferenceHistComparer and check its per-feature comparison frames.

    The split histograms are used both as reference and as the key being
    compared. For every feature the frame length, the set of column names
    and the mean of 'ref_chi2' are compared against fixed reference values.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score']
    features = ['country', 'bankrupt', 'num_employees', 'A_score']

    expected_columns = sorted([
        'ref_pearson',
        'ref_chi2',
        'ref_chi2_zscore',
        'ref_chi2_norm',
        'ref_chi2_pvalue',
        'ref_chi2_max_residual',
        'ref_chi2_spike_count',
        'ref_ks',
        'ref_ks_zscore',
        'ref_ks_pvalue',
        'ref_max_prob_diff',
        'ref_unknown_labels',
    ])

    modules = [
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ReferenceHistComparer(reference_key='output_hist', assign_to_key='output_hist', store_key='comparison'),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert 'comparison' in datastore and isinstance(datastore['comparison'], dict)
    comparison = datastore['comparison']
    assert len(comparison) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of ref_chi2)
    expectations = [
        ('A_score', 16, 2.623206018518519),
        ('country', 17, 0.9804481792717087),
        ('bankrupt', 17, 0.6262951496388027),
        ('num_employees', 17, 4.213429217840983),
    ]
    for name, n_rows, mean_chi2 in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), expected_columns)
        np.testing.assert_almost_equal(frame['ref_chi2'].mean(), mean_chi2)
示例#22
0
def test_report_generator():
    """Build a report from reference comparisons and check the stored outputs.

    Verifies the comparison frames, that the SectionGenerator kept last_n=2,
    and that a non-empty string is stored under 'final_report'.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    comparer = ReferenceHistComparer(
        reference_key="output_hist",
        assign_to_key="output_hist",
        store_key="comparison",
    )
    sections = SectionGenerator(
        read_key="comparison",
        store_key="all_sections",
        section_name="Comparisons",
        last_n=2,
    )
    report = ReportGenerator(read_key="all_sections", store_key="final_report")

    pipeline = Pipeline(modules=[reader, splitter, comparer, sections, report])
    datastore = pipeline.transform(datastore={})

    assert "comparison" in datastore and isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # The second-to-last module is the SectionGenerator configured above.
    assert pipeline.modules[-2].last_n == 2
    assert "final_report" in datastore
    assert (
        isinstance(datastore["final_report"], str)
        and len(datastore["final_report"]) > 0
    )
示例#23
0
def test_hist_splitter():
    """Split the example histograms per feature and check the resulting frames.

    Each feature must map to a DataFrame whose 'histogram' entries pass
    assert_similar_hists.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    features = [
        "country",
        "bankrupt",
        "num_employees",
        "A_score",
        "A_score:num_employees",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore and isinstance(datastore["output_hist"], dict)
    output = datastore["output_hist"]
    assert len(output) == len(features)
    for name in features:
        assert name in output
    for name in features:
        assert isinstance(output[name], pd.DataFrame)

    for name in features:
        # Turn the frame into row records and collect the histogram of each row.
        records = output[name].reset_index().to_dict("records")
        assert_similar_hists([record["histogram"] for record in records])
示例#24
0
def test_hist_splitter_filter():
    """Test the HistSplitter option filter_empty_split_hists.

    One of the split histograms of type date:A_score:num_employees is empty
    and only contains a NaN. With filter_empty_split_hists=False those empty
    split-histograms are *not* removed, so the split-histograms end up with
    inconsistent types and check_similar_hists must return False.
    """
    hist_list = ["date:A_score:num_employees"]
    features = ["A_score:num_employees"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist",
            store_key="output_hist",
            features=hist_list,
            filter_empty_split_hists=False,
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore and isinstance(datastore["output_hist"], dict)
    output = datastore["output_hist"]
    assert len(output) == len(features)
    for name in features:
        assert name in output
    for name in features:
        assert isinstance(output[name], pd.DataFrame)

    for name in features:
        # Turn the frame into row records and collect the histogram of each row.
        records = output[name].reset_index().to_dict("records")
        similar = check_similar_hists([record["histogram"] for record in records])
        assert similar is False
示例#25
0
def test_chi_ReferenceNormHistComparer():
    """Run ReferenceNormHistComparer and check the first chi2 value of A_score.

    The split histograms are used both as reference and as the key being
    compared; results are stored under 'comparisons'.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="example_hist",
            ),
            HistSplitter(
                read_key="example_hist", store_key="output_hist", features=hist_list
            ),
            ReferenceNormHistComparer(
                reference_key="output_hist",
                assign_to_key="output_hist",
                store_key="comparisons",
            ),
        ]
    )
    datastore = pipeline.transform(datastore={})

    assert "comparisons" in datastore
    for f in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert f in datastore["comparisons"]

    df = datastore["comparisons"]["A_score"]
    # Use .iloc for positional access: df["chi2"][0] relies on a deprecated
    # pandas fallback that treats integer keys as positions on label indexes.
    # NOTE(review): assumes the first row is the intended one (i.e. the index
    # is not an integer index where label 0 differs from position 0) — confirm.
    np.testing.assert_almost_equal(df["chi2"].iloc[0], 2.2884111855886022)
示例#26
0
def test_report_generator():
    """Build a report from reference comparisons and check the stored outputs.

    Verifies the comparison frames, that the SectionGenerator kept last_n=2,
    and that a non-empty string is stored under 'final_report'.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist",
            store_key="output_hist",
            features=hist_list,
        ),
        ReferenceHistComparer(
            reference_key="output_hist",
            assign_to_key="output_hist",
            store_key="comparison",
        ),
        SectionGenerator(
            read_key="comparison",
            store_key="all_sections",
            section_name="Comparisons",
            last_n=2,
        ),
        ReportGenerator(read_key="all_sections", store_key="final_report"),
    ]
    pipeline = Pipeline(modules=modules)
    datastore = pipeline.transform(datastore={})

    assert "comparison" in datastore and isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # The second-to-last module is the SectionGenerator configured above.
    assert pipeline.modules[-2].last_n == 2
    assert "final_report" in datastore
    assert (
        isinstance(datastore["final_report"], str)
        and len(datastore["final_report"]) > 0
    )
示例#27
0
def test_expanding_hist():
    """Check the bin entries of the last ``histogram_sum`` for ``num_employees``.

    The pipeline chains JsonReader, HistSplitter and ApplyFunc; the latter
    applies ``expanding_hist`` with ``shift=1``, ``suffix="sum"``,
    ``entire=True`` and ``hist_name="histogram"`` to ``output_hist``. The
    ``bin_entries()`` of the last ``histogram_sum`` value in the
    ``num_employees`` frame are compared against a 113-element reference.
    """
    hist_names = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist",
        store_key="output_hist",
        features=hist_names,
    )
    expander = ApplyFunc(
        apply_to_key="output_hist",
        apply_funcs=[
            {
                "func": expanding_hist,
                "shift": 1,
                "suffix": "sum",
                "entire": True,
                "hist_name": "histogram",
            }
        ],
    )

    datastore = Pipeline(modules=[reader, splitter, expander]).transform(
        datastore={}
    )

    frame = datastore["output_hist"]["num_employees"]
    last_hist = frame["histogram_sum"].values[-1]
    observed = last_hist.bin_entries()

    # Reference is sparse: 113 bins, all zero except the ones set below.
    expected = np.zeros(113)
    expected[0] = 11.0
    expected[[1, 2, 10, 46, 54, 63, 78, 112]] = 1.0

    np.testing.assert_array_almost_equal(observed, expected)
示例#28
0
def test_expand_norm_hist_mean_cov():
    """Check the second-to-last ``histogram_mean`` value for ``num_employees``.

    The pipeline chains JsonReader, HistSplitter and ApplyFunc; the latter
    applies ``expand_norm_hist_mean_cov`` with ``hist_name="histogram"``,
    ``shift=1``, ``suffix=""`` and ``entire=True`` to ``output_hist``. The
    test first verifies that every expected feature is present in
    ``output_hist`` and then compares ``histogram_mean`` at position ``-2``
    of the ``num_employees`` frame against a 66-element reference.
    """
    hist_names = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist",
        store_key="output_hist",
        features=hist_names,
    )
    mean_cov = ApplyFunc(
        apply_to_key="output_hist",
        apply_funcs=[
            {
                "func": expand_norm_hist_mean_cov,
                "hist_name": "histogram",
                "shift": 1,
                "suffix": "",
                "entire": True,
            }
        ],
    )

    datastore = Pipeline(modules=[reader, splitter, mean_cov]).transform(
        datastore={}
    )

    assert "output_hist" in datastore
    output_hist = datastore["output_hist"]
    expected_features = (
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    )
    for name in expected_features:
        assert name in output_hist

    frame = output_hist["num_employees"]
    observed_mean = frame["histogram_mean"].values[-2]

    # Reference is sparse: 66 bins, all zero except the ones set below.
    expected_mean = np.zeros(66)
    expected_mean[0] = 0.56666667
    expected_mean[[1, 2, 63]] = 0.03333333
    expected_mean[[10, 46, 54, 64, 65]] = 0.06666667

    np.testing.assert_array_almost_equal(observed_mean, expected_mean)