Пример #1
0
def test_expanding_hist():
    """Compare the last expanding-sum histogram of ``num_employees`` to a reference.

    The pipeline reads ``example_histogram.json``, splits it with
    ``HistSplitter`` and applies ``expanding_hist`` (``shift=1``). The bin
    entries of the final ``histogram_sum`` value must match ``expected``.
    """
    features = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    reader = JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist")
    splitter = HistSplitter(read_key='example_hist', store_key='output_hist', features=features)
    expander = ApplyFunc(
        apply_to_key='output_hist',
        apply_funcs=[dict(func=expanding_hist, shift=1, suffix='sum', entire=True, hist_name='histogram')],
    )
    datastore = Pipeline(modules=[reader, splitter, expander]).transform(datastore={})

    # Last entry of the expanding-sum column for the feature under test.
    last_sum = datastore['output_hist']['num_employees']['histogram_sum'].values[-1]

    expected = np.array([11., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
                         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
                         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
                         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0., 0., 0., 0., 1.])

    np.testing.assert_array_almost_equal(last_sum.hist.bin_entries(), expected)
Пример #2
0
def test_normalized_hist_mean_cov():
    """Check mean, covariance and binning produced by ``normalized_hist_mean_cov``.

    Every row of the ``A_score`` frame is compared against the same reference
    mean vector, bin centres and covariance matrix.
    """
    features = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    reader = JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist")
    splitter = HistSplitter(read_key='example_hist', store_key='output_hist', features=features)
    normalizer = ApplyFunc(apply_to_key='output_hist', assign_to_key='output_hist',
                           apply_funcs=[dict(func=normalized_hist_mean_cov, suffix='')])
    datastore = Pipeline(modules=[reader, splitter, normalizer]).transform(datastore={})

    assert 'output_hist' in datastore
    for name in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert name in datastore['output_hist']

    a_score = datastore['output_hist']['A_score']

    expected_cov = np.array([[0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
                             [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
                             [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
                             [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
                             [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625]])

    rows = zip(
        a_score['histogram_mean'].values,
        a_score['histogram_cov'].values,
        a_score['histogram_binning'].values,
    )
    for mean, cov, binning in rows:
        np.testing.assert_array_almost_equal(mean, [0.3125, 0.03125, 0.1875, 0.40625, 0.0625])
        np.testing.assert_array_almost_equal(binning, [1.5, 2.5, 3.5, 4.5, 5.5])
        np.testing.assert_array_almost_equal(cov, expected_cov)
Пример #3
0
def test_chi_squared2():
    """Check ``relative_chi_squared`` against an expanding normalized reference.

    The pipeline reads ``example_histogram.json``, splits it per feature,
    applies ``expand_norm_hist_mean_cov`` (``shift=1``) and then
    ``relative_chi_squared``. One ``chi2`` value per feature is compared with
    a reference number.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[
            dict(func=expand_norm_hist_mean_cov, hist_name='histogram', shift=1, suffix='', entire=True)]),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[dict(func=relative_chi_squared, suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    assert 'output_hist' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['output_hist']

    # Index through ``.values`` so the lookups are explicitly positional;
    # plain ``series[-1]`` depends on the deprecated positional fallback of
    # ``Series.__getitem__``.
    df = datastore['output_hist']['A_score']
    np.testing.assert_almost_equal(df['chi2'].values[-1], 4.066666666666674)
    df = datastore['output_hist']['A_score:num_employees']
    np.testing.assert_almost_equal(df['chi2'].values[-2], 3.217532467532462)
    df = datastore['output_hist']['bankrupt']
    np.testing.assert_almost_equal(df['chi2'].values[-1], 0.11718750000000011)
    df = datastore['output_hist']['country']
    np.testing.assert_almost_equal(df['chi2'].values[-1], 0.6093749999999999)
    df = datastore['output_hist']['num_employees']
    np.testing.assert_almost_equal(df['chi2'].values[-1], 1.1858766233766194)
Пример #4
0
def test_variance_comparer():
    """Check ``mae_pull`` values computed from whole-column mean and std.

    A first ``ApplyFunc`` adds ``_std`` and ``_mean`` columns (``entire=True``)
    and a second one applies ``pull`` row-wise (``axis=1``) using those columns.
    """
    stats = ApplyFunc(apply_to_key="to_profile",
                      features=["the_feature", "dummy_feature"])
    for func, suffix in ((np.std, "_std"), (np.mean, "_mean")):
        stats.add_apply_func(func, suffix=suffix, entire=True)

    pulls = ApplyFunc(apply_to_key="to_profile",
                      features=["the_feature", "dummy_feature"])
    pulls.add_apply_func(pull, suffix="_pull", axis=1,
                         suffix_mean="_mean", suffix_std="_std")

    datastore = {"to_profile": test_comparer_df}
    datastore = Pipeline(modules=[stats, pulls]).transform(datastore)
    profiled = datastore["to_profile"]

    feature_pull = profiled["the_feature"]["mae_pull"].values
    np.testing.assert_almost_equal(feature_pull[2], -0.1017973, 5)
    np.testing.assert_almost_equal(feature_pull[3], 1.934149074, 6)

    dummy_pull = profiled["dummy_feature"]["mae_pull"].values
    np.testing.assert_almost_equal(dummy_pull[0], -0.6107839182)
Пример #5
0
def test_expand_norm_hist_mean_cov():
    """Check the expanding normalized mean of ``num_employees``.

    After applying ``expand_norm_hist_mean_cov`` (``shift=1``), the
    second-to-last ``histogram_mean`` value is compared with a reference vector.
    """
    features = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    reader = JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist")
    splitter = HistSplitter(read_key='example_hist', store_key='output_hist', features=features)
    expander = ApplyFunc(apply_to_key='output_hist', apply_funcs=[
        dict(func=expand_norm_hist_mean_cov, hist_name='histogram', shift=1, suffix='', entire=True)])
    datastore = Pipeline(modules=[reader, splitter, expander]).transform(datastore={})

    assert 'output_hist' in datastore
    for name in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert name in datastore['output_hist']

    observed = datastore['output_hist']['num_employees']['histogram_mean'].values[-2]

    expected = np.array([0.56666667, 0.03333333, 0.03333333, 0., 0.,
                         0., 0., 0., 0., 0.,
                         0.06666667, 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0., 0.,
                         0., 0.06666667, 0., 0., 0.,
                         0., 0., 0., 0., 0.06666667,
                         0., 0., 0., 0., 0.,
                         0., 0., 0., 0.03333333, 0.06666667, 0.06666667])

    np.testing.assert_array_almost_equal(observed, expected)
Пример #6
0
def test_chi_squared1():
    """Check ``relative_chi_squared`` against a rolling normalized reference.

    The pipeline reads ``example_histogram.json``, splits it per feature,
    applies ``roll_norm_hist_mean_cov`` (``window=5``, ``shift=1``) and then
    ``relative_chi_squared``. One ``chi2`` value per feature is compared with
    a reference number.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[
            dict(func=roll_norm_hist_mean_cov, hist_name='histogram', window=5, shift=1, suffix='', entire=True)]),
        ApplyFunc(apply_to_key='output_hist', apply_funcs=[dict(func=relative_chi_squared, suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    assert 'output_hist' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['output_hist']

    # Index through ``.values`` so the lookups are explicitly positional;
    # plain ``series[6]`` / ``series[-2]`` depends on the deprecated positional
    # fallback of ``Series.__getitem__``.
    df = datastore['output_hist']['A_score']
    np.testing.assert_almost_equal(df['chi2'].values[6], 3.275000000000001)
    df = datastore['output_hist']['A_score:num_employees']
    np.testing.assert_almost_equal(df['chi2'].values[-2], 2.1333333333333315)
    df = datastore['output_hist']['bankrupt']
    np.testing.assert_almost_equal(df['chi2'].values[6], 0.19687500000000002)
    df = datastore['output_hist']['country']
    np.testing.assert_almost_equal(df['chi2'].values[5], 0.8999999999999994)
    df = datastore['output_hist']['num_employees']
    np.testing.assert_almost_equal(df['chi2'].values[5], 0.849999999999999)
Пример #7
0
def test_chi_RollingNormHistComparer():
    """Check the last ``chi2`` value produced by ``RollingNormHistComparer``.

    The comparer reads the split histograms from ``output_hist`` with
    ``window=10`` and stores its result under ``comparisons``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(modules=[
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(read_key="example_hist",
                     store_key="output_hist",
                     features=hist_list),
        RollingNormHistComparer(
            read_key="output_hist", store_key="comparisons", window=10),
    ])
    datastore = pipeline.transform(datastore={})

    assert "comparisons" in datastore
    for f in [
            "A_score",
            "A_score:num_employees",
            "bankrupt",
            "country",
            "num_employees",
    ]:
        assert f in datastore["comparisons"]

    df = datastore["comparisons"]["A_score"]
    # ``.values[-1]`` is explicitly positional; ``series[-1]`` depends on the
    # deprecated positional fallback of ``Series.__getitem__``.
    np.testing.assert_almost_equal(df["chi2"].values[-1], 37.61910112359518)
Пример #8
0
def test_apply_dynamic_traffic_light_bounds():
    """Check the traffic-light bound columns produced by ``DynamicBounds``.

    Mean/std columns are added for metrics ``a`` and ``b``, ``pull`` is applied
    row-wise, and ``DynamicBounds`` stores its result under ``tl``. The first
    row of every expected bound column is compared with a reference value.
    """
    pull_rules = {"*_pull": [7, 4, -4, -7]}

    stats = ApplyFunc(
        apply_to_key="to_profile", features=["asc_numbers"], metrics=["a", "b"]
    )
    stats.add_apply_func(np.std, suffix="_std")
    stats.add_apply_func(np.mean, suffix="_mean")

    pulls = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
    pulls.add_apply_func(
        pull, suffix="_pull", axis=1, suffix_mean="_mean", suffix_std="_std"
    )

    bounds = DynamicBounds(
        read_key="to_profile",
        store_key="tl",
        rules=pull_rules,
        suffix_mean="_mean",
        suffix_std="_std",
    )

    datastore = {"to_profile": {"asc_numbers": get_test_data()}}
    datastore = Pipeline(modules=[stats, pulls, bounds]).transform(datastore)

    assert "tl" in datastore
    traffic_lights = datastore["tl"]
    assert "asc_numbers" in traffic_lights
    frame = traffic_lights["asc_numbers"]

    # Expected value of the first row for each bound column.
    expected_first_row = {
        "traffic_light_a_red_high": 251.5624903,
        "traffic_light_a_yellow_high": 164.96428019,
        "traffic_light_a_yellow_low": -65.96428019,
        "traffic_light_a_red_low": -152.56249033,
        "traffic_light_b_red_high": 5.0,
        "traffic_light_b_yellow_high": 3.5,
        "traffic_light_b_yellow_low": -0.5,
        "traffic_light_b_red_low": -2.0,
    }

    for column in expected_first_row:
        assert column in frame.columns

    for column, expected in expected_first_row.items():
        np.testing.assert_almost_equal(frame[column].values[0], expected)
Пример #9
0
def test_get_histograms_module():
    """Run ``PandasHistogrammar`` through a pipeline and compare JSON output.

    The histograms stored under ``output`` are serialised with ``toJson()`` and
    compared with the reference objects attached to the ``pytest`` module.
    """
    columns = [
        'date', 'isActive', 'age', 'eyeColor', 'gender', 'company', 'latitude',
        'longitude', ['isActive', 'age'], ['latitude', 'longitude']
    ]
    binning = {
        'longitude': {'bin_width': 5, 'bin_offset': 0},
        'latitude': {'bin_width': 5, 'bin_offset': 0},
    }
    pandas_filler = PandasHistogrammar(features=columns,
                                       bin_specs=binning,
                                       read_key='input',
                                       store_key='output')

    datastore = Pipeline(modules=[pandas_filler]).transform(datastore={'input': pytest.test_df})

    assert 'output' in datastore
    current_hists = datastore['output']

    # Histogram key -> reference JSON, checked in this order.
    references = [
        ('age', pytest.age),
        ('company', pytest.company),
        ('date', pytest.date),
        ('eyeColor', pytest.eyesColor),
        ('gender', pytest.gender),
        ('isActive', pytest.isActive),
        ('isActive:age', pytest.isActive_age),
        ('latitude', pytest.latitude),
        ('longitude', pytest.longitude),
        ('latitude:longitude', pytest.latitude_longitude),
    ]
    for key, reference in references:
        assert current_hists[key].toJson() == reference
Пример #10
0
def test_popmon_pipeline():
    """Run a nested pipeline and compare its result with a direct computation.

    The expected value is the weighted mean of ``log(x) ** 2``. The pipeline
    computes it with ``LogTransformer``, ``PowerTransformer``,
    ``SumNormalizer`` and ``WeightedSum``.
    """
    logger = logging.getLogger()
    handler = logging.StreamHandler()
    previous_level = logger.level
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    try:
        datastore = {"x": np.array([7, 2, 7, 9, 6]), "weights": np.array([1, 1, 2, 1, 2])}
        expected_result = np.sum(
            np.power(np.log(datastore["x"]), 2) * datastore["weights"]
        ) / np.sum(datastore["weights"])

        log_pow_pipeline = Pipeline(
            modules=[
                LogTransformer(input_key="x", output_key="log_x"),
                PowerTransformer(input_key="log_x", output_key="log_pow_x", power=2),
            ]
        )

        pipeline = Pipeline(
            modules=[
                log_pow_pipeline,
                SumNormalizer(input_key="weights", output_key="norm_weights"),
                WeightedSum(
                    input_key="log_pow_x", weight_key="norm_weights", output_key="res"
                ),
            ],
            logger=logger,
        )

        # The two computations sum in a different order, so compare with a
        # tolerance instead of exact float equality.
        np.testing.assert_allclose(pipeline.transform(datastore)["res"], expected_result)
    finally:
        # Do not leak the handler or level change on the root logger into
        # other tests.
        logger.removeHandler(handler)
        logger.setLevel(previous_level)
Пример #11
0
def test_hists_stability_metrics():
    """Check that ``stability_metrics`` returns the expected top-level keys."""
    # get histograms
    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"), store_key="hists"
            )
        ]
    )
    datastore = pipeline.transform(datastore={})
    hists = datastore["hists"]

    # generate metrics
    # "date:bankrupt" was listed twice; each feature is now requested once.
    hist_list = [
        "date:bankrupt",
        "date:country",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    ds = stability_metrics(
        hists, reference_type="rolling", window=5, features=hist_list
    )

    cols = ["profiles", "comparisons", "traffic_lights", "alerts"]
    for c in cols:
        assert c in ds.keys()
Пример #12
0
def test_pull():
    """Check the first ``a_pull`` and ``b_pull`` values for ``asc_numbers``.

    Whole-column std and mean are added first (``entire=True``), then ``pull``
    is applied row-wise (``axis=1``) for columns ``a`` and ``b``.
    """
    stats = ApplyFunc(apply_to_key="to_profile")
    for func, suffix in ((np.std, "_std"), (np.mean, "_mean")):
        stats.add_apply_func(func, suffix=suffix, entire=True)

    pulls = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
    pull_options = dict(
        suffix="_pull",
        axis=1,
        suffix_mean="_mean",
        suffix_std="_std",
        cols=["a", "b"],
    )
    pulls.add_apply_func(pull, **pull_options)

    datastore = {"to_profile": {"asc_numbers": get_test_data()}}
    datastore = Pipeline(modules=[stats, pulls]).transform(datastore)

    frame = datastore["to_profile"]["asc_numbers"]

    np.testing.assert_almost_equal(frame["a_pull"].values[0], -1.714816)
    np.testing.assert_almost_equal(frame["b_pull"].values[0], -1.0)
Пример #13
0
def test_chi_squared1():
    """Check ``relative_chi_squared`` against a rolling normalized reference.

    The pipeline reads ``example_histogram.json``, splits it per feature,
    applies ``roll_norm_hist_mean_cov`` (``window=5``, ``shift=1``) and then
    ``relative_chi_squared``. One ``chi2`` value per feature is compared with
    a reference number.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="example_hist",
            ),
            HistSplitter(
                read_key="example_hist", store_key="output_hist", features=hist_list
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[
                    dict(
                        func=roll_norm_hist_mean_cov,
                        hist_name="histogram",
                        window=5,
                        shift=1,
                        suffix="",
                        entire=True,
                    )
                ],
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
            ),
        ]
    )
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    for f in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert f in datastore["output_hist"]

    # Index through ``.values`` so the lookups are explicitly positional;
    # plain ``series[6]`` / ``series[-2]`` depends on the deprecated positional
    # fallback of ``Series.__getitem__``.
    df = datastore["output_hist"]["A_score"]
    np.testing.assert_almost_equal(df["chi2"].values[6], 4.25)
    df = datastore["output_hist"]["A_score:num_employees"]
    np.testing.assert_almost_equal(df["chi2"].values[-2], 2.1333333333333315)
    df = datastore["output_hist"]["bankrupt"]
    np.testing.assert_almost_equal(df["chi2"].values[6], 0.40000000000000024)
    df = datastore["output_hist"]["country"]
    np.testing.assert_almost_equal(df["chi2"].values[5], 0.8999999999999994)
    df = datastore["output_hist"]["num_employees"]
    np.testing.assert_almost_equal(df["chi2"].values[5], 0.849999999999999)
Пример #14
0
def test_normalized_hist_mean_cov():
    """Check mean, covariance and binning produced by ``normalized_hist_mean_cov``.

    Every row of the ``A_score`` frame is compared against the same reference
    mean vector, bin centres and covariance matrix.
    """
    features = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=features
    )
    normalizer = ApplyFunc(
        apply_to_key="output_hist",
        assign_to_key="output_hist",
        apply_funcs=[dict(func=normalized_hist_mean_cov, suffix="")],
    )
    datastore = Pipeline(modules=[reader, splitter, normalizer]).transform(
        datastore={}
    )

    assert "output_hist" in datastore
    expected_features = (
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    )
    for name in expected_features:
        assert name in datastore["output_hist"]

    a_score = datastore["output_hist"]["A_score"]

    expected_cov = np.array(
        [
            [0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
            [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
            [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
            [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
            [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625],
        ]
    )

    rows = zip(
        a_score["histogram_mean"].values,
        a_score["histogram_cov"].values,
        a_score["histogram_binning"].values,
    )
    for mean, cov, binning in rows:
        np.testing.assert_array_almost_equal(
            mean, [0.3125, 0.03125, 0.1875, 0.40625, 0.0625]
        )
        np.testing.assert_array_almost_equal(binning, [1.5, 2.5, 3.5, 4.5, 5.5])
        np.testing.assert_array_almost_equal(cov, expected_cov)
Пример #15
0
def test_chi_squared2():
    """Check ``relative_chi_squared`` against an expanding normalized reference.

    The pipeline reads ``example_histogram.json``, splits it per feature,
    applies ``expand_norm_hist_mean_cov`` (``shift=1``) and then
    ``relative_chi_squared``. One ``chi2`` value per feature is compared with
    a reference number.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]

    pipeline = Pipeline(
        modules=[
            JsonReader(
                file_path=resources.data("example_histogram.json"),
                store_key="example_hist",
            ),
            HistSplitter(
                read_key="example_hist", store_key="output_hist", features=hist_list
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[
                    dict(
                        func=expand_norm_hist_mean_cov,
                        hist_name="histogram",
                        shift=1,
                        suffix="",
                        entire=True,
                    )
                ],
            ),
            ApplyFunc(
                apply_to_key="output_hist",
                apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
            ),
        ]
    )
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    for f in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert f in datastore["output_hist"]

    # Index through ``.values`` so the lookups are explicitly positional;
    # plain ``series[-1]`` depends on the deprecated positional fallback of
    # ``Series.__getitem__``.
    df = datastore["output_hist"]["A_score"]
    np.testing.assert_almost_equal(df["chi2"].values[-1], 9.891821919006366)
    df = datastore["output_hist"]["A_score:num_employees"]
    np.testing.assert_almost_equal(df["chi2"].values[-2], 3.217532467532462)
    df = datastore["output_hist"]["bankrupt"]
    np.testing.assert_almost_equal(df["chi2"].values[-1], 0.23767605633802757)
    df = datastore["output_hist"]["country"]
    np.testing.assert_almost_equal(df["chi2"].values[-1], 1.3717532467532458)
    df = datastore["output_hist"]["num_employees"]
    np.testing.assert_almost_equal(df["chi2"].values[-1], 1.1858766233766194)
Пример #16
0
def test_self_reference():
    """Smoke test: the ``self_reference`` pipeline runs on the example histograms."""
    features = ['date:A_score', 'date:A_score:num_employees']

    reader = JsonReader(file_path=resources.data("example_histogram.json"),
                        store_key="hists")
    reference = self_reference(hists_key='hists', features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
Пример #17
0
def test_expanding_reference():
    """Smoke test: the ``expanding_reference`` pipeline runs on the example histograms."""
    features = ["date:bankrupt", "date:num_employees"]

    reader = JsonReader(file_path=resources.data("example_histogram.json"),
                        store_key="hists")
    reference = expanding_reference(hists_key="hists", features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
Пример #18
0
def test_rolling_reference():
    """Smoke test: the ``rolling_reference`` pipeline (``window=5``) runs on the example histograms."""
    features = ["date:country", "date:A_score:num_employees"]

    reader = JsonReader(file_path=resources.data("example_histogram.json"),
                        store_key="hists")
    reference = rolling_reference(hists_key="hists", window=5, features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
Пример #19
0
def test_report_traffic_light_bounds():
    """Smoke test: build a report section with static and dynamic bounds.

    Expanding mean/std columns are added for metrics ``a`` and ``b``, ``pull``
    is applied row-wise, static and dynamic traffic-light bounds are computed,
    and ``SectionGenerator`` is run on the result. The test only verifies that
    the pipeline runs without raising.
    """
    datastore = dict()
    datastore["to_profile"] = {"asc_numbers": get_test_data()}

    conf = {
        "monitoring_rules": {
            "the_feature:mae": [8, 4, 2, 0.15],
            "mse": [0.2, 0.11, 0.09, 0],
            "mae": [1, 0, 0, 0],
            "*_pull": [7, 4, -4, -7]
        },
        "pull_rules": {
            "*_pull": [7, 4, -4, -7]
        }
    }

    m1 = ApplyFunc(apply_to_key="to_profile",
                   features=["asc_numbers"],
                   metrics=['a', 'b'])
    # The suffixes were swapped (mean stored as "_std", std as "_mean"), so
    # ``pull`` below read the two columns the wrong way round.
    m1.add_apply_func(expanding_mean, suffix='_mean', entire=True)
    m1.add_apply_func(expanding_std, suffix='_std', entire=True)

    m2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
    m2.add_apply_func(pull,
                      suffix='_pull',
                      axis=1,
                      suffix_mean='_mean',
                      suffix_std='_std')

    ctlb = ComputeTLBounds(
        read_key="to_profile",
        store_key="static_tlb",
        monitoring_rules=conf["monitoring_rules"],
    )

    m3 = ComputeTLBounds(read_key="to_profile",
                         monitoring_rules=conf["pull_rules"],
                         apply_funcs_key="dynamic_tlb",
                         func=pull_bounds,
                         metrics_wide=True,
                         axis=1)

    # Applies the functions that m3 registered under "dynamic_tlb".
    m4 = ApplyFunc(
        apply_to_key=m3.read_key,
        assign_to_key='dtlb',
        apply_funcs_key="dynamic_tlb",
    )

    rg = SectionGenerator(read_key="to_profile",
                          store_key="section",
                          section_name="Profiles",
                          dynamic_bounds='dtlb',
                          static_bounds='static_tlb')

    pipeline = Pipeline(modules=[m1, m2, ctlb, m3, m4, rg])
    pipeline.transform(datastore)
Пример #20
0
def test_external_reference():
    """Smoke test: ``external_reference`` runs with the histograms as their own reference."""
    features = ["date:country", "date:bankrupt"]

    reader = JsonReader(file_path=resources.data("example_histogram.json"),
                        store_key="hists")
    reference = external_reference(
        hists_key="hists", ref_hists_key="hists", features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
def test_get_histograms_module(spark_co):
    """Run ``SparkHistogrammar`` through a pipeline and compare JSON output.

    The reference objects on the ``pytest`` module are first patched in place
    with the names used for this comparison, then a subset of the histograms
    stored under ``output`` is compared with them.
    """
    # Reference attribute on ``pytest`` -> name written into its JSON.
    patched_names = {
        "age": "b'age'",
        "company": "b'company'",
        "eyesColor": "b'eyeColor'",
        "gender": "b'gender'",
        "isActive": "b'isActive'",
        "latitude": "b'latitude'",
        "longitude": "b'longitude'",
        "latitude_longitude": "b'latitude:longitude'",
    }
    for attribute, name in patched_names.items():
        getattr(pytest, attribute)["data"]["name"] = name
    pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

    spark_df = spark_co.createDataFrame(pytest.test_df)

    columns = [
        "date",
        "isActive",
        "age",
        "eyeColor",
        "gender",
        "company",
        "latitude",
        "longitude",
        ["isActive", "age"],
        ["latitude", "longitude"],
    ]
    binning = {
        "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
        "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
    }
    spark_filler = SparkHistogrammar(
        features=columns,
        bin_specs=binning,
        read_key="input",
        store_key="output",
    )

    # test transform() function call
    datastore = Pipeline(modules=[spark_filler]).transform(
        datastore={"input": spark_df})

    assert "output" in datastore
    current_hists = datastore["output"]

    references = [
        ("age", pytest.age),
        ("company", pytest.company),
        ("eyeColor", pytest.eyesColor),
        ("gender", pytest.gender),
        ("latitude", pytest.latitude),
        ("longitude", pytest.longitude),
    ]
    for key, reference in references:
        assert current_hists[key].toJson() == reference
Пример #22
0
def test_hists_stability_report():
    """Smoke test: ``stability_report`` runs on the example histograms."""
    # get histograms
    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="hists"),
    ])
    datastore = pipeline.transform(datastore={})
    hists = datastore['hists']

    # generate report
    # 'date:bankrupt' was listed twice; each feature is now requested once.
    hist_list = ['date:bankrupt', 'date:country', 'date:A_score', 'date:A_score:num_employees']
    stability_report(hists, reference_type='rolling', window=5, features=hist_list)
Пример #23
0
def test_hist_compare():
    """Check the last ``chi2`` from comparing each histogram with its expanding sum.

    ``expanding_hist`` (``shift=1``) adds the ``histogram_sum`` column, and
    ``hist_compare`` is applied row-wise on ``histogram`` and
    ``histogram_sum``, with results stored under ``comparison``.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ApplyFunc(apply_to_key='output_hist',
                  apply_funcs=[dict(func=expanding_hist, shift=1, suffix='sum', entire=True, hist_name='histogram')]),
        ApplyFunc(apply_to_key='output_hist', assign_to_key='comparison', apply_funcs=[
            dict(func=hist_compare, hist_name1='histogram', hist_name2='histogram_sum', suffix='', axis=1)])
    ])
    datastore = pipeline.transform(datastore={})

    df = datastore['comparison']['num_employees']
    # chi2 is a computed float: compare with a tolerance, as the other chi2
    # tests do, rather than bit-for-bit.
    np.testing.assert_almost_equal(df['chi2'].values[-1], 0.7017543859649122)
Пример #24
0
def test_rolling_window_funcs():
    """Check ``rolling_mean`` and ``rolling_lr`` outputs for metric ``a``.

    The rolling mean uses ``window=3``; slope and intercept come from
    ``rolling_lr`` with ``window=10`` and ``index`` 0 and 1 respectively.
    """
    module = ApplyFunc(
        apply_to_key="to_profile", features=["asc_numbers"], metrics=["a", "b"]
    )
    module.add_apply_func(
        rolling_mean, suffix="_rolling_3_mean", entire=True, window=3, shift=0
    )
    for suffix, index in (("_rolling_10_slope", 0), ("_rolling_10_intercept", 1)):
        module.add_apply_func(
            rolling_lr, suffix=suffix, entire=True, window=10, index=index
        )

    datastore = {"to_profile": {"asc_numbers": get_test_data()}}
    datastore = Pipeline(modules=[module]).transform(datastore)
    frame = datastore["to_profile"]["asc_numbers"]

    # Leading NaNs cover the rows before a full window is available.
    expected_mean = [np.nan] * 2 + list(range(1, 99))
    expected_slope = [np.nan] * 9 + [1.0] * 91
    expected_intercept = [np.nan] * 9 + [float(i) for i in range(0, 91)]

    np.testing.assert_array_almost_equal(
        frame["a_rolling_3_mean"].tolist(), expected_mean
    )
    np.testing.assert_array_almost_equal(
        frame["a_rolling_10_slope"].tolist(), expected_slope
    )
    np.testing.assert_array_almost_equal(
        frame["a_rolling_10_intercept"].tolist(), expected_intercept
    )
Пример #25
0
def test_chi_RollingNormHistComparer():
    """Check the last ``chi2`` value produced by ``RollingNormHistComparer``.

    The comparer reads the split histograms from ``output_hist`` with
    ``window=10`` and stores its result under ``comparisons``.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score', 'date:A_score:num_employees']

    pipeline = Pipeline(modules=[
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        RollingNormHistComparer(read_key='output_hist', store_key='comparisons', window=10)
    ])
    datastore = pipeline.transform(datastore={})

    assert 'comparisons' in datastore
    for f in ['A_score', 'A_score:num_employees', 'bankrupt', 'country', 'num_employees']:
        assert f in datastore['comparisons']

    df = datastore['comparisons']['A_score']
    # ``.values[-1]`` is explicitly positional; ``series[-1]`` depends on the
    # deprecated positional fallback of ``Series.__getitem__``.
    np.testing.assert_almost_equal(df['chi2'].values[-1], 45.200000)
Пример #26
0
def test_traffic_light_summary_combination():
    """Check the ``_AGGREGATE_`` alert summary of the last row.

    Traffic-light bounds are computed (ignoring ``dummy_feature``), applied,
    summarised with ``traffic_light_summary`` into ``alerts``, and then
    aggregated by ``AlertsSummary``.
    """
    monitoring_rules = {
        "the_feature:mae": [8, 4, 2, 0.15],
        "dummy_feature:*": [0, 0, 0, 0],
        "mse": [0.2, 0.11, 0.09, 0],
        "mae": [0, 0, 0, 0],
        "*": [0, 0, 0, 0],
    }

    bounds = ComputeTLBounds(
        read_key="test_data",
        store_key="traffic_light_bounds",
        apply_funcs_key="traffic_light_funcs",
        ignore_features=["dummy_feature"],
        monitoring_rules=monitoring_rules,
        prefix="tl_",
    )

    apply_bounds = ApplyFunc(
        apply_to_key=bounds.read_key,
        assign_to_key="output_data",
        apply_funcs_key="traffic_light_funcs",
    )

    summarise = ApplyFunc(
        apply_to_key="output_data",
        apply_funcs=[dict(func=traffic_light_summary, axis=1, suffix="")],
        assign_to_key="alerts",
    )

    aggregate = AlertsSummary(read_key="alerts")

    datastore = {"test_data": test_comparer_df}
    datastore = Pipeline(
        modules=[bounds, apply_bounds, summarise, aggregate]
    ).transform(datastore)

    assert "_AGGREGATE_" in datastore["alerts"]
    summary = datastore["alerts"]["_AGGREGATE_"]

    # Expected value of the last row for each summary column.
    expected_last_row = {"worst": 2, "n_green": 1, "n_yellow": 0, "n_red": 1}
    for column, expected in expected_last_row.items():
        assert summary[column].values[-1] == expected
Пример #27
0
def test_get_histograms_module():
    """Fill histograms with NumpyHistogrammar via a Pipeline and compare to references.

    The input is ``pytest.test_df`` converted to a record array. The ``toJson()``
    output of every produced histogram must equal the matching reference object
    stored on the ``pytest`` namespace.
    """
    single_features = [
        "date", "isActive", "age", "eyeColor",
        "gender", "company", "latitude", "longitude",
    ]
    paired_features = [["isActive", "age"], ["latitude", "longitude"]]
    # The comprehension builds a fresh spec dict per axis, so the two entries
    # never share one object.
    bin_specs = {
        axis: {"bin_width": 5, "bin_offset": 0}
        for axis in ("longitude", "latitude")
    }

    np_filler = NumpyHistogrammar(
        features=single_features + paired_features,
        bin_specs=bin_specs,
        read_key="input",
        store_key="output",
    )
    pipeline = Pipeline(modules=[np_filler])
    records = pytest.test_df.to_records(index=False)
    datastore = pipeline.transform(datastore={"input": records})

    assert "output" in datastore
    current_hists = datastore["output"]

    # (histogram key, reference JSON); note the reference attribute names do
    # not always match the histogram key (eyeColor vs. eyesColor, ':' vs. '_').
    expectations = (
        ("age", pytest.age),
        ("company", pytest.company),
        ("date", pytest.date),
        ("eyeColor", pytest.eyesColor),
        ("gender", pytest.gender),
        ("isActive", pytest.isActive),
        ("isActive:age", pytest.isActive_age),
        ("latitude", pytest.latitude),
        ("longitude", pytest.longitude),
        ("latitude:longitude", pytest.latitude_longitude),
    )
    for hist_key, reference in expectations:
        assert current_hists[hist_key].toJson() == reference
Пример #28
0
def test_get_histograms_module(spark_co):
    """Fill histograms with SparkHistogrammar via a Pipeline and compare to references.

    The reference objects on the ``pytest`` namespace are first adjusted to the
    names used in this test's expectations (``b'...'`` style), then a Spark
    DataFrame built from ``pytest.test_df`` is pushed through the pipeline and
    a subset of the resulting histograms is compared via ``toJson()``.
    """
    # NOTE(review): these assignments edit the shared reference dicts in place,
    # so the new names stay visible to anything that reads them afterwards.
    renamed = (
        ("age", "b'age'"),
        ("company", "b'company'"),
        ("eyesColor", "b'eyeColor'"),
        ("gender", "b'gender'"),
        ("isActive", "b'isActive'"),
        ("latitude", "b'latitude'"),
        ("longitude", "b'longitude'"),
        ("latitude_longitude", "b'latitude:longitude'"),
    )
    for attr, hist_name in renamed:
        getattr(pytest, attr)["data"]["name"] = hist_name
    pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

    spark_df = spark_co.createDataFrame(pytest.test_df)

    single_features = [
        'date', 'isActive', 'age', 'eyeColor',
        'gender', 'company', 'latitude', 'longitude',
    ]
    paired_features = [['isActive', 'age'], ['latitude', 'longitude']]
    # One fresh spec dict per axis (float-valued for the Spark filler).
    bin_specs = {
        axis: {'bin_width': 5.0, 'bin_offset': 0.0}
        for axis in ('longitude', 'latitude')
    }
    spark_filler = SparkHistogrammar(
        features=single_features + paired_features,
        bin_specs=bin_specs,
        read_key='input',
        store_key='output',
    )

    # Exercise the module through Pipeline.transform().
    datastore = Pipeline(modules=[spark_filler]).transform(datastore={'input': spark_df})

    assert 'output' in datastore
    current_hists = datastore['output']

    # Only these six histograms are compared; the others are filled but unchecked.
    expectations = (
        ('age', pytest.age),
        ('company', pytest.company),
        ('eyeColor', pytest.eyesColor),
        ('gender', pytest.gender),
        ('latitude', pytest.latitude),
        ('longitude', pytest.longitude),
    )
    for hist_key, reference in expectations:
        assert current_hists[hist_key].toJson() == reference
Пример #29
0
def test_expanding_hist_comparer():
    """Run ExpandingHistComparer on the example histograms and check its output.

    Verifies that the ``comparison`` entry is a dict holding one DataFrame per
    feature, and for each feature checks the row count, the exact set of
    columns and the mean of the ``expanding_chi2`` column.
    """
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score']
    features = ['country', 'bankrupt', 'num_employees', 'A_score']

    expected_columns = sorted([
        'expanding_pearson',
        'expanding_chi2', 'expanding_chi2_zscore', 'expanding_chi2_norm',
        'expanding_chi2_pvalue', 'expanding_chi2_max_residual', 'expanding_chi2_spike_count',
        'expanding_ks', 'expanding_ks_zscore', 'expanding_ks_pvalue',
        'expanding_max_prob_diff',
        'expanding_unknown_labels',
    ])

    # (feature, expected number of rows, expected mean of expanding_chi2)
    expectations = (
        ('A_score', 16, 2.8366236044275257),
        ('country', 17, 1.1224348056645368),
        ('bankrupt', 17, 0.6901425387043608),
        ('num_employees', 17, 4.243731870738727),
    )

    steps = [
        JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist"),
        HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list),
        ExpandingHistComparer(read_key='output_hist', store_key='comparison'),
    ]
    result = Pipeline(modules=steps).transform(datastore={})

    assert 'comparison' in result and isinstance(result['comparison'], dict)
    comparison = result['comparison']
    assert len(comparison.keys()) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    for name, n_rows, mean_chi2 in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), expected_columns)
        np.testing.assert_almost_equal(frame['expanding_chi2'].mean(), mean_chi2)
Пример #30
0
def test_rolling_hist_comparer():
    """Run RollingHistComparer (window=5) on the example histograms and check its output.

    Verifies that the ``comparison`` entry is a dict holding one DataFrame per
    feature, and for each feature checks the row count, the exact set of
    columns and the mean of the ``roll_chi2`` column.
    """
    features = ['country', 'bankrupt', 'num_employees', 'A_score']
    hist_list = ['date:country', 'date:bankrupt', 'date:num_employees', 'date:A_score']

    cols = [
        'roll_pearson',
        'roll_chi2', 'roll_chi2_zscore', 'roll_chi2_norm',
        'roll_chi2_pvalue', 'roll_chi2_max_residual', 'roll_chi2_spike_count',
        'roll_ks', 'roll_ks_zscore', 'roll_ks_pvalue',
        'roll_max_prob_diff',
        'roll_unknown_labels',
    ]
    sorted_cols = sorted(cols)

    # (feature, expected number of rows, expected mean of roll_chi2)
    expectations = (
        ('A_score', 16, 2.927272727272727),
        ('country', 17, 1.3022619047619046),
        ('bankrupt', 17, 0.7251681783824641),
        ('num_employees', 17, 4.0995701058201055),
    )

    reader = JsonReader(file_path=resources.data("example_histogram.json"), store_key="example_hist")
    splitter = HistSplitter(read_key='example_hist', store_key='output_hist', features=hist_list)
    comparer = RollingHistComparer(read_key='output_hist', store_key='comparison', window=5)
    pipeline = Pipeline(modules=[reader, splitter, comparer])
    datastore = pipeline.transform(datastore={})

    assert 'comparison' in datastore and isinstance(datastore['comparison'], dict)
    assert len(datastore['comparison'].keys()) == len(features)
    for f in features:
        assert f in datastore['comparison']
    for f in features:
        assert isinstance(datastore['comparison'][f], pd.DataFrame)

    for feature, n_rows, mean_chi2 in expectations:
        df = datastore['comparison'][feature]
        assert len(df) == n_rows
        np.testing.assert_array_equal(sorted(df.columns), sorted_cols)
        np.testing.assert_almost_equal(df['roll_chi2'].mean(), mean_chi2)