def test_json_reader():
    """JsonReader should load ``example.json`` into the datastore under ``store_key``."""
    reader = JsonReader(file_path=resources.data("example.json"), store_key="example")
    datastore = reader.transform(datastore={})

    example = datastore["example"]
    assert example["boolean"]
    assert len(example["array"]) == 3
    assert example["category"]["a"] == 0
def test_hists_stability_metrics():
    """``stability_metrics`` should return all four top-level result keys."""
    # get histograms
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    datastore = Pipeline(modules=[reader]).transform(datastore={})
    hists = datastore["hists"]

    # generate metrics
    # NOTE(review): "date:bankrupt" is listed twice -- confirm whether the
    # duplicate is intentional.
    features = [
        "date:bankrupt",
        "date:country",
        "date:bankrupt",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    ds = stability_metrics(
        hists, reference_type="rolling", window=5, features=features
    )

    expected_keys = ["profiles", "comparisons", "traffic_lights", "alerts"]
    for key in expected_keys:
        assert key in ds.keys()
def test_normalized_hist_mean_cov():
    """Check mean, covariance and binning produced by ``normalized_hist_mean_cov``.

    The function is applied through ``ApplyFunc`` to the split histograms; every
    row of the ``A_score`` frame is compared against the same expected values.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            assign_to_key="output_hist",
            apply_funcs=[dict(func=normalized_hist_mean_cov, suffix="")],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    frame = datastore["output_hist"]["A_score"]

    expected_cov = np.array(
        [
            [0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
            [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
            [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
            [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
            [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625],
        ]
    )
    rows = zip(
        frame["histogram_mean"].values,
        frame["histogram_cov"].values,
        frame["histogram_binning"].values,
    )
    for mean, cov, binning in rows:
        np.testing.assert_array_almost_equal(
            mean, [0.3125, 0.03125, 0.1875, 0.40625, 0.0625]
        )
        np.testing.assert_array_almost_equal(binning, [1.5, 2.5, 3.5, 4.5, 5.5])
        np.testing.assert_array_almost_equal(cov, expected_cov)
def test_expanding_hist():
    """Check the bin entries of the last expanding-sum histogram of ``num_employees``.

    ``expanding_hist`` is applied via ``ApplyFunc`` with suffix ``sum``; the test
    reads the last value of the resulting ``histogram_sum`` column.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=expanding_hist,
                    shift=1,
                    suffix="sum",
                    entire=True,
                    hist_name="histogram",
                )
            ],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    last_sum = datastore["output_hist"]["num_employees"]["histogram_sum"].values[-1]
    bin_entries = last_sum.hist.bin_entries()

    # Expected entries, ten per row (113 values in total).
    expected = np.array(
        [
            11., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
            1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
            0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
            0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 1.,
        ]
    )
    np.testing.assert_array_almost_equal(bin_entries, expected)
def test_expand_norm_hist_mean_cov():
    """Check the second-to-last ``histogram_mean`` of ``num_employees``.

    ``expand_norm_hist_mean_cov`` is applied via ``ApplyFunc`` with ``shift=1``
    and an empty suffix.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=expand_norm_hist_mean_cov,
                    hist_name="histogram",
                    shift=1,
                    suffix="",
                    entire=True,
                )
            ],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    mean = datastore["output_hist"]["num_employees"]["histogram_mean"].values[-2]

    # Expected mean, ten values per row (66 values in total).
    expected = np.array(
        [
            0.56666667, 0.03333333, 0.03333333, 0., 0., 0., 0., 0., 0., 0.,
            0.06666667, 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0.06666667, 0., 0., 0.,
            0., 0., 0., 0., 0.06666667, 0., 0., 0., 0., 0.,
            0., 0., 0., 0.03333333, 0.06666667, 0.06666667,
        ]
    )
    np.testing.assert_array_almost_equal(mean, expected)
def test_chi_squared2():
    """Check ``relative_chi_squared`` against an expanding normalized reference.

    The pipeline first applies ``expand_norm_hist_mean_cov`` (``shift=1``) and
    then ``relative_chi_squared`` row-wise (``axis=1``); the resulting ``chi2``
    column is compared against fixed values for each feature.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=expand_norm_hist_mean_cov,
                    hist_name="histogram",
                    shift=1,
                    suffix="",
                    entire=True,
                )
            ],
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expectations = [
        ("A_score", -1, 4.066666666666674),
        ("A_score:num_employees", -2, 3.217532467532462),
        ("bankrupt", -1, 0.11718750000000011),
        ("country", -1, 0.6093749999999999),
        ("num_employees", -1, 1.1858766233766194),
    ]
    for feature, position, expected in expectations:
        chi2 = datastore["output_hist"][feature]["chi2"]
        # Rows are addressed by position. ``series[int]`` only means "position"
        # through a fallback that pandas has deprecated, so index the underlying
        # array explicitly instead.
        np.testing.assert_almost_equal(chi2.values[position], expected)
def test_chi_squared1():
    """Check ``relative_chi_squared`` against a rolling normalized reference.

    The pipeline first applies ``roll_norm_hist_mean_cov`` (``window=5``,
    ``shift=1``) and then ``relative_chi_squared`` row-wise (``axis=1``); the
    resulting ``chi2`` column is compared against fixed values for each feature.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=roll_norm_hist_mean_cov,
                    hist_name="histogram",
                    window=5,
                    shift=1,
                    suffix="",
                    entire=True,
                )
            ],
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expectations = [
        ("A_score", 6, 3.275000000000001),
        ("A_score:num_employees", -2, 2.1333333333333315),
        ("bankrupt", 6, 0.19687500000000002),
        ("country", 5, 0.8999999999999994),
        ("num_employees", 5, 0.849999999999999),
    ]
    for feature, position, expected in expectations:
        chi2 = datastore["output_hist"][feature]["chi2"]
        # Rows are addressed by position. ``series[int]`` only means "position"
        # through a fallback that pandas has deprecated, so index the underlying
        # array explicitly instead.
        np.testing.assert_almost_equal(chi2.values[position], expected)
def test_chi_RollingNormHistComparer():
    """Check the last ``chi2`` value produced by ``RollingNormHistComparer``.

    The comparer runs on the split histograms with ``window=10`` and stores its
    output under ``comparisons``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        RollingNormHistComparer(
            read_key="output_hist", store_key="comparisons", window=10
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "comparisons" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["comparisons"]

    chi2 = datastore["comparisons"]["A_score"]["chi2"]
    # Take the last row by position. ``series[-1]`` only works through a
    # positional fallback that pandas has deprecated.
    np.testing.assert_almost_equal(chi2.values[-1], 37.61910112359518)
def test_normalized_hist_mean_cov():
    """Verify ``normalized_hist_mean_cov`` output for the ``A_score`` feature.

    Each row's ``histogram_mean``, ``histogram_cov`` and ``histogram_binning``
    is compared against one fixed set of expected values.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    mean_cov = ApplyFunc(
        apply_to_key="output_hist",
        assign_to_key="output_hist",
        apply_funcs=[dict(func=normalized_hist_mean_cov, suffix="")],
    )
    pipeline = Pipeline(modules=[reader, splitter, mean_cov])
    datastore = pipeline.transform(datastore={})

    assert "output_hist" in datastore
    expected_features = [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]
    for name in expected_features:
        assert name in datastore["output_hist"]

    df = datastore["output_hist"]["A_score"]
    cov_check = np.array(
        [
            [0.22916667, -0.01041667, -0.0625, -0.13541667, -0.02083333],
            [-0.01041667, 0.015625, 0.01041667, -0.01354167, -0.00208333],
            [-0.0625, 0.01041667, 0.12916667, -0.06458333, -0.0125],
            [-0.13541667, -0.01354167, -0.06458333, 0.240625, -0.02708333],
            [-0.02083333, -0.00208333, -0.0125, -0.02708333, 0.0625],
        ]
    )

    means = df["histogram_mean"].values
    covs = df["histogram_cov"].values
    binnings = df["histogram_binning"].values
    for row_mean, row_cov, row_binning in zip(means, covs, binnings):
        np.testing.assert_array_almost_equal(
            row_mean, [0.3125, 0.03125, 0.1875, 0.40625, 0.0625]
        )
        np.testing.assert_array_almost_equal(row_binning, [1.5, 2.5, 3.5, 4.5, 5.5])
        np.testing.assert_array_almost_equal(row_cov, cov_check)
def test_chi_squared1():
    """Check ``relative_chi_squared`` against a rolling normalized reference.

    The pipeline first applies ``roll_norm_hist_mean_cov`` (``window=5``,
    ``shift=1``) and then ``relative_chi_squared`` row-wise (``axis=1``); the
    resulting ``chi2`` column is compared against fixed values for each feature.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=roll_norm_hist_mean_cov,
                    hist_name="histogram",
                    window=5,
                    shift=1,
                    suffix="",
                    entire=True,
                )
            ],
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expectations = [
        ("A_score", 6, 4.25),
        ("A_score:num_employees", -2, 2.1333333333333315),
        ("bankrupt", 6, 0.40000000000000024),
        ("country", 5, 0.8999999999999994),
        ("num_employees", 5, 0.849999999999999),
    ]
    for feature, position, expected in expectations:
        chi2 = datastore["output_hist"][feature]["chi2"]
        # Rows are addressed by position. ``series[int]`` only means "position"
        # through a fallback that pandas has deprecated, so index the underlying
        # array explicitly instead.
        np.testing.assert_almost_equal(chi2.values[position], expected)
def test_chi_squared2():
    """Check ``relative_chi_squared`` against an expanding normalized reference.

    The pipeline first applies ``expand_norm_hist_mean_cov`` (``shift=1``) and
    then ``relative_chi_squared`` row-wise (``axis=1``); the resulting ``chi2``
    column is compared against fixed values for each feature.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=expand_norm_hist_mean_cov,
                    hist_name="histogram",
                    shift=1,
                    suffix="",
                    entire=True,
                )
            ],
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[dict(func=relative_chi_squared, suffix="", axis=1)],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["output_hist"]

    # (feature, row position, expected chi2)
    expectations = [
        ("A_score", -1, 9.891821919006366),
        ("A_score:num_employees", -2, 3.217532467532462),
        ("bankrupt", -1, 0.23767605633802757),
        ("country", -1, 1.3717532467532458),
        ("num_employees", -1, 1.1858766233766194),
    ]
    for feature, position, expected in expectations:
        chi2 = datastore["output_hist"][feature]["chi2"]
        # Rows are addressed by position. ``series[int]`` only means "position"
        # through a fallback that pandas has deprecated, so index the underlying
        # array explicitly instead.
        np.testing.assert_almost_equal(chi2.values[position], expected)
def test_self_reference():
    """Smoke test: the ``self_reference`` pipeline runs without raising."""
    features = ["date:A_score", "date:A_score:num_employees"]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    reference = self_reference(hists_key="hists", features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
def test_rolling_reference():
    """Smoke test: the ``rolling_reference`` pipeline (window of 5) runs without raising."""
    features = ["date:country", "date:A_score:num_employees"]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    reference = rolling_reference(hists_key="hists", window=5, features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
def test_expanding_reference():
    """Smoke test: the ``expanding_reference`` pipeline runs without raising."""
    features = ["date:bankrupt", "date:num_employees"]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    reference = expanding_reference(hists_key="hists", features=features)

    Pipeline(modules=[reader, reference]).transform(datastore={})
def test_external_reference():
    """Smoke test: ``external_reference`` runs without raising.

    The same datastore key (``hists``) is passed as both the histograms and the
    reference histograms.
    """
    features = ["date:country", "date:bankrupt"]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    reference = external_reference(
        hists_key="hists", ref_hists_key="hists", features=features
    )

    Pipeline(modules=[reader, reference]).transform(datastore={})
def test_hists_stability_report():
    """Smoke test: ``stability_report`` runs on the example histograms without raising."""
    # get histograms
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"), store_key="hists"
    )
    datastore = Pipeline(modules=[reader]).transform(datastore={})
    hists = datastore["hists"]

    # generate report
    # NOTE(review): "date:bankrupt" is listed twice -- confirm whether the
    # duplicate is intentional.
    features = [
        "date:bankrupt",
        "date:country",
        "date:bankrupt",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    stability_report(hists, reference_type="rolling", window=5, features=features)
def test_hist_compare():
    """Check ``hist_compare`` between each histogram and its expanding sum.

    ``expanding_hist`` first adds a ``histogram_sum`` column (``shift=1``);
    ``hist_compare`` then compares ``histogram`` with ``histogram_sum`` row-wise
    and stores the result under ``comparison``. The last ``chi2`` value of
    ``num_employees`` is checked.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            apply_funcs=[
                dict(
                    func=expanding_hist,
                    shift=1,
                    suffix="sum",
                    entire=True,
                    hist_name="histogram",
                )
            ],
        ),
        ApplyFunc(
            apply_to_key="output_hist",
            assign_to_key="comparison",
            apply_funcs=[
                dict(
                    func=hist_compare,
                    hist_name1="histogram",
                    hist_name2="histogram_sum",
                    suffix="",
                    axis=1,
                )
            ],
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    last_chi2 = datastore["comparison"]["num_employees"]["chi2"].values[-1]
    # chi2 is a computed float: compare with a tolerance rather than bit-exact
    # equality, which can break on harmless rounding differences.
    np.testing.assert_almost_equal(last_chi2, 0.7017543859649122)
def test_chi_RollingNormHistComparer():
    """Check the last ``chi2`` value produced by ``RollingNormHistComparer``.

    The comparer runs on the split histograms with ``window=10`` and stores its
    output under ``comparisons``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        RollingNormHistComparer(
            read_key="output_hist", store_key="comparisons", window=10
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "comparisons" in datastore
    for feature in [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]:
        assert feature in datastore["comparisons"]

    chi2 = datastore["comparisons"]["A_score"]["chi2"]
    # Take the last row by position. ``series[-1]`` only works through a
    # positional fallback that pandas has deprecated.
    np.testing.assert_almost_equal(chi2.values[-1], 45.200000)
def test_expanding_hist_comparer():
    """Check the output frames of ``ExpandingHistComparer``.

    For every feature the test verifies the frame length, the set of column
    names and the mean of the ``expanding_chi2`` column.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]
    cols = [
        "expanding_pearson",
        "expanding_chi2",
        "expanding_chi2_zscore",
        "expanding_chi2_norm",
        "expanding_chi2_pvalue",
        "expanding_chi2_max_residual",
        "expanding_chi2_spike_count",
        "expanding_ks",
        "expanding_ks_zscore",
        "expanding_ks_pvalue",
        "expanding_max_prob_diff",
        "expanding_unknown_labels",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ExpandingHistComparer(read_key="output_hist", store_key="comparison"),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "comparison" in datastore
    assert isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison.keys()) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of expanding_chi2)
    expectations = [
        ("A_score", 16, 2.8366236044275257),
        ("country", 17, 1.1224348056645368),
        ("bankrupt", 17, 0.6901425387043608),
        ("num_employees", 17, 4.243731870738727),
    ]
    for name, n_rows, chi2_mean in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), sorted(cols))
        np.testing.assert_almost_equal(frame["expanding_chi2"].mean(), chi2_mean)
def test_rolling_hist_comparer():
    """Check the output frames of ``RollingHistComparer`` (``window=5``).

    For every feature the test verifies the frame length, the set of column
    names and the mean of the ``roll_chi2`` column.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]
    cols = [
        "roll_pearson",
        "roll_chi2",
        "roll_chi2_zscore",
        "roll_chi2_norm",
        "roll_chi2_pvalue",
        "roll_chi2_max_residual",
        "roll_chi2_spike_count",
        "roll_ks",
        "roll_ks_zscore",
        "roll_ks_pvalue",
        "roll_max_prob_diff",
        "roll_unknown_labels",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        RollingHistComparer(read_key="output_hist", store_key="comparison", window=5),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "comparison" in datastore
    assert isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison.keys()) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of roll_chi2)
    expectations = [
        ("A_score", 16, 2.927272727272727),
        ("country", 17, 1.3022619047619046),
        ("bankrupt", 17, 0.7251681783824641),
        ("num_employees", 17, 4.0995701058201055),
    ]
    for name, n_rows, chi2_mean in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), sorted(cols))
        np.testing.assert_almost_equal(frame["roll_chi2"].mean(), chi2_mean)
def test_reference_hist_comparer():
    """Check the output frames of ``ReferenceHistComparer``.

    The split histograms (``output_hist``) are used both as reference and as the
    key being compared. For every feature the test verifies the frame length,
    the set of column names and the mean of the ``ref_chi2`` column.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]
    cols = [
        "ref_pearson",
        "ref_chi2",
        "ref_chi2_zscore",
        "ref_chi2_norm",
        "ref_chi2_pvalue",
        "ref_chi2_max_residual",
        "ref_chi2_spike_count",
        "ref_ks",
        "ref_ks_zscore",
        "ref_ks_pvalue",
        "ref_max_prob_diff",
        "ref_unknown_labels",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ReferenceHistComparer(
            reference_key="output_hist",
            assign_to_key="output_hist",
            store_key="comparison",
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "comparison" in datastore
    assert isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison.keys()) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # (feature, expected number of rows, expected mean of ref_chi2)
    expectations = [
        ("A_score", 16, 2.623206018518519),
        ("country", 17, 0.9804481792717087),
        ("bankrupt", 17, 0.6262951496388027),
        ("num_employees", 17, 4.213429217840983),
    ]
    for name, n_rows, chi2_mean in expectations:
        frame = comparison[name]
        assert len(frame) == n_rows
        np.testing.assert_array_equal(sorted(frame.columns), sorted(cols))
        np.testing.assert_almost_equal(frame["ref_chi2"].mean(), chi2_mean)
def test_report_generator():
    """Run a comparison pipeline through to ``ReportGenerator``.

    Verifies the per-feature comparison frames, that the ``SectionGenerator``
    kept ``last_n == 2``, and that a non-empty string report is stored under
    ``final_report``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    comparer = ReferenceHistComparer(
        reference_key="output_hist",
        assign_to_key="output_hist",
        store_key="comparison",
    )
    sections = SectionGenerator(
        read_key="comparison",
        store_key="all_sections",
        section_name="Comparisons",
        last_n=2,
    )
    report = ReportGenerator(read_key="all_sections", store_key="final_report")

    pipeline = Pipeline(modules=[reader, splitter, comparer, sections, report])
    datastore = pipeline.transform(datastore={})

    assert "comparison" in datastore
    assert isinstance(datastore["comparison"], dict)
    assert len(datastore["comparison"].keys()) == len(features)
    for name in features:
        assert name in datastore["comparison"]
    for name in features:
        assert isinstance(datastore["comparison"][name], pd.DataFrame)

    # modules[-2] is the SectionGenerator
    assert pipeline.modules[-2].last_n == 2

    assert "final_report" in datastore
    assert isinstance(datastore["final_report"], str)
    assert len(datastore["final_report"]) > 0
def test_hist_splitter():
    """``HistSplitter`` should yield one DataFrame per feature with similar histograms.

    The ``histogram`` entries of each feature's frame are passed to
    ``assert_similar_hists``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    features = [
        "country",
        "bankrupt",
        "num_employees",
        "A_score",
        "A_score:num_employees",
    ]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    assert isinstance(datastore["output_hist"], dict)
    assert len(datastore["output_hist"].keys()) == len(features)
    for name in features:
        assert name in datastore["output_hist"]
    for name in features:
        assert isinstance(datastore["output_hist"][name], pd.DataFrame)

    for name in features:
        records = datastore["output_hist"][name].reset_index().to_dict("records")
        hlist = [record["histogram"] for record in records]
        assert_similar_hists(hlist)
def test_hist_splitter_filter():
    """Test the ``filter_empty_split_hists`` option of ``HistSplitter``.

    One of the split histograms of type ``date:A_score:num_employees`` is empty
    and only contains a NaN. With ``filter_empty_split_hists=False`` those empty
    split-histograms are *not* removed, so the split histograms have
    inconsistent types and ``check_similar_hists`` must return ``False``.
    """
    hist_list = ["date:A_score:num_employees"]
    features = ["A_score:num_employees"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist",
            store_key="output_hist",
            features=hist_list,
            filter_empty_split_hists=False,
        ),
    ]
    datastore = Pipeline(modules=modules).transform(datastore={})

    assert "output_hist" in datastore
    assert isinstance(datastore["output_hist"], dict)
    assert len(datastore["output_hist"].keys()) == len(features)
    for name in features:
        assert name in datastore["output_hist"]
    for name in features:
        assert isinstance(datastore["output_hist"][name], pd.DataFrame)

    for name in features:
        records = datastore["output_hist"][name].reset_index().to_dict("records")
        hlist = [record["histogram"] for record in records]
        assert check_similar_hists(hlist) is False
def test_chi_ReferenceNormHistComparer():
    """Check the first ``chi2`` value produced by ``ReferenceNormHistComparer``.

    The split histograms (``output_hist``) serve as both the reference and the
    key being compared; results are stored under ``comparisons``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    comparer = ReferenceNormHistComparer(
        reference_key="output_hist",
        assign_to_key="output_hist",
        store_key="comparisons",
    )
    datastore = Pipeline(modules=[reader, splitter, comparer]).transform(datastore={})

    assert "comparisons" in datastore
    expected_features = [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]
    for name in expected_features:
        assert name in datastore["comparisons"]

    chi2 = datastore["comparisons"]["A_score"]["chi2"]
    np.testing.assert_almost_equal(chi2[0], 2.2884111855886022)
def test_report_generator():
    """Run a comparison pipeline through to ``ReportGenerator``.

    Verifies the per-feature comparison frames, that the ``SectionGenerator``
    kept ``last_n == 2``, and that a non-empty string report is stored under
    ``final_report``.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
    ]
    features = ["country", "bankrupt", "num_employees", "A_score"]

    modules = [
        JsonReader(
            file_path=resources.data("example_histogram.json"),
            store_key="example_hist",
        ),
        HistSplitter(
            read_key="example_hist", store_key="output_hist", features=hist_list
        ),
        ReferenceHistComparer(
            reference_key="output_hist",
            assign_to_key="output_hist",
            store_key="comparison",
        ),
        SectionGenerator(
            read_key="comparison",
            store_key="all_sections",
            section_name="Comparisons",
            last_n=2,
        ),
        ReportGenerator(read_key="all_sections", store_key="final_report"),
    ]
    pipeline = Pipeline(modules=modules)
    datastore = pipeline.transform(datastore={})

    assert "comparison" in datastore
    assert isinstance(datastore["comparison"], dict)
    comparison = datastore["comparison"]
    assert len(comparison.keys()) == len(features)
    for name in features:
        assert name in comparison
    for name in features:
        assert isinstance(comparison[name], pd.DataFrame)

    # modules[-2] is the SectionGenerator
    assert pipeline.modules[-2].last_n == 2

    assert "final_report" in datastore
    final_report = datastore["final_report"]
    assert isinstance(final_report, str)
    assert len(final_report) > 0
def test_expanding_hist():
    """Check the bin entries of the last expanding-sum histogram of ``num_employees``.

    ``expanding_hist`` is applied via ``ApplyFunc`` with suffix ``sum``; the test
    reads the last value of the resulting ``histogram_sum`` column.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    expander = ApplyFunc(
        apply_to_key="output_hist",
        apply_funcs=[
            dict(
                func=expanding_hist,
                shift=1,
                suffix="sum",
                entire=True,
                hist_name="histogram",
            )
        ],
    )
    datastore = Pipeline(modules=[reader, splitter, expander]).transform(datastore={})

    last_sum = datastore["output_hist"]["num_employees"]["histogram_sum"].values[-1]
    bin_entries = last_sum.bin_entries()

    # Expected entries, ten per row (113 values in total).
    expected = np.array(
        [
            11.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 1.0,
        ]
    )
    np.testing.assert_array_almost_equal(bin_entries, expected)
def test_expand_norm_hist_mean_cov():
    """Check the second-to-last ``histogram_mean`` of ``num_employees``.

    ``expand_norm_hist_mean_cov`` is applied via ``ApplyFunc`` with ``shift=1``
    and an empty suffix.
    """
    hist_list = [
        "date:country",
        "date:bankrupt",
        "date:num_employees",
        "date:A_score",
        "date:A_score:num_employees",
    ]
    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist", store_key="output_hist", features=hist_list
    )
    mean_cov = ApplyFunc(
        apply_to_key="output_hist",
        apply_funcs=[
            dict(
                func=expand_norm_hist_mean_cov,
                hist_name="histogram",
                shift=1,
                suffix="",
                entire=True,
            )
        ],
    )
    datastore = Pipeline(modules=[reader, splitter, mean_cov]).transform(datastore={})

    assert "output_hist" in datastore
    expected_features = [
        "A_score",
        "A_score:num_employees",
        "bankrupt",
        "country",
        "num_employees",
    ]
    for name in expected_features:
        assert name in datastore["output_hist"]

    mean = datastore["output_hist"]["num_employees"]["histogram_mean"].values[-2]

    # Expected mean, ten values per row (66 values in total).
    expected = np.array(
        [
            0.56666667, 0.03333333, 0.03333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.06666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06666667, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.06666667, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.03333333, 0.06666667, 0.06666667,
        ]
    )
    np.testing.assert_array_almost_equal(mean, expected)