def test_assert_similar_hists():
    """Test that assert_similar_hists rejects lists of dissimilar histograms.

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Builds 1d-, 2d- and 3d-histograms whose axes are nested in opposite
    orders, wraps each one in a HistogramContainer and verifies that:

    - every container is similar to itself (check_similar_hists);
    - assert_similar_hists raises an AssertionError carrying the message
      'Input histograms are not all similar.' for each mismatched pair.
    """
    # NOTE(review): another function named test_assert_similar_hists is
    # defined later in this file; the later definition replaces this one at
    # import time, so this variant is presumably never collected by pytest.
    # Confirm which variant should be kept.

    # Dummy dataset with mixed types. It is built explicitly (same content as
    # the former pd.util.testing.makeMixedDataFrame()) because the
    # pandas.util.testing module is deprecated and is no longer available in
    # recent pandas releases.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    origin = pd.Timestamp("2009-01-01").value
    one_day = pd.Timedelta(days=1).value
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)
    hist4 = hg.SparselyBin(
        origin=origin, binWidth=one_day, quantity=unit("date"), value=hist2
    )
    hist5 = hg.SparselyBin(
        origin=origin, binWidth=one_day, quantity=unit("date"), value=hist3
    )
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]

    # fill them
    for hist in hists:
        hist.fill.numpy(df)

    containers = [HistogramContainer(hist) for hist in hists]
    hc0, hc1, hc2, hc3, hc4, hc5 = containers

    # a histogram is always similar to itself
    for hc in containers:
        assert check_similar_hists([hc, hc])

    expected_message = "Input histograms are not all similar."
    for left, right in [(hc0, hc1), (hc2, hc3), (hc4, hc5)]:
        # Sentinel: stays as-is when no exception is raised, which makes the
        # message comparison below fail.
        raised_args = [""]
        try:
            assert_similar_hists([left, right])
        except AssertionError as err:
            raised_args = err.args
        assert raised_args[0] == expected_message
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms.

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Builds 1d-, 2d- and 3d-histograms whose axes are nested in opposite
    orders and verifies that:

    - every histogram is similar to itself (check_similar_hists);
    - assert_similar_hists raises a ValueError carrying the message
      'Input histograms are not all similar.' for each mismatched pair.
    """
    # NOTE(review): an earlier function with this same name exists in this
    # file; this definition replaces it at import time. Confirm the earlier
    # variant can be removed.

    # Dummy dataset with mixed types. It is built explicitly (same content as
    # the former pd.util.testing.makeMixedDataFrame()) because the
    # pandas.util.testing module is deprecated and is no longer available in
    # recent pandas releases.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    origin = pd.Timestamp("2009-01-01").value
    one_day = pd.Timedelta(days=1).value
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)
    hist4 = hg.SparselyBin(
        origin=origin,
        binWidth=one_day,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=origin,
        binWidth=one_day,
        quantity=unit("date"),
        value=hist3,
    )
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]

    # fill them
    for hist in hists:
        hist.fill.numpy(df)

    # a histogram is always similar to itself
    for hist in hists:
        assert check_similar_hists([hist, hist])

    expected_message = "Input histograms are not all similar."
    for left, right in [(hist0, hist1), (hist2, hist3), (hist4, hist5)]:
        # Sentinel: stays as-is when no exception is raised, which makes the
        # message comparison below fail.
        raised_args = [""]
        try:
            assert_similar_hists([left, right])
        except ValueError as err:
            raised_args = err.args
        assert raised_args[0] == expected_message
def test_check_similar_hists():
    """Test similarity of list of histograms.

    Check similarity of: type, n-dim, sub-hists, specific type attributes.

    Builds 1d-, 2d- and 3d-histograms whose axes are nested in opposite
    orders, wraps each one in a HistogramContainer and verifies that
    check_similar_hists is truthy for a container paired with itself and
    falsy for each mismatched pair.
    """
    # Dummy dataset with mixed types. It is built explicitly (same content as
    # the former pd.util.testing.makeMixedDataFrame()) because the
    # pandas.util.testing module is deprecated and is no longer available in
    # recent pandas releases.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    origin = pd.Timestamp("2009-01-01").value
    one_day = pd.Timedelta(days=1).value
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)
    hist4 = hg.SparselyBin(
        origin=origin,
        binWidth=one_day,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=origin,
        binWidth=one_day,
        quantity=unit("date"),
        value=hist3,
    )
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]

    # fill them
    for hist in hists:
        hist.fill.numpy(df)

    containers = [HistogramContainer(hist) for hist in hists]
    hc0, hc1, hc2, hc3, hc4, hc5 = containers

    # a histogram is always similar to itself
    for hc in containers:
        assert check_similar_hists([hc, hc])

    # same dimensionality, but different types / nesting order
    for left, right in [(hc0, hc1), (hc2, hc3), (hc4, hc5)]:
        assert not check_similar_hists([left, right])
def test_hist_splitter_filter():
    """Exercise HistSplitter with filter_empty_split_hists switched off.

    One of the split histograms of type date:A_score:num_employees is empty
    and only contains a NaN. Here those empty split-histograms are *not*
    removed, so the resulting split-histograms have inconsistent types and
    check_similar_hists must return False for them.
    """
    hist_list = ["date:A_score:num_employees"]
    features = ["A_score:num_employees"]

    reader = JsonReader(
        file_path=resources.data("example_histogram.json"),
        store_key="example_hist",
    )
    splitter = HistSplitter(
        read_key="example_hist",
        store_key="output_hist",
        features=hist_list,
        filter_empty_split_hists=False,
    )
    datastore = Pipeline(modules=[reader, splitter]).transform(datastore={})

    # the splitter output must be a dict with exactly one entry per feature
    assert "output_hist" in datastore and isinstance(datastore["output_hist"], dict)
    output = datastore["output_hist"]
    assert len(output) == len(features)

    for feature in features:
        assert feature in output
        frame = output[feature]
        assert isinstance(frame, pd.DataFrame)

        # collect the split histograms, one per row of the frame
        records = frame.reset_index().to_dict("records")
        histograms = [record["histogram"] for record in records]
        assert check_similar_hists(histograms) is False