Пример #1
0
    def test_add_raised_suppressed_error(self):
        """Check that a suppressed error stays out of the unsuppressed error list."""
        suppressed = [self.ERROR_1]
        report = ValidationReport(suppressed)

        # Raise the same error that was registered as suppressed.
        report.add_raised_error(self.ERROR_1)

        remaining = report.unsuppressed_errors
        assert len(remaining) == 0
        assert report.num_suppressed == 1
Пример #2
0
 def test_add_raised_unsuppressed_error(self):
     """Check that raised errors not being suppressed appear in the unsuppressed list."""
     # The single suppression entry; the assertion below expects it to match
     # neither of the errors raised.
     suppressed_entry = ValidationFailure(
         "good",
         filename="20201107_county_sig2.csv",
         message="msg 2",
     )
     report = ValidationReport([suppressed_entry])

     for error in (self.ERROR_1, self.ERROR_2):
         report.add_raised_error(error)

     assert report.unsuppressed_errors == [self.ERROR_1, self.ERROR_2]
Пример #3
0
 def test_str(self):
     """Populate a report with checks, warnings and errors for the string-representation test.

     NOTE(review): no assertion on the report's string form is visible in this
     snippet -- confirm against the full test.
     """
     report = ValidationReport([self.ERROR_1])

     # Record three checks.
     for _ in range(3):
         report.increment_total_checks()

     for text in ("wrong import", "right import"):
         report.add_raised_warning(ImportWarning(text))

     for error in (self.ERROR_1, self.ERROR_2):
         report.add_raised_error(error)
    def test_no_outlier(self):
        """Check that test values continuing the reference level raise no spike error."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        # Data from 51580 between 9/24 and 10/26 (10/25 query date)
        ref_val = [30, 30.28571429, 30.57142857, 30.85714286, 31.14285714,
                   31.42857143, 31.71428571, 32, 32, 32.14285714,
                   32.28571429, 32.42857143, 32.57142857, 32.71428571,
                   32.85714286, 33, 33, 33, 33, 33, 33, 33, 33,
                   33, 33, 33, 33, 33, 33, 33]
        test_val = [33, 33, 33]

        def build_frame(values, geo_id, start, end):
            """Return one geo's frame with NaN se/sample_size over a daily date range."""
            count = len(values)
            return pd.DataFrame({
                "val": values,
                "se": [np.nan] * count,
                "sample_size": [np.nan] * count,
                "geo_id": [geo_id] * count,
                "time_value": pd.date_range(start=start, end=end),
            })

        # The same series is used for geo "1" and geo "2".
        geo_ids = ("1", "2")
        ref_frames = [build_frame(ref_val, geo, "2020-09-24", "2020-10-23")
                      for geo in geo_ids]
        test_frames = [build_frame(test_val, geo, "2020-10-24", "2020-10-26")
                       for geo in geo_ids]

        ref_df = pd.concat(ref_frames).reset_index(drop=True)
        test_df = pd.concat(test_frames).reset_index(drop=True)

        validator.check_positive_negative_spikes(
            test_df, ref_df, "state", "signal", report)

        assert len(report.raised_errors) == 0
Пример #5
0
 def test_more_than_two_copies(self):
     """Check that a row present three times yields one check_duplicate_rows warning."""
     validator = StaticValidator(self.params)
     report = ValidationReport([])

     rows = [
         ["a", "1"],
         ["b", "2"],
         ["b", "2"],
         ["b", "2"],
     ]
     validator.check_duplicate_rows(pd.DataFrame(rows), FILENAME, report)

     raised = report.raised_warnings
     assert len(raised) == 1
     assert raised[0].check_name == "check_duplicate_rows"
Пример #6
0
    def test_1000x_val(self):
        """Check that one test value 1000 times its reference value raises an average-change error."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        def frame_with(values):
            """Return a six-row frame for geo "1" with NaN se and sample_size."""
            return pd.DataFrame({
                "val": values,
                "se": [np.nan] * 6,
                "sample_size": [np.nan] * 6,
                "geo_id": ["1"] * 6,
            })

        # Only the fourth value differs: 2000 in the test data vs 2 in the reference.
        test_df = frame_with([1, 1, 1, 2000, 0, 1])
        ref_df = frame_with([1, 1, 1, 2, 0, 1])

        midnight_today = datetime.combine(date.today(), datetime.min.time())
        validator.check_avg_val_vs_reference(
            test_df, ref_df, midnight_today, "geo", "signal", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_test_vs_reference_avg_changed"
Пример #7
0
    def test_empty_df(self):
        """Check that an empty geo_id frame raises no error in the county format check."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        no_rows = pd.DataFrame(columns=["geo_id"], dtype=str)
        validator.check_bad_geo_id_format(no_rows, FILENAME, "county", report)

        raised = report.raised_errors
        assert len(raised) == 0
Пример #8
0
def mock_validator_fn():
    """Yield a mock validator factory.

    Calling the yielded mock returns a mock validator whose ``validate()``
    returns an empty ``ValidationReport``.
    NOTE(review): presumably used as a pytest fixture; the decorator is not
    visible in this snippet.
    """
    fake_validator = mock.Mock()
    fake_validator.validate.return_value = ValidationReport([])
    yield mock.Mock(return_value=fake_validator)
Пример #9
0
    def test_gt_max_prop(self):
        """Check that a "prop" value of 1e7 raises check_val_prop_gt_100k."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        oversized = pd.DataFrame([1e7], columns=["val"])
        validator.check_bad_val(oversized, FILENAME, "prop", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_val_prop_gt_100k"
Пример #10
0
    def test_lt_0(self):
        """Check that a negative value raises check_val_lt_0."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        negative = pd.DataFrame([-5], columns=["val"])
        validator.check_bad_val(negative, FILENAME, "signal", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_val_lt_0"
Пример #11
0
    def test_invalid_geo_id_value_nation(self):
        """Check that geo ids "us" and "zz" at nation level raise one check_bad_geo_id_value error."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        geo_ids = ["us", "zz"]
        frame = pd.DataFrame(geo_ids, columns=["geo_id"])
        validator.check_bad_geo_id_value(frame, FILENAME, "nation", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_bad_geo_id_value"
Пример #12
0
    def test_invalid_geo_id_format_hhs(self):
        """Check that hhs geo ids "1" and "112" raise one check_geo_id_format error."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        geo_ids = ["1", "112"]
        frame = pd.DataFrame(geo_ids, columns=["geo_id"])
        validator.check_bad_geo_id_format(frame, FILENAME, "hhs", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_geo_id_format"
Пример #13
0
    def test_invalid_geo_type(self):
        """Check that the geo type "hello" raises check_geo_type, even for an empty frame."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        no_rows = pd.DataFrame(columns=["geo_id"], dtype=str)
        validator.check_bad_geo_id_format(no_rows, FILENAME, "hello", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_geo_type"
Пример #14
0
    def test_empty_filelist(self):
        """Check that an empty file list raises one check_missing_date_files error.

        The validator is configured with a span_length of 8 and an end_date of
        2020-09-09, and is given no files.
        """
        params = {
            "common": {
                "data_source": "",
                "span_length": 8,
                "end_date": "2020-09-09"
            }
        }
        validator = StaticValidator(params)
        # A single report is enough; the original built an identical one twice.
        report = ValidationReport([])

        filenames = []
        validator.check_missing_date_files(filenames, report)

        assert len(report.raised_errors) == 1
        assert report.raised_errors[0].check_name == "check_missing_date_files"
Пример #15
0
    def test_empty_df(self):
        """Check that an empty val frame raises no error for the "", "prop" and "pct" signal types."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        no_rows = pd.DataFrame(columns=["val"])
        # All three calls share the same frame and report.
        for signal_type in ("", "prop", "pct"):
            validator.check_bad_val(no_rows, "", signal_type, report)

        assert len(report.raised_errors) == 0
Пример #16
0
    def test_same_df(self):
        """Check that identical test and reference frames raise no row-count error.

        ``date.today()`` is read once so that both frames and the date argument
        agree even if the test runs across midnight.
        """
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        today = date.today()
        test_df = pd.DataFrame([today] * 5, columns=["time_value"])
        ref_df = pd.DataFrame([today] * 5, columns=["time_value"])
        validator.check_rapid_change_num_rows(test_df, ref_df, today,
                                              "geo", "signal", report)

        assert len(report.raised_errors) == 0
Пример #17
0
    def test_uppercase_geo_id(self):
        """Check that an uppercase state id yields one check_geo_id_lowercase warning and no errors."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        geo_ids = ["ak", "AK"]
        frame = pd.DataFrame(geo_ids, columns=["geo_id"])
        validator.check_bad_geo_id_value(frame, FILENAME, "state", report)

        assert len(report.raised_errors) == 0
        raised_warnings = report.raised_warnings
        assert len(raised_warnings) == 1
        assert raised_warnings[0].check_name == "check_geo_id_lowercase"
Пример #18
0
    def test_invalid_geo_id_format_state(self):
        """Check that a mix of state geo ids raises a single check_geo_id_format error."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        geo_ids = ["aa", "hi", "HI", "hawaii", "Hawaii", "a", "H.I."]
        frame = pd.DataFrame(geo_ids, columns=["geo_id"])
        validator.check_bad_geo_id_format(frame, FILENAME, "state", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_geo_id_format"
Пример #19
0
    def test_lt_min_missing_not_allowed(self):
        """Check that the sample-size check raises one check_n_gt_min error when missing sample sizes are disallowed."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])
        validator.params.missing_sample_size_allowed = False

        rows = [
            [1, 0, 10],
            [1, np.nan, 240],
            [1, np.nan, 245],
        ]
        frame = pd.DataFrame(rows, columns=["val", "se", "sample_size"])
        validator.check_bad_sample_size(frame, FILENAME, report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_n_gt_min"
Пример #20
0
    def test_neg_outlier(self):
        """Check that test values of 10 after a reference series near 100 raise a spike error."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        ref_val = [
            100, 101, 100, 101, 100, 100, 100, 100, 100, 100, 100, 102, 100,
            100, 100, 100, 100, 101, 100, 100, 100, 100, 100, 99, 100, 100, 98,
            100, 100, 100
        ]
        test_val = [10, 10, 10]

        def build_frame(values, geo_id, start, end):
            """Return one geo's frame with NaN se/sample_size over a daily date range."""
            count = len(values)
            return pd.DataFrame({
                "val": values,
                "se": [np.nan] * count,
                "sample_size": [np.nan] * count,
                "geo_id": [geo_id] * count,
                "time_value": pd.date_range(start=start, end=end),
            })

        # The same series is used for geo "1" and geo "2".
        geo_ids = ("1", "2")
        ref_frames = [build_frame(ref_val, geo, "2020-09-24", "2020-10-23")
                      for geo in geo_ids]
        test_frames = [build_frame(test_val, geo, "2020-10-24", "2020-10-26")
                       for geo in geo_ids]

        ref_df = pd.concat(ref_frames).reset_index(drop=True)
        test_df = pd.concat(test_frames).reset_index(drop=True)

        validator.check_positive_negative_spikes(test_df, ref_df, "state",
                                                 "signal", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_positive_negative_spikes"
Пример #21
0
    def test_0_vs_many(self):
        """Check that five test rows against one reference row raise check_rapid_change_num_rows."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        # Today's date at midnight, shared by every row and by the call below.
        time_value = datetime.combine(date.today(), datetime.min.time())

        many_rows = [time_value] * 5
        single_row = [time_value] * 1
        test_df = pd.DataFrame(many_rows, columns=["time_value"])
        ref_df = pd.DataFrame(single_row, columns=["time_value"])

        validator.check_rapid_change_num_rows(
            test_df, ref_df, time_value, "geo", "signal", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_rapid_change_num_rows"
Пример #22
0
    def test_e_0_missing_not_allowed(self):
        """Check the se errors raised for zero and missing se when missing se is disallowed.

        Expects exactly two errors, including check_se_not_missing_and_in_range
        and check_se_0.
        """
        validator = StaticValidator(self.params)
        report = ValidationReport([])
        validator.params.missing_se_allowed = False

        rows = [
            [1, 0, 200],
            [1, 0, np.nan],
            [1, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["val", "se", "sample_size"])
        validator.check_bad_se(frame, FILENAME, report)

        assert len(report.raised_errors) == 2
        raised_names = [err.check_name for err in report.raised_errors]
        assert "check_se_not_missing_and_in_range" in raised_names
        assert "check_se_0" in raised_names
Пример #23
0
    def test_state_level_fips(self):
        """Check which ``XX000`` codes the county geo id value check accepts.

        The first list of ids raises nothing; adding "99000" raises one
        check_bad_geo_id_value error on the same report.
        """
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        accepted_ids = ["37183", "56000", "04000", "60000", "78000"]
        frame = pd.DataFrame(accepted_ids, columns=["geo_id"])
        validator.check_bad_geo_id_value(frame, FILENAME, "county", report)

        assert len(report.raised_errors) == 0

        # Same ids plus "99000", checked against the same report.
        frame = pd.DataFrame(accepted_ids + ["99000"], columns=["geo_id"])
        validator.check_bad_geo_id_value(frame, FILENAME, "county", report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_bad_geo_id_value"
Пример #24
0
    def test_empty_df(self):
        """Check that an empty frame raises no sample-size error, before and after allowing missing sample sizes."""
        validator = StaticValidator(self.params)
        report = ValidationReport([])

        columns = ["val", "se", "sample_size"]
        no_rows = pd.DataFrame(columns=columns, dtype=float)

        # First pass uses the validator's parameters as constructed.
        validator.check_bad_sample_size(no_rows, "", report)
        assert len(report.raised_errors) == 0

        # Second pass explicitly allows missing sample sizes.
        validator.params.missing_sample_size_allowed = True
        validator.check_bad_sample_size(no_rows, "", report)
        assert len(report.raised_errors) == 0
Пример #25
0
    def test_no_padding(self):
        """Check that padding leaves the reference frame unchanged when test and reference data are identical."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        data = {
            "val": [1, 1, 1, 2, 0, 1],
            "se": [np.nan] * 6,
            "sample_size": [np.nan] * 6,
            "geo_id": ["1"] * 6,
            "time_value": pd.date_range(start="2021-01-01", end="2021-01-06"),
        }

        # Both frames are built from the same data.
        test_df = pd.DataFrame(data)
        ref_df = pd.DataFrame(data)

        date_arg = datetime.strptime("2021-01-06", "%Y-%m-%d").date()
        new_ref_df = validator.pad_reference_api_df(ref_df, test_df, date_arg)

        assert new_ref_df.equals(ref_df)
Пример #26
0
    def test_same_day(self):
        """Check that one file dated on the end date raises no error when span_length is 0."""
        common = {
            "data_source": "",
            "span_length": 0,
            "end_date": "2020-09-01",
        }
        validator = StaticValidator({"common": common})
        report = ValidationReport([])

        only_file = ("20200901_county_signal_signal.csv", "match_obj")
        validator.check_missing_date_files([only_file], report)

        assert len(report.raised_errors) == 0
Пример #27
0
    def test_same_val_se_n(self):
        """Check that identical val, se and sample_size in test and reference data raise no error."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        week_pattern = [1, 1, 1, 2, 0, 1, 1]
        data = {
            "val": week_pattern * 2,
            "se": week_pattern * 2,
            "sample_size": week_pattern * 2,
            "geo_id": ["1"] * 14,
            # Date strings from "2021-01-01" through "2021-01-14".
            "time_value": [f"2021-01-{day:02d}" for day in range(1, 15)],
        }

        # Both frames are built from the same data.
        test_df = pd.DataFrame(data)
        ref_df = pd.DataFrame(data)

        validator.check_avg_val_vs_reference(
            test_df, ref_df, date.today(), "geo", "signal", report)

        assert len(report.raised_errors) == 0
Пример #28
0
    def test_same_n(self):
        """Check that identical sample sizes, with val and se all NaN, raise no error."""
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        data = {
            "val": [np.nan] * 6,
            "se": [np.nan] * 6,
            "sample_size": [1, 1, 1, 2, 0, 1],
            "geo_id": ["1"] * 6,
        }

        # Both frames are built from the same data.
        test_df = pd.DataFrame(data)
        ref_df = pd.DataFrame(data)

        today = date.today()
        validator.check_avg_val_vs_reference(
            test_df, ref_df, today, "geo", "signal", report)

        assert len(report.raised_errors) == 0
Пример #29
0
    def test_duplicate_dates(self):
        """Check the missing-date error when the file list repeats a date.

        The list holds 2020-09-01 once and 2020-09-03 three times; one
        check_missing_date_files error is expected.
        """
        common = {
            "data_source": "",
            "span_length": 1,
            "end_date": "2020-09-02",
        }
        validator = StaticValidator({"common": common})
        report = ValidationReport([])

        names = [
            "20200901_county_signal_signal.csv",
            "20200903_county_signal_signal.csv",
            "20200903_usa_signal_signal.csv",
            "20200903_usa_signal_signal.csv",
        ]
        filenames = [(name, "match_obj") for name in names]
        validator.check_missing_date_files(filenames, report)

        raised = report.raised_errors
        assert len(raised) == 1
        assert raised[0].check_name == "check_missing_date_files"
Пример #30
0
    def test_half_padding(self):
        """Check padding when the test data overlaps the reference data by one day.

        Reference data covers 2021-01-01..06 (val 2) and test data covers
        2021-01-06..11 (val 1).
        """
        validator = DynamicValidator(self.params)
        report = ValidationReport([])

        def build_frame(value, start, end):
            """Return a six-row frame for geo "1" with a constant val and NaN se/sample_size."""
            return pd.DataFrame({
                "val": [value] * 6,
                "se": [np.nan] * 6,
                "sample_size": [np.nan] * 6,
                "geo_id": ["1"] * 6,
                "time_value": pd.date_range(start=start, end=end),
            })

        ref_df = build_frame(2, "2021-01-01", "2021-01-06")
        test_df = build_frame(1, "2021-01-06", "2021-01-11")

        date_arg = datetime.strptime("2021-01-15", "%Y-%m-%d").date()
        new_ref_df = validator.pad_reference_api_df(ref_df, test_df, date_arg)

        # Check it only takes missing dates - so the last 5 dates
        expected_last = datetime.strptime("2021-01-11", "%Y-%m-%d").date()
        assert new_ref_df["time_value"].max() == expected_last
        assert new_ref_df.shape[0] == 11
        assert new_ref_df["val"].iloc[5] == 2