def test_duplicate_dates(self):
    """Repeated filenames still yield a single missing-date error.

    The file list names 2020-09-01 once and 2020-09-03 three times (one
    name is listed twice).  With ``span_length`` 1 and ``end_date``
    2020-09-02, exactly one error is expected and the first entry of its
    expression must be the date 2020-09-02.
    """
    check_name = "check_missing_date_files"
    config = {
        "data_source": "",
        "span_length": 1,
        "end_date": "2020-09-02",
        "expected_lag": {},
    }
    validator = StaticValidator(config)
    report = ValidationReport([])

    daily_files = [
        ("20200901_county_signal_signal.csv", "match_obj"),
        ("20200903_county_signal_signal.csv", "match_obj"),
        ("20200903_usa_signal_signal.csv", "match_obj"),
        ("20200903_usa_signal_signal.csv", "match_obj"),
    ]
    validator.check_missing_date_files(daily_files, report)

    errors = report.raised_errors
    assert len(errors) == 1
    assert check_name in [err.check_data_id[0] for err in errors]

    # First expression entry of every error raised by this check.
    missing_dates = [
        err.expression[0]
        for err in errors
        if err.check_data_id[0] == check_name
    ]
    assert len(missing_dates) == 1
    assert missing_dates[0] == datetime.strptime("20200902", "%Y%m%d").date()
def test_more_than_two_copies(self):
    """Three identical rows give one warning with expression ``[2, 3]``."""
    rows = [["a", "1"], ["b", "2"], ["b", "2"], ["b", "2"]]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    validator.check_duplicate_rows(pd.DataFrame(rows), "file", report)

    raised = report.raised_warnings
    assert len(raised) == 1
    assert raised[0].expression == [2, 3]
def test_empty_df(self):
    """An empty ``geo_id`` frame raises no errors in the format check."""
    # NOTE(review): other tests named `test_empty_df` exist in this file;
    # confirm they live in different classes so none is shadowed.
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    no_rows = pd.DataFrame(columns=["geo_id"], dtype=str)
    validator.check_bad_geo_id_format(no_rows, "name", "county", report)

    error_count = len(report.raised_errors)
    assert error_count == 0
def test_non_consecutive_duplicates(self):
    """Identical rows separated by another row still raise one warning.

    Rows 0 and 2 are equal; the single warning is expected to carry the
    expression ``[2]`` and ``"file"`` as its second ``check_data_id`` item.
    """
    rows = [["a", "1"], ["b", "2"], ["a", "1"]]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    validator.check_duplicate_rows(pd.DataFrame(rows), "file", report)

    raised = report.raised_warnings
    assert len(raised) == 1

    only_warning = raised[0]
    assert only_warning.expression == [2]
    assert only_warning.check_data_id[1] == "file"
def test_lt_0(self):
    """A ``val`` of -5 raises exactly one ``check_val_lt_0`` error."""
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    negative_vals = pd.DataFrame([-5], columns=["val"])
    validator.check_bad_val(negative_vals, "name", "signal", report)

    raised = report.raised_errors
    assert len(raised) == 1
    assert "check_val_lt_0" in raised[0].check_data_id
def test_gt_max_pct(self):
    """A ``val`` of 1e7 on a "pct" signal raises ``check_val_pct_gt_100``."""
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    oversized_vals = pd.DataFrame([1e7], columns=["val"])
    validator.check_bad_val(oversized_vals, "name", "pct", report)

    raised = report.raised_errors
    assert len(raised) == 1
    assert "check_val_pct_gt_100" in raised[0].check_data_id
def test_uppercase_geo_id(self):
    """An uppercase state id produces a warning rather than an error.

    For the ids "ak" and "AK" the value check is expected to raise no
    errors and one ``check_geo_id_lowercase`` warning that mentions "AK".
    """
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    state_ids = pd.DataFrame(["ak", "AK"], columns=["geo_id"])
    validator.check_bad_geo_id_value(state_ids, "name", "state", report)

    assert len(report.raised_errors) == 0
    assert len(report.raised_warnings) == 1

    case_warning = report.raised_warnings[0]
    assert "check_geo_id_lowercase" in case_warning.check_data_id
    assert "AK" in case_warning.expression
def test_invalid_geo_id_msa(self):
    """The msa format check reports one error with two flagged ids.

    Of the five ids supplied, "54321" must not appear among the flagged
    entries.
    """
    # NOTE(review): another `test_invalid_geo_id_msa` exists in this file;
    # confirm the two live in different classes so neither is shadowed.
    candidate_ids = ["0", "54321", "123", ".0000", "abc12"]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    frame = pd.DataFrame(candidate_ids, columns=["geo_id"])
    validator.check_bad_geo_id_format(frame, "name", "msa", report)

    raised = report.raised_errors
    assert len(raised) == 1

    format_error = raised[0]
    assert "check_geo_id_format" in format_error.check_data_id
    assert len(format_error.expression) == 2
    assert "54321" not in format_error.expression
def test_lt_min_missing_not_allowed(self):
    """With missing sample sizes disallowed, one ``check_n_gt_min`` error.

    The frame holds sample sizes 10, 240 and 245; exactly one error is
    expected and ``check_n_gt_min`` must be among the raised check ids.
    """
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    validator.params.missing_sample_size_allowed = False

    records = [[1, 0, 10], [1, np.nan, 240], [1, np.nan, 245]]
    frame = pd.DataFrame(records, columns=["val", "se", "sample_size"])
    validator.check_bad_sample_size(frame, "name", report)

    raised = report.raised_errors
    assert len(raised) == 1

    raised_ids = [err.check_data_id[0] for err in raised]
    assert "check_n_gt_min" in raised_ids
def test_invalid_geo_id_national(self):
    """For the national geo type, "zz" is flagged and "us" is not."""
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    national_ids = pd.DataFrame(["us", "zz"], columns=["geo_id"])
    validator.check_bad_geo_id_value(national_ids, "name", "national", report)

    raised = report.raised_errors
    assert len(raised) == 1

    value_error = raised[0]
    assert "check_bad_geo_id_value" in value_error.check_data_id
    assert len(value_error.expression) == 1
    assert "us" not in value_error.expression
    assert "zz" in value_error.expression
def test_invalid_geo_id_hrr(self):
    """The hrr format check reports one error with five flagged ids.

    Of the eight ids supplied, "1", "12" and "123" must not appear among
    the flagged entries.
    """
    candidate_ids = ["1", "12", "123", "1234", "12345", "a", ".", "ab1"]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    frame = pd.DataFrame(candidate_ids, columns=["geo_id"])
    validator.check_bad_geo_id_format(frame, "name", "hrr", report)

    raised = report.raised_errors
    assert len(raised) == 1

    format_error = raised[0]
    assert "check_geo_id_format" in format_error.check_data_id
    assert len(format_error.expression) == 5
    assert "1" not in format_error.expression
    assert "12" not in format_error.expression
    assert "123" not in format_error.expression
def test_invalid_geo_id_state(self):
    """The state format check reports one error with four flagged ids.

    Of the seven ids supplied, "aa", "hi" and "HI" must not appear among
    the flagged entries.
    """
    candidate_ids = ["aa", "hi", "HI", "hawaii", "Hawaii", "a", "H.I."]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    frame = pd.DataFrame(candidate_ids, columns=["geo_id"])
    validator.check_bad_geo_id_format(frame, "name", "state", report)

    raised = report.raised_errors
    assert len(raised) == 1

    format_error = raised[0]
    assert "check_geo_id_format" in format_error.check_data_id
    assert len(format_error.expression) == 4
    assert "aa" not in format_error.expression
    assert "hi" not in format_error.expression
    assert "HI" not in format_error.expression
def test_invalid_geo_id_msa(self):
    """The msa value check flags "88888" and "99999" but not "10180"."""
    # NOTE(review): another `test_invalid_geo_id_msa` exists in this file;
    # confirm the two live in different classes so neither is shadowed.
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    msa_ids = pd.DataFrame(["10180", "88888", "99999"], columns=["geo_id"])
    validator.check_bad_geo_id_value(msa_ids, "name", "msa", report)

    raised = report.raised_errors
    assert len(raised) == 1

    value_error = raised[0]
    assert "check_bad_geo_id_value" in value_error.check_data_id
    assert len(value_error.expression) == 2
    assert "10180" not in value_error.expression
    assert "88888" in value_error.expression
    assert "99999" in value_error.expression
def test_invalid_geo_type(self):
    """The geo type "hello" raises one ``check_geo_type`` error.

    The expression of that error is expected to equal "hello".
    """
    check_name = "check_geo_type"
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    no_rows = pd.DataFrame(columns=["geo_id"], dtype=str)
    validator.check_bad_geo_id_format(no_rows, "name", "hello", report)

    raised = report.raised_errors
    assert len(raised) == 1
    assert check_name in [err.check_data_id[0] for err in raised]

    geo_type_expressions = [
        err.expression
        for err in raised
        if err.check_data_id[0] == check_name
    ]
    assert geo_type_expressions[0] == "hello"
def test_jeffreys(self):
    """With missing se disallowed, the frame raises two distinct errors.

    Both ``check_se_not_missing_and_in_range`` and
    ``check_se_0_when_val_0`` must be among the raised check ids.
    """
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    validator.params.missing_se_allowed = False

    records = [[0, 0, 200], [1, 0, np.nan], [1, np.nan, np.nan]]
    frame = pd.DataFrame(records, columns=["val", "se", "sample_size"])
    validator.check_bad_se(frame, "name", report)

    raised = report.raised_errors
    assert len(raised) == 2

    raised_ids = [err.check_data_id[0] for err in raised]
    assert "check_se_not_missing_and_in_range" in raised_ids
    assert "check_se_0_when_val_0" in raised_ids
def test_empty_df(self):
    """An empty ``val`` frame raises no errors for any signal type tried.

    The check is run with the signal types "", "prop" and "pct" against
    the same report, which must stay free of errors.
    """
    # NOTE(review): other tests named `test_empty_df` exist in this file;
    # confirm they live in different classes so none is shadowed.
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    no_rows = pd.DataFrame(columns=["val"])

    for signal_type in ("", "prop", "pct"):
        validator.check_bad_val(no_rows, "", signal_type, report)

    assert len(report.raised_errors) == 0
def test_same_day(self):
    """A zero-length span whose single day has a file raises no errors.

    ``span_length`` is 0, ``end_date`` is 2020-09-01, and the only file
    supplied is dated 20200901.
    """
    config = {
        "data_source": "",
        "span_length": 0,
        "end_date": "2020-09-01",
        "expected_lag": {},
    }
    validator = StaticValidator(config)
    report = ValidationReport([])

    daily_files = [("20200901_county_signal_signal.csv", "match_obj")]
    validator.check_missing_date_files(daily_files, report)

    raised = report.raised_errors
    assert len(raised) == 0

    raised_ids = [err.check_data_id[0] for err in raised]
    assert "check_missing_date_files" not in raised_ids
def test_empty_filelist(self):
    """An empty file list raises one missing-date error with nine entries.

    ``span_length`` is 8 and ``end_date`` is 2020-09-09; with no files
    supplied, a single ``check_missing_date_files`` error is expected and
    its expression must contain nine items (presumably one per day of the
    span including the end date -- confirm against the validator).
    """
    params = {
        "data_source": "",
        "span_length": 8,
        "end_date": "2020-09-09",
        "expected_lag": {},
    }
    validator = StaticValidator(params)
    # The report used to be instantiated twice in a row; the first instance
    # was immediately discarded, so a single construction is sufficient.
    report = ValidationReport([])

    filenames = []
    validator.check_missing_date_files(filenames, report)

    assert len(report.raised_errors) == 1
    assert "check_missing_date_files" in [
        err.check_data_id[0] for err in report.raised_errors
    ]
    assert len(report.raised_errors[0].expression) == 9
def test_empty_df(self):
    """An empty frame raises no sample-size errors.

    The check is run once with the validator's initial params and once
    more after ``missing_sample_size_allowed`` is set to True; the shared
    report must hold no errors after either run.
    """
    # NOTE(review): other tests named `test_empty_df` exist in this file;
    # confirm they live in different classes so none is shadowed.
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    column_names = ["val", "se", "sample_size"]
    no_rows = pd.DataFrame(columns=column_names, dtype=float)

    validator.check_bad_sample_size(no_rows, "", report)
    assert len(report.raised_errors) == 0

    # Second pass: same frame and report, missing sample sizes now allowed.
    validator.params.missing_sample_size_allowed = True
    validator.check_bad_sample_size(no_rows, "", report)
    assert len(report.raised_errors) == 0
def test_single_column_duplicates_but_not_row(self):
    """Values repeated within a column, but no repeated row, warn nothing."""
    rows = [["a", "1"], ["a", "2"], ["b", "2"]]
    validator = StaticValidator(self.params)
    report = ValidationReport([])

    validator.check_duplicate_rows(pd.DataFrame(rows), "file", report)

    warning_count = len(report.raised_warnings)
    assert warning_count == 0