def test_1000x_val(self): validator = DynamicValidator(self.params) report = ValidationReport([]) test_data = { "val": [1, 1, 1, 2000, 0, 1], "se": [np.nan] * 6, "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6 } ref_data = { "val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6, "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6 } test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) validator.check_avg_val_vs_reference( test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal", report) assert len(report.raised_errors) == 1 assert report.raised_errors[ 0].check_name == "check_test_vs_reference_avg_changed"
def test_no_outlier(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])

    # Data from 51580 between 9/24 and 10/26 (10/25 query date)
    ref_val = [30, 30.28571429, 30.57142857, 30.85714286, 31.14285714,
               31.42857143, 31.71428571, 32, 32, 32.14285714, 32.28571429,
               32.42857143, 32.57142857, 32.71428571, 32.85714286, 33, 33,
               33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33]
    test_val = [33, 33, 33]

    ref_data = {"val": ref_val,
                "se": [np.nan] * len(ref_val),
                "sample_size": [np.nan] * len(ref_val),
                "geo_id": ["1"] * len(ref_val),
                "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data = {"val": test_val,
                 "se": [np.nan] * len(test_val),
                 "sample_size": [np.nan] * len(test_val),
                 "geo_id": ["1"] * len(test_val),
                 "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}
    ref_data2 = {"val": ref_val,
                 "se": [np.nan] * len(ref_val),
                 "sample_size": [np.nan] * len(ref_val),
                 "geo_id": ["2"] * len(ref_val),
                 "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data2 = {"val": test_val,
                  "se": [np.nan] * len(test_val),
                  "sample_size": [np.nan] * len(test_val),
                  "geo_id": ["2"] * len(test_val),
                  "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}

    ref_df = pd.concat(
        [pd.DataFrame(ref_data), pd.DataFrame(ref_data2)]).reset_index(drop=True)
    test_df = pd.concat(
        [pd.DataFrame(test_data), pd.DataFrame(test_data2)]).reset_index(drop=True)

    validator.check_positive_negative_spikes(
        test_df, ref_df, "state", "signal", report)

    assert len(report.raised_errors) == 0
def test_same_df(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    test_df = pd.DataFrame([date.today()] * 5, columns=["time_value"])
    ref_df = pd.DataFrame([date.today()] * 5, columns=["time_value"])

    validator.check_rapid_change_num_rows(
        test_df, ref_df, date.today(), "geo", "signal", report)

    assert len(report.raised_errors) == 0
def test_neg_outlier(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])

    # Reference values hover around 100, while the test window drops to 10,
    # which should register as a negative spike.
    ref_val = [100, 101, 100, 101, 100, 100, 100, 100, 100, 100,
               100, 102, 100, 100, 100, 100, 100, 101, 100, 100,
               100, 100, 100, 99, 100, 100, 98, 100, 100, 100]
    test_val = [10, 10, 10]

    ref_data = {"val": ref_val,
                "se": [np.nan] * len(ref_val),
                "sample_size": [np.nan] * len(ref_val),
                "geo_id": ["1"] * len(ref_val),
                "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data = {"val": test_val,
                 "se": [np.nan] * len(test_val),
                 "sample_size": [np.nan] * len(test_val),
                 "geo_id": ["1"] * len(test_val),
                 "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}
    ref_data2 = {"val": ref_val,
                 "se": [np.nan] * len(ref_val),
                 "sample_size": [np.nan] * len(ref_val),
                 "geo_id": ["2"] * len(ref_val),
                 "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data2 = {"val": test_val,
                  "se": [np.nan] * len(test_val),
                  "sample_size": [np.nan] * len(test_val),
                  "geo_id": ["2"] * len(test_val),
                  "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}

    ref_df = pd.concat(
        [pd.DataFrame(ref_data), pd.DataFrame(ref_data2)]).reset_index(drop=True)
    test_df = pd.concat(
        [pd.DataFrame(test_data), pd.DataFrame(test_data2)]).reset_index(drop=True)

    validator.check_positive_negative_spikes(
        test_df, ref_df, "state", "signal", report)

    assert len(report.raised_errors) == 1
    assert report.raised_errors[0].check_name == "check_positive_negative_spikes"
def test_0_vs_many(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])

    time_value = datetime.combine(date.today(), datetime.min.time())

    test_df = pd.DataFrame([time_value] * 5, columns=["time_value"])
    ref_df = pd.DataFrame([time_value] * 1, columns=["time_value"])

    validator.check_rapid_change_num_rows(
        test_df, ref_df, time_value, "geo", "signal", report)

    assert len(report.raised_errors) == 1
    assert report.raised_errors[0].check_name == "check_rapid_change_num_rows"
def test_no_padding(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    data = {"val": [1, 1, 1, 2, 0, 1],
            "se": [np.nan] * 6,
            "sample_size": [np.nan] * 6,
            "geo_id": ["1"] * 6,
            "time_value": pd.date_range(start="2021-01-01", end="2021-01-06")}
    test_df = pd.DataFrame(data)
    ref_df = pd.DataFrame(data)

    new_ref_df = validator.pad_reference_api_df(
        ref_df, test_df, datetime.strptime("2021-01-06", "%Y-%m-%d").date())

    assert new_ref_df.equals(ref_df)
def test_same_val_se_n(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    data = {"val": [1, 1, 1, 2, 0, 1, 1] * 2,
            "se": [1, 1, 1, 2, 0, 1, 1] * 2,
            "sample_size": [1, 1, 1, 2, 0, 1, 1] * 2,
            "geo_id": ["1"] * 14,
            "time_value": ["2021-01-01", "2021-01-02", "2021-01-03",
                           "2021-01-04", "2021-01-05", "2021-01-06",
                           "2021-01-07", "2021-01-08", "2021-01-09",
                           "2021-01-10", "2021-01-11", "2021-01-12",
                           "2021-01-13", "2021-01-14"]}
    test_df = pd.DataFrame(data)
    ref_df = pd.DataFrame(data)

    validator.check_avg_val_vs_reference(
        test_df, ref_df, date.today(), "geo", "signal", report)

    assert len(report.raised_errors) == 0
def test_same_n(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    data = {"val": [np.nan] * 6,
            "se": [np.nan] * 6,
            "sample_size": [1, 1, 1, 2, 0, 1],
            "geo_id": ["1"] * 6}
    test_df = pd.DataFrame(data)
    ref_df = pd.DataFrame(data)

    validator.check_avg_val_vs_reference(
        test_df, ref_df, date.today(), "geo", "signal", report)

    assert len(report.raised_errors) == 0
def test_half_padding(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    ref_data = {"val": [2, 2, 2, 2, 2, 2],
                "se": [np.nan] * 6,
                "sample_size": [np.nan] * 6,
                "geo_id": ["1"] * 6,
                "time_value": pd.date_range(start="2021-01-01", end="2021-01-06")}
    test_data = {"val": [1, 1, 1, 1, 1, 1],
                 "se": [np.nan] * 6,
                 "sample_size": [np.nan] * 6,
                 "geo_id": ["1"] * 6,
                 "time_value": pd.date_range(start="2021-01-06", end="2021-01-11")}
    ref_df = pd.DataFrame(ref_data)
    test_df = pd.DataFrame(test_data)

    new_ref_df = validator.pad_reference_api_df(
        ref_df, test_df, datetime.strptime("2021-01-15", "%Y-%m-%d").date())

    # Check that padding only pulls in the missing dates, i.e. the last 5 days
    # of the test data, extending the reference frame through 2021-01-11.
    assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11", "%Y-%m-%d").date()
    assert new_ref_df.shape[0] == 11
    assert new_ref_df.loc[:, "val"].iloc[5] == 2
def test_1000x_val(self): validator = DynamicValidator(self.params) report = ValidationReport([]) test_data = {"val": [1, 1, 1, 2000, 0, 1, 1]*2, "se": [np.nan] * 14, "sample_size": [np.nan] * 14, "geo_id": ["1"] * 14, "time_value": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07", "2021-01-08", "2021-01-09", "2021-01-10", "2021-01-11", "2021-01-12", "2021-01-13", "2021-01-14"]} ref_data = {"val": [1, 1, 1, 2, 0, 1, 1]*2, "se": [np.nan] * 14, "sample_size": [np.nan] * 14, "geo_id": ["1"] * 14, "time_value": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07", "2021-01-08", "2021-01-09", "2021-01-10", "2021-01-11", "2021-01-12", "2021-01-13", "2021-01-14"]} test_df = pd.DataFrame(test_data) ref_df = pd.DataFrame(ref_data) validator.check_avg_val_vs_reference( test_df, ref_df, datetime.combine(date.today(), datetime.min.time()), "geo", "signal", report) assert len(report.raised_errors) == 1 assert report.raised_errors[0].check_name == "check_test_vs_reference_avg_changed"