def get_test_validations() -> List[DataValidationJob]:
    """Build the fixed list of existence-check jobs used as test input.

    One job is produced for every combination of the two region codes and
    the two views, ordered by region first and by view second.
    """
    region_codes = ("US_UT", "US_VA")
    views = (
        ("test_1", "test_1 description"),
        ("test_2", "test_2 description"),
    )
    return [
        DataValidationJob(
            region_code=region_code,
            validation=ExistenceDataValidationCheck(
                validation_category=ValidationCategory.INVARIANT,
                view_builder=SimpleBigQueryViewBuilder(
                    dataset_id="my_dataset",
                    view_id=view_id,
                    description=description,
                    view_query_template="select * from literally_anything",
                ),
            ),
        )
        for region_code in region_codes
        for view_id, description in views
    ]
def test_sameness_check_numbers_different_values_within_margin(self):
    """A NUMBERS row of 98/100/99 with a 0.02 allowed error should succeed."""
    self.mock_client.run_query_async.return_value = [{'a': 98, 'b': 100, 'c': 99}]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=True,
        failure_description=None)
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_different_values_within_margin(self) -> None:
    """A NUMBERS row of 98/100/99 with a 0.02 allowed error yields no failed rows."""
    self.mock_client.run_query_async.return_value = [{"a": 98, "b": 100, "c": 99}]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_details = SamenessNumbersValidationResultDetails(
        failed_rows=[], max_allowed_error=0.02
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_existence_check_failures(self) -> None:
    """Two returned rows with no allowance should produce a failed existence result."""
    self.mock_client.run_query_async.return_value = [
        "some result row",
        "some other result row",
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = ExistenceDataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = ExistenceValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description="Found [2] invalid rows, though [0] were expected",
    )
    self.assertEqual(actual, expected)
def test_existence_check_failures_below_threshold(self) -> None:
    """Two returned rows should still succeed when num_allowed_rows is 2."""
    self.mock_client.run_query_async.return_value = [
        "some result row",
        "some other result row",
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = ExistenceDataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view,
        num_allowed_rows=2,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = ExistenceValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=True,
        failure_description=None,
    )
    self.assertEqual(actual, expected)
def test_existence_check_failures(self) -> None:
    """Two returned rows should be reported as 2 invalid rows against 0 allowed."""
    self.mock_client.run_query_async.return_value = [
        "some result row",
        "some other result row",
    ]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = ExistenceDataValidationCheck(
        validation_category=ValidationCategory.INVARIANT,
        validation_type=ValidationCheckType.EXISTENCE,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = ExistenceValidationChecker.run_check(job)

    expected_details = ExistenceValidationResultDetails(
        num_invalid_rows=2, num_allowed_rows=0
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
    """Two NUMBERS rows outside the 0.02 margin should produce a failed result."""
    self.mock_client.run_query_async.return_value = [
        {"a": 97, "b": 100, "c": 99},
        {"a": 14, "b": 21, "c": 14},
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        "2 row(s) had unacceptable margins of error. The acceptable margin "
        "of error is only 0.02, but the validation returned rows with "
        "errors as high as 0.3333."
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description,
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_strings_values_all_none(self) -> None:
    """A STRINGS row whose compared values are all None should succeed."""
    self.mock_client.run_query_async.return_value = [
        {"a": None, "b": None, "c": None}
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job, was_successful=True, failure_description=None
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_handle_non_string_type(
    self,
) -> None:
    """An int value in a STRINGS comparison should raise a ValueError."""
    self.mock_client.run_query_async.return_value = [
        {"a": "same", "b": "same", "c": 1245}
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    with self.assertRaises(ValueError) as raised:
        _ = SamenessValidationChecker.run_check(job)

    # Inspect the message only once the context manager has captured the error.
    self.assertEqual(
        str(raised.exception),
        "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view].",
    )
def test_string_sameness_check_different_values_above_margin(self):
    """Five bad rows with an allowed error of 0.04 should produce a failed result."""
    num_bad_rows = 5
    # Below the number of bad rows
    max_allowed_error = (num_bad_rows - 1) / 100
    actual_expected_error = num_bad_rows / 100

    self.mock_client.run_query_async.return_value = \
        self.return_string_values_with_num_bad_rows(num_bad_rows)

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        f'{num_bad_rows} out of 100 row(s) did not contain matching strings. '
        f'The acceptable margin of error is only {max_allowed_error}, but the '
        f'validation returned an error rate of {actual_expected_error}.'
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description)
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_handle_non_string_type(self):
    """An int value in a STRINGS comparison should raise a ValueError."""
    self.mock_client.run_query_async.return_value = [
        {'a': 'same', 'b': 'same', 'c': 1245},
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    with self.assertRaises(ValueError) as raised:
        _ = SamenessValidationChecker.run_check(job)

    # Inspect the message only once the context manager has captured the error.
    self.assertEqual(
        str(raised.exception),
        'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].')
def test_string_sameness_check_different_values_handle_empty_string(self):
    """A STRINGS row with one None among equal values should count as an error row."""
    self.mock_client.run_query_async.return_value = [
        {'a': 'same', 'b': 'same', 'c': None},
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        '1 out of 1 row(s) did not contain matching strings. '
        'The acceptable margin of error is only 0.0, but the '
        'validation returned an error rate of 1.0.'
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description)
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_within_margin(self) -> None:
    """Two bad rows with an allowed error of 0.02 should still succeed."""
    num_bad_rows = 2
    max_allowed_error = num_bad_rows / 100

    mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
    self.mock_client.run_query_async.return_value = mocked_rows

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job, was_successful=True, failure_description=None
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_different_values_above_margin(self) -> None:
    """Five bad rows with an allowed error of 0.04 should produce a failed result."""
    num_bad_rows = 5
    # Below the number of bad rows
    max_allowed_error = (num_bad_rows - 1) / 100
    actual_expected_error = num_bad_rows / 100

    mocked_rows = self.return_string_values_with_num_bad_rows(num_bad_rows)
    self.mock_client.run_query_async.return_value = mocked_rows

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=max_allowed_error,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        f"{num_bad_rows} out of 100 row(s) did not contain matching strings. "
        f"The acceptable margin of error is only {max_allowed_error}, but the "
        f"validation returned an error rate of {actual_expected_error}."
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description,
    )
    self.assertEqual(actual, expected)
def test_sameness_check_numbers_multiple_rows_above_margin(self):
    """Two NUMBERS rows outside the 0.02 margin should produce a failed result."""
    self.mock_client.run_query_async.return_value = [
        {'a': 97, 'b': 100, 'c': 99},
        {'a': 14, 'b': 21, 'c': 14},
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        max_allowed_error=0.02,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        '2 row(s) had unacceptable margins of error. The acceptable margin '
        'of error is only 0.02, but the validation returned rows with '
        'errors as high as 0.3333.'
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description)
    self.assertEqual(actual, expected)
def test_sameness_check_strings_different_values_no_allowed_error(self) -> None:
    """One STRINGS row with three distinct values should be counted as one error row."""
    self.mock_client.run_query_async.return_value = [{"a": "a", "b": "b", "c": "c"}]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        partition_columns=[],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_details = SamenessStringsValidationResultDetails(
        num_error_rows=1,
        total_num_rows=1,
        max_allowed_error=0.0,
        non_null_counts_per_column_per_partition=[
            (tuple(), {"a": 1, "b": 1, "c": 1}),
        ],
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def test_string_sameness_check_numbers_values_all_none(self) -> None:
    """A NUMBERS row of all None values should raise a ValueError naming column a."""
    self.mock_client.run_query_async.return_value = [
        {"a": None, "b": None, "c": None}
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    with self.assertRaises(ValueError) as raised:
        _ = SamenessValidationChecker.run_check(job)

    # Inspect the message only once the context manager has captured the error.
    self.assertEqual(
        str(raised.exception),
        "Unexpected None value for column [a] in validation [test_view].",
    )
def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
    """A STRINGS row with one None among equal values should count as an error row."""
    self.mock_client.run_query_async.return_value = [
        {"a": "same", "b": "same", "c": None}
    ]

    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_description = (
        "1 out of 1 row(s) did not contain matching strings. "
        "The acceptable margin of error is only 0.0, but the "
        "validation returned an error rate of 1.0."
    )
    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description=expected_description,
    )
    self.assertEqual(actual, expected)
def _fetch_validation_jobs_to_perform(
    region_code_filter: Optional[str] = None,
    validation_name_filter: Optional[Pattern] = None,
    dataset_overrides: Optional[Dict[str, str]] = None,
) -> List[DataValidationJob]:
    """Assemble one validation job per eligible (validation, region) pair.

    A validation is skipped entirely when its name is listed as disabled in
    the global config, or when a name filter is given and does not match the
    name. A region is skipped when a region filter is given and differs from
    the region code, or when the region config lists the validation in its
    exclusions. Each resulting job carries the region-updated check, the
    region code, and the supplied dataset overrides.
    """
    all_checks = get_all_validations()
    configs_by_region = get_validation_region_configs()
    global_config = get_validation_global_config()

    jobs: List[DataValidationJob] = []
    for check in all_checks:
        if check.validation_name in global_config.disabled:
            continue

        if validation_name_filter is not None:
            if not re.search(validation_name_filter, check.validation_name):
                continue

        for region_code, region_config in configs_by_region.items():
            if region_code_filter and region_code != region_code_filter:
                continue
            if check.validation_name in region_config.exclusions:
                continue

            region_check = check.updated_for_region(region_config)
            job = DataValidationJob(
                validation=region_check,
                region_code=region_code,
                dataset_overrides=dataset_overrides,
            )
            jobs.append(job)

    return jobs
def test_sameness_check_numbers_one_none(self) -> None:
    """A NUMBERS row with None in column c should raise a ValueError naming that column."""
    self.mock_client.run_query_async.return_value = [{"a": 3, "b": 3, "c": None}]

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b", "c"],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    with self.assertRaises(ValueError) as raised:
        _ = SamenessValidationChecker.run_check(job)

    # Inspect the message only once the context manager has captured the error.
    self.assertEqual(
        str(raised.exception),
        "Unexpected None value for column [c] in validation [test_view].",
    )
def _fetch_validation_jobs_to_perform() -> List[DataValidationJob]:
    """Return one job for every pairing of a validation with a state in STATES_TO_VALIDATE.

    Jobs are ordered by validation first and by state second.
    """
    all_checks = get_all_validations()
    return [
        DataValidationJob(validation=check, region_code=state_code)
        for check in all_checks
        for state_code in STATES_TO_VALIDATE
    ]
def test_check_happy_path_existence():
    """The resolver should return an ExistenceValidationChecker for an EXISTENCE job."""
    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = DataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    resolved_checker = check_resolver.checker_for_validation(job)

    assert isinstance(resolved_checker, ExistenceValidationChecker)
def test_validation_job_returns_correct_query(self) -> None:
    """query_str() should target the original dataset, or its override when one is given."""
    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_2",
        description="test_2 description",
        view_query_template="select * from literally_anything",
    )
    overrides = {"my_dataset": "my_dataset_override"}

    # Without overrides the query reads from the builder's own dataset.
    job_without_overrides = DataValidationJob(
        validation=ExistenceDataValidationCheck(
            view_builder=view_builder,
            validation_category=ValidationCategory.INVARIANT,
        ),
        region_code="US_XX",
    )
    self.assertEqual(
        "SELECT * FROM `recidiviz-456.my_dataset.test_2` WHERE region_code = 'US_XX';",
        job_without_overrides.query_str(),
    )

    # With overrides the dataset name in the query is swapped for the override.
    job_with_overrides = DataValidationJob(
        validation=ExistenceDataValidationCheck(
            view_builder=view_builder,
            validation_category=ValidationCategory.INVARIANT,
        ),
        region_code="US_XX",
        dataset_overrides=overrides,
    )
    self.assertEqual(
        "SELECT * FROM `recidiviz-456.my_dataset_override.test_2` WHERE region_code = 'US_XX';",
        job_with_overrides.query_str(),
    )
def test_existence_check_no_failures(self):
    """An empty query result should produce a successful existence result."""
    self.mock_client.run_query_async.return_value = []

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = ExistenceDataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = ExistenceValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=True,
        failure_description=None)
    self.assertEqual(actual, expected)
def test_check_happy_path_existence(self) -> None:
    """The resolver should return an ExistenceValidationChecker for an EXISTENCE job."""
    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="test_view",
        view_query_template="select * from literally_anything",
    )
    check = ExistenceDataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view,
    )
    job = DataValidationJob(region_code="US_VA", validation=check)

    resolved_checker = check_resolver.checker_for_validation(job)

    assert isinstance(resolved_checker, ExistenceValidationChecker)
def test_existence_check_failures(self):
    """Two returned rows with no allowance should produce a failed existence result."""
    self.mock_client.run_query_async.return_value = [
        'some result row',
        'some other result row',
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = ExistenceDataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = ExistenceValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=False,
        failure_description='Found 2 invalid rows, though 0 were expected')
    self.assertEqual(actual, expected)
def test_sameness_check_strings_multiple_dates(self) -> None:
    """STRINGS check partitioned by region and date across two dates.

    Seven rows are mocked, and three of them are expected to be reported as
    errors. Non-null counts per column are expected for each partition.
    """
    january_rows = [
        # January 2021
        {"region": "US_XX", "date": "2021-01-31", "a": "00", "b": "00"},
        {"region": "US_XX", "date": "2021-01-31", "a": "01", "b": None},
        {"region": "US_XX", "date": "2021-01-31", "a": "02", "b": "02"},
        {"region": "US_XX", "date": "2021-01-31", "a": None, "b": "03"},
    ]
    december_rows = [
        # December 2020
        {"region": "US_XX", "date": "2020-12-31", "a": "00", "b": "00"},
        {"region": "US_XX", "date": "2020-12-31", "a": "02", "b": "02"},
        {"region": "US_XX", "date": "2020-12-31", "a": None, "b": "04"},
    ]
    self.mock_client.run_query_async.return_value = january_rows + december_rows

    view_builder = SimpleBigQueryViewBuilder(
        dataset_id="my_dataset",
        view_id="test_view",
        description="test_view description",
        view_query_template="select * from literally_anything",
    )
    check = SamenessDataValidationCheck(
        validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=["a", "b"],
        partition_columns=["region", "date"],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        max_allowed_error=0.0,
        view_builder=view_builder,
    )
    job = DataValidationJob(region_code="US_XX", validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected_details = SamenessStringsValidationResultDetails(
        num_error_rows=3,
        total_num_rows=7,
        max_allowed_error=0.0,
        non_null_counts_per_column_per_partition=[
            (("US_XX", "2021-01-31"), {"a": 3, "b": 3}),
            (("US_XX", "2020-12-31"), {"a": 2, "b": 3}),
        ],
    )
    expected = DataValidationJobResult(
        validation_job=job, result_details=expected_details
    )
    self.assertEqual(actual, expected)
def _fetch_validation_jobs_to_perform(
    region_code_filter: Optional[str] = None,
) -> List[DataValidationJob]:
    """Assemble one validation job per eligible (validation, region) pair.

    A validation is skipped entirely when its name is listed as disabled in
    the global config. A region is skipped when a region filter is given and
    differs from the region code, or when the region config lists the
    validation in its exclusions. Each job carries the region-updated check.
    """
    all_checks = get_all_validations()
    configs_by_region = get_validation_region_configs()
    global_config = get_validation_global_config()

    jobs: List[DataValidationJob] = []
    for check in all_checks:
        if check.validation_name in global_config.disabled:
            continue

        for region_code, region_config in configs_by_region.items():
            if region_code_filter and region_code != region_code_filter:
                continue
            if check.validation_name in region_config.exclusions:
                continue

            region_check = check.updated_for_region(region_config)
            jobs.append(
                DataValidationJob(validation=region_check, region_code=region_code)
            )

    return jobs
def test_string_sameness_check_strings_values_all_none(self):
    """A STRINGS row whose compared values are all None should succeed."""
    self.mock_client.run_query_async.return_value = [
        {'a': None, 'b': None, 'c': None},
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.STRINGS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    actual = SamenessValidationChecker.run_check(job)

    expected = DataValidationJobResult(
        validation_job=job,
        was_successful=True,
        failure_description=None)
    self.assertEqual(actual, expected)
def test_string_sameness_check_numbers_one_none(self):
    """A NUMBERS row with None in column c should raise a ValueError naming that column."""
    self.mock_client.run_query_async.return_value = [
        {'a': 3, 'b': 3, 'c': None},
    ]

    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = SamenessDataValidationCheck(
        validation_type=ValidationCheckType.SAMENESS,
        comparison_columns=['a', 'b', 'c'],
        sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    with self.assertRaises(ValueError) as raised:
        _ = SamenessValidationChecker.run_check(job)

    # Inspect the message only once the context manager has captured the error.
    self.assertEqual(
        str(raised.exception),
        'Unexpected None value for column [c] in validation [test_view].')