def get_test_validations() -> List[DataValidationJob]:
    """Return a fixed list of four existence-check validation jobs.

    One job is produced per (region, view) pair, in the order
    US_UT/test_1, US_UT/test_2, US_VA/test_1, US_VA/test_2. Each job wraps an
    INVARIANT-category ExistenceDataValidationCheck whose view builder points
    at a view in "my_dataset".
    """

    def _existence_job(region_code: str, view_id: str) -> DataValidationJob:
        # A fresh builder and check are created per job so that no objects are
        # shared between the returned jobs.
        builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id=view_id,
            description=f"{view_id} description",
            view_query_template="select * from literally_anything",
        )
        check = ExistenceDataValidationCheck(
            validation_category=ValidationCategory.INVARIANT,
            view_builder=builder,
        )
        return DataValidationJob(region_code=region_code, validation=check)

    return [
        _existence_job(region_code, view_id)
        for region_code in ("US_UT", "US_VA")
        for view_id in ("test_1", "test_2")
    ]
示例#2
0
    def test_sameness_check_numbers_different_values_within_margin(self):
        """A NUMBERS sameness check over one row (98, 100, 99) with an allowed
        error of 0.02 is expected to produce a successful job result."""
        row = {'a': 98, 'b': 100, 'c': 99}
        self.mock_client.run_query_async.return_value = [row]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None)
        self.assertEqual(actual, expected)
    def test_sameness_check_numbers_different_values_within_margin(self) -> None:
        """A NUMBERS sameness check over one row (98, 100, 99) with an allowed
        error of 0.02 is expected to report result details with no failed rows."""
        self.mock_client.run_query_async.return_value = [
            {"a": 98, "b": 100, "c": 99},
        ]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_details = SamenessNumbersValidationResultDetails(
            failed_rows=[], max_allowed_error=0.02
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
示例#4
0
    def test_existence_check_failures(self) -> None:
        """An existence check whose query returns two rows is expected to fail
        with a description reporting 2 invalid rows against 0 expected."""
        returned_rows = ["some result row", "some other result row"]
        self.mock_client.run_query_async.return_value = returned_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = ExistenceValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description="Found [2] invalid rows, though [0] were expected",
        )
        self.assertEqual(actual, expected)
示例#5
0
    def test_existence_check_failures_below_threshold(self) -> None:
        """An existence check configured with num_allowed_rows=2 is expected to
        succeed when the query returns exactly two rows."""
        returned_rows = ["some result row", "some other result row"]
        self.mock_client.run_query_async.return_value = returned_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view,
            num_allowed_rows=2,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = ExistenceValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(actual, expected)
    def test_existence_check_failures(self) -> None:
        """An existence check whose query returns two rows is expected to report
        result details with 2 invalid rows and 0 allowed rows."""
        returned_rows = ["some result row", "some other result row"]
        self.mock_client.run_query_async.return_value = returned_rows

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = ExistenceDataValidationCheck(
            validation_category=ValidationCategory.INVARIANT,
            validation_type=ValidationCheckType.EXISTENCE,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = ExistenceValidationChecker.run_check(job)

        expected_details = ExistenceValidationResultDetails(
            num_invalid_rows=2, num_allowed_rows=0
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
    def test_sameness_check_numbers_multiple_rows_above_margin(self) -> None:
        """A NUMBERS sameness check over two rows with an allowed error of 0.02
        is expected to fail, reporting 2 bad rows and a highest error of 0.3333."""
        first_row = {"a": 97, "b": 100, "c": 99}
        second_row = {"a": 14, "b": 21, "c": 14}
        self.mock_client.run_query_async.return_value = [first_row, second_row]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            "2 row(s) had unacceptable margins of error. The acceptable margin "
            "of error is only 0.02, but the validation returned rows with "
            "errors as high as 0.3333."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
    def test_string_sameness_check_strings_values_all_none(self) -> None:
        """A STRINGS sameness check over a single row whose compared columns are
        all None is expected to produce a successful job result."""
        all_none_row = {"a": None, "b": None, "c": None}
        self.mock_client.run_query_async.return_value = [all_none_row]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(actual, expected)
    def test_string_sameness_check_different_values_handle_non_string_type(
        self,
    ) -> None:
        """A STRINGS sameness check over a row containing an int value is
        expected to raise ValueError naming the offending type and value."""
        mixed_type_row = {"a": "same", "b": "same", "c": 1245}
        self.mock_client.run_query_async.return_value = [mixed_type_row]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = "Unexpected type [<class 'int'>] for value [1245] in STRING validation [test_view]."
        self.assertEqual(str(raised.exception), expected_message)
示例#10
0
    def test_string_sameness_check_different_values_above_margin(self):
        """A STRINGS sameness check is expected to fail when the mocked query
        yields 5 bad rows while the allowed error is set to (5 - 1) / 100."""
        bad_row_count = 5
        # Deliberately one row short of the number of bad rows.
        allowed_error = (bad_row_count - 1) / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(bad_row_count)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = SamenessValidationChecker.run_check(job)

        observed_error_rate = bad_row_count / 100
        expected_description = (
            f'{bad_row_count} out of 100 row(s) did not contain matching strings. '
            f'The acceptable margin of error is only {allowed_error}, but the '
            f'validation returned an error rate of {observed_error_rate}.'
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
示例#11
0
    def test_string_sameness_check_different_values_handle_non_string_type(
            self):
        """A STRINGS sameness check over a row containing an int value is
        expected to raise ValueError naming the offending type and value."""
        mixed_type_row = {'a': 'same', 'b': 'same', 'c': 1245}
        self.mock_client.run_query_async.return_value = [mixed_type_row]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = 'Unexpected type [<class \'int\'>] for value [1245] in STRING validation [test_view].'
        self.assertEqual(str(raised.exception), expected_message)
示例#12
0
    def test_string_sameness_check_different_values_handle_empty_string(self):
        """A STRINGS sameness check over one row where a single column is None
        is expected to fail with an error rate of 1.0 against a margin of 0.0."""
        partially_null_row = {'a': 'same', 'b': 'same', 'c': None}
        self.mock_client.run_query_async.return_value = [partially_null_row]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            '1 out of 1 row(s) did not contain matching strings. '
            'The acceptable margin of error is only 0.0, but the '
            'validation returned an error rate of 1.0.'
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
示例#13
0
    def test_string_sameness_check_different_values_within_margin(self) -> None:
        """A STRINGS sameness check is expected to succeed when the mocked query
        yields 2 bad rows and the allowed error is set to 2 / 100."""
        bad_row_count = 2
        allowed_error = bad_row_count / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(bad_row_count)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None,
        )
        self.assertEqual(actual, expected)
示例#14
0
    def test_string_sameness_check_different_values_above_margin(self) -> None:
        """A STRINGS sameness check is expected to fail when the mocked query
        yields 5 bad rows while the allowed error is set to (5 - 1) / 100."""
        bad_row_count = 5
        # Deliberately one row short of the number of bad rows.
        allowed_error = (bad_row_count - 1) / 100

        mocked_rows = self.return_string_values_with_num_bad_rows(bad_row_count)
        self.mock_client.run_query_async.return_value = mocked_rows

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=allowed_error,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        observed_error_rate = bad_row_count / 100
        expected_description = (
            f"{bad_row_count} out of 100 row(s) did not contain matching strings. "
            f"The acceptable margin of error is only {allowed_error}, but the "
            f"validation returned an error rate of {observed_error_rate}."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
示例#15
0
    def test_sameness_check_numbers_multiple_rows_above_margin(self):
        """A NUMBERS sameness check over two rows with an allowed error of 0.02
        is expected to fail, reporting 2 bad rows and a highest error of 0.3333."""
        first_row = {'a': 97, 'b': 100, 'c': 99}
        second_row = {'a': 14, 'b': 21, 'c': 14}
        self.mock_client.run_query_async.return_value = [first_row, second_row]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            max_allowed_error=0.02,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            '2 row(s) had unacceptable margins of error. The acceptable margin '
            'of error is only 0.02, but the validation returned rows with '
            'errors as high as 0.3333.'
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
示例#16
0
    def test_sameness_check_strings_different_values_no_allowed_error(self) -> None:
        """A STRINGS sameness check with no explicit allowed error, run over one
        row of three different strings, is expected to report 1 error row out
        of 1 with max_allowed_error of 0.0."""
        mismatched_row = {"a": "a", "b": "b", "c": "c"}
        self.mock_client.run_query_async.return_value = [mismatched_row]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            partition_columns=[],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        # With no partition columns the only partition key is the empty tuple.
        expected_counts = [((), {"a": 1, "b": 1, "c": 1})]
        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=1,
            total_num_rows=1,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=expected_counts,
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
示例#17
0
    def test_string_sameness_check_numbers_values_all_none(self) -> None:
        """A NUMBERS sameness check over a row whose compared columns are all
        None is expected to raise ValueError naming column [a]."""
        all_none_row = {"a": None, "b": None, "c": None}
        self.mock_client.run_query_async.return_value = [all_none_row]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            "Unexpected None value for column [a] in validation [test_view]."
        )
        self.assertEqual(str(raised.exception), expected_message)
示例#18
0
    def test_string_sameness_check_different_values_handle_empty_string(self) -> None:
        """A STRINGS sameness check over one row where a single column is None
        is expected to fail with an error rate of 1.0 against a margin of 0.0."""
        partially_null_row = {"a": "same", "b": "same", "c": None}
        self.mock_client.run_query_async.return_value = [partially_null_row]

        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_description = (
            "1 out of 1 row(s) did not contain matching strings. "
            "The acceptable margin of error is only 0.0, but the "
            "validation returned an error rate of 1.0."
        )
        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description=expected_description,
        )
        self.assertEqual(actual, expected)
示例#19
0
def _fetch_validation_jobs_to_perform(
    region_code_filter: Optional[str] = None,
    validation_name_filter: Optional[Pattern] = None,
    dataset_overrides: Optional[Dict[str, str]] = None,
) -> List[DataValidationJob]:
    """Build the validation jobs to run, one per eligible (validation, region) pair.

    A validation is skipped when its name is listed in the global config's
    `disabled` collection, or when `validation_name_filter` is given and
    `re.search` finds no match in the validation name. Within a validation, a
    region is skipped when `region_code_filter` is truthy and differs from the
    region code, or when the validation name is in that region's `exclusions`.

    Args:
        region_code_filter: If truthy, only this region code produces jobs.
        validation_name_filter: If not None, a pattern the validation name must
            match (via `re.search`) to be included.
        dataset_overrides: Passed through unchanged to every created job.

    Returns:
        The jobs, ordered by validation and then by region config iteration
        order. Each job carries the check as returned by
        `check.updated_for_region(region_config)`.
    """
    all_checks = get_all_validations()
    configs_by_region = get_validation_region_configs()
    global_config = get_validation_global_config()

    jobs: List[DataValidationJob] = []
    for check in all_checks:
        if check.validation_name in global_config.disabled:
            continue
        if (
            validation_name_filter is not None
            and re.search(validation_name_filter, check.validation_name) is None
        ):
            continue

        for region_code, region_config in configs_by_region.items():
            if region_code_filter and region_code != region_code_filter:
                continue
            if check.validation_name in region_config.exclusions:
                continue

            region_check = check.updated_for_region(region_config)
            job = DataValidationJob(
                validation=region_check,
                region_code=region_code,
                dataset_overrides=dataset_overrides,
            )
            jobs.append(job)

    return jobs
示例#20
0
    def test_sameness_check_numbers_one_none(self) -> None:
        """A NUMBERS sameness check over a row where only column c is None is
        expected to raise ValueError naming column [c]."""
        row_with_none = {"a": 3, "b": 3, "c": None}
        self.mock_client.run_query_async.return_value = [row_with_none]

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_AGGREGATE,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b", "c"],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = (
            "Unexpected None value for column [c] in validation [test_view]."
        )
        self.assertEqual(str(raised.exception), expected_message)
示例#21
0
def _fetch_validation_jobs_to_perform() -> List[DataValidationJob]:
    """Build one validation job per (validation, state) pair.

    Pairs every check returned by `get_all_validations()` with every entry of
    `STATES_TO_VALIDATE`; jobs are ordered by check first, then by state.
    """
    return [
        DataValidationJob(validation=check, region_code=state_code)
        for check in get_all_validations()
        for state_code in STATES_TO_VALIDATE
    ]
示例#22
0
def test_check_happy_path_existence():
    """A job whose check has validation_type EXISTENCE is expected to resolve
    to an ExistenceValidationChecker instance."""
    view = BigQueryView(
        dataset_id='my_dataset',
        view_id='test_view',
        view_query_template='select * from literally_anything')
    check = DataValidationCheck(
        validation_type=ValidationCheckType.EXISTENCE,
        view=view)
    job = DataValidationJob(region_code='US_VA', validation=check)

    resolved_checker = check_resolver.checker_for_validation(job)

    assert isinstance(resolved_checker, ExistenceValidationChecker)
    def test_validation_job_returns_correct_query(self) -> None:
        """query_str() is expected to select from the view's own dataset by
        default, and from the override dataset when dataset_overrides maps the
        view's dataset to another one."""
        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_2",
            description="test_2 description",
            view_query_template="select * from literally_anything",
        )
        overrides = {"my_dataset": "my_dataset_override"}

        # Case 1: no overrides supplied.
        job_without_overrides = DataValidationJob(
            validation=ExistenceDataValidationCheck(
                view_builder=view_builder,
                validation_category=ValidationCategory.INVARIANT,
            ),
            region_code="US_XX",
        )
        expected_default_query = "SELECT * FROM `recidiviz-456.my_dataset.test_2` WHERE region_code = 'US_XX';"
        self.assertEqual(expected_default_query, job_without_overrides.query_str())

        # Case 2: the view's dataset is remapped through dataset_overrides.
        job_with_overrides = DataValidationJob(
            validation=ExistenceDataValidationCheck(
                view_builder=view_builder,
                validation_category=ValidationCategory.INVARIANT,
            ),
            region_code="US_XX",
            dataset_overrides=overrides,
        )
        expected_override_query = "SELECT * FROM `recidiviz-456.my_dataset_override.test_2` WHERE region_code = 'US_XX';"
        self.assertEqual(expected_override_query, job_with_overrides.query_str())
示例#24
0
    def test_existence_check_no_failures(self):
        """An existence check whose query returns no rows is expected to produce
        a successful job result."""
        self.mock_client.run_query_async.return_value = []

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = ExistenceValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None)
        self.assertEqual(actual, expected)
示例#25
0
    def test_check_happy_path_existence(self) -> None:
        """A job wrapping an ExistenceDataValidationCheck is expected to resolve
        to an ExistenceValidationChecker instance."""
        view = BigQueryView(
            dataset_id="my_dataset",
            view_id="test_view",
            view_query_template="select * from literally_anything",
        )
        check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view,
        )
        job = DataValidationJob(region_code="US_VA", validation=check)

        resolved_checker = check_resolver.checker_for_validation(job)

        assert isinstance(resolved_checker, ExistenceValidationChecker)
示例#26
0
    def test_existence_check_failures(self):
        """An existence check whose query returns two rows is expected to fail
        with a description reporting 2 invalid rows against 0 expected."""
        returned_rows = ['some result row', 'some other result row']
        self.mock_client.run_query_async.return_value = returned_rows

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = ExistenceDataValidationCheck(
            validation_type=ValidationCheckType.EXISTENCE,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = ExistenceValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=False,
            failure_description='Found 2 invalid rows, though 0 were expected')
        self.assertEqual(actual, expected)
示例#27
0
    def test_sameness_check_strings_multiple_dates(self) -> None:
        """A STRINGS sameness check partitioned by (region, date) over seven rows
        spanning two dates is expected to report 3 error rows out of 7, with
        non-null counts listed per partition."""
        january_2021_rows = [
            {"region": "US_XX", "date": "2021-01-31", "a": "00", "b": "00"},
            {"region": "US_XX", "date": "2021-01-31", "a": "01", "b": None},
            {"region": "US_XX", "date": "2021-01-31", "a": "02", "b": "02"},
            {"region": "US_XX", "date": "2021-01-31", "a": None, "b": "03"},
        ]
        december_2020_rows = [
            {"region": "US_XX", "date": "2020-12-31", "a": "00", "b": "00"},
            {"region": "US_XX", "date": "2020-12-31", "a": "02", "b": "02"},
            {"region": "US_XX", "date": "2020-12-31", "a": None, "b": "04"},
        ]
        self.mock_client.run_query_async.return_value = (
            january_2021_rows + december_2020_rows
        )

        view_builder = SimpleBigQueryViewBuilder(
            dataset_id="my_dataset",
            view_id="test_view",
            description="test_view description",
            view_query_template="select * from literally_anything",
        )
        check = SamenessDataValidationCheck(
            validation_category=ValidationCategory.EXTERNAL_INDIVIDUAL,
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=["a", "b"],
            partition_columns=["region", "date"],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            max_allowed_error=0.0,
            view_builder=view_builder,
        )
        job = DataValidationJob(region_code="US_XX", validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected_counts = [
            (("US_XX", "2021-01-31"), {"a": 3, "b": 3}),
            (("US_XX", "2020-12-31"), {"a": 2, "b": 3}),
        ]
        expected_details = SamenessStringsValidationResultDetails(
            num_error_rows=3,
            total_num_rows=7,
            max_allowed_error=0.0,
            non_null_counts_per_column_per_partition=expected_counts,
        )
        expected = DataValidationJobResult(
            validation_job=job, result_details=expected_details
        )
        self.assertEqual(actual, expected)
示例#28
0
def _fetch_validation_jobs_to_perform(
    region_code_filter: Optional[str] = None, ) -> List[DataValidationJob]:
    """Build the validation jobs to run, one per eligible (validation, region) pair.

    A validation is skipped when its name is listed in the global config's
    `disabled` collection. Within a validation, a region is skipped when
    `region_code_filter` is truthy and differs from the region code, or when
    the validation name is in that region's `exclusions`.

    Args:
        region_code_filter: If truthy, only this region code produces jobs.

    Returns:
        The jobs, ordered by validation and then by region config iteration
        order. Each job carries the check as returned by
        `check.updated_for_region(region_config)`.
    """
    all_checks = get_all_validations()
    configs_by_region = get_validation_region_configs()
    global_config = get_validation_global_config()

    jobs: List[DataValidationJob] = []
    for check in all_checks:
        if check.validation_name in global_config.disabled:
            continue

        for region_code, region_config in configs_by_region.items():
            if region_code_filter and region_code != region_code_filter:
                continue
            if check.validation_name in region_config.exclusions:
                continue

            region_check = check.updated_for_region(region_config)
            jobs.append(
                DataValidationJob(validation=region_check,
                                  region_code=region_code))

    return jobs
示例#29
0
    def test_string_sameness_check_strings_values_all_none(self):
        """A STRINGS sameness check over a single row whose compared columns are
        all None is expected to produce a successful job result."""
        all_none_row = {'a': None, 'b': None, 'c': None}
        self.mock_client.run_query_async.return_value = [all_none_row]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.STRINGS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        actual = SamenessValidationChecker.run_check(job)

        expected = DataValidationJobResult(
            validation_job=job,
            was_successful=True,
            failure_description=None)
        self.assertEqual(actual, expected)
示例#30
0
    def test_string_sameness_check_numbers_one_none(self):
        """A NUMBERS sameness check over a row where only column c is None is
        expected to raise ValueError naming column [c]."""
        row_with_none = {'a': 3, 'b': 3, 'c': None}
        self.mock_client.run_query_async.return_value = [row_with_none]

        view = BigQueryView(
            dataset_id='my_dataset',
            view_id='test_view',
            view_query_template='select * from literally_anything')
        check = SamenessDataValidationCheck(
            validation_type=ValidationCheckType.SAMENESS,
            comparison_columns=['a', 'b', 'c'],
            sameness_check_type=SamenessDataValidationCheckType.NUMBERS,
            view=view)
        job = DataValidationJob(region_code='US_VA', validation=check)

        with self.assertRaises(ValueError) as raised:
            SamenessValidationChecker.run_check(job)

        expected_message = 'Unexpected None value for column [c] in validation [test_view].'
        self.assertEqual(str(raised.exception), expected_message)