def test_parse_validation_graph_with_bad_metrics_args():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    validator = Validator(execution_engine=engine)
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            execution_engine=engine,
        )
        for metric_configuration in validation_dependencies["metrics"].values():
            validator.build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    # Deliberately pass a tuple rather than the expected dict of resolved
    # metrics to confirm parsing still partitions the graph correctly.
    ready_metrics, needed_metrics = validator._parse_validation_graph(
        validation_graph=graph, metrics=("nonexistent", "NONE")
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
def get_runtime_kwargs(self, runtime_configuration=None):
    expectation_kwargs_dict = self.kwarg_lookup_dict.get(
        self.expectation_type, None
    )
    if expectation_kwargs_dict is None:
        impl = get_expectation_impl(self.expectation_type)
        if impl is not None:
            runtime_keys = impl.runtime_keys
            default_kwarg_values = impl.default_kwarg_values
        else:
            expectation_kwargs_dict = self._get_default_custom_kwargs()
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            runtime_keys = self.runtime_kwargs
    else:
        default_kwarg_values = expectation_kwargs_dict.get(
            "default_kwarg_values", dict()
        )
        runtime_keys = self.runtime_kwargs

    success_kwargs = self.get_success_kwargs()
    lookup_kwargs = deepcopy(self.kwargs)
    if runtime_configuration:
        lookup_kwargs.update(runtime_configuration)

    runtime_kwargs = {
        key: lookup_kwargs.get(key, default_kwarg_values.get(key))
        for key in runtime_keys
    }
    runtime_kwargs["result_format"] = parse_result_format(
        runtime_kwargs["result_format"]
    )
    runtime_kwargs.update(success_kwargs)
    return runtime_kwargs
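# A minimal usage sketch, not part of the method above; it assumes the
# import path below and the stock "expect_column_values_to_be_in_set"
# expectation. It illustrates the precedence rule get_runtime_kwargs()
# implements: values supplied via runtime_configuration shadow configured
# kwargs before result_format is normalized into a dict.
#
# from great_expectations.core import ExpectationConfiguration
#
# config = ExpectationConfiguration(
#     expectation_type="expect_column_values_to_be_in_set",
#     kwargs={"column": "a", "value_set": [1, 2, 3], "result_format": "BASIC"},
# )
# runtime_kwargs = config.get_runtime_kwargs(
#     runtime_configuration={"result_format": "COMPLETE"}
# )
# assert runtime_kwargs["result_format"]["result_format"] == "COMPLETE"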
def test_parse_validation_graph():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)
        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict()
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
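# A reading of the _parse_validation_graph tests in this file:
# _parse_validation_graph() partitions the graph's metric vertices into
# "ready" metrics (every dependency already present in the supplied
# `metrics` store) and "needed" metrics (still blocked on unresolved
# dependencies). With an empty store, the two ready metrics are
# presumably the metrics with no dependencies of their own, and the
# remaining nine stay needed until their parents resolve.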
def get_success_kwargs(self):
    expectation_kwargs_dict = self.kwarg_lookup_dict.get(
        self.expectation_type, None
    )
    if expectation_kwargs_dict is None:
        impl = get_expectation_impl(self.expectation_type)
        if impl is not None:
            success_keys = impl.success_keys
            default_kwarg_values = impl.default_kwarg_values
        else:
            expectation_kwargs_dict = self._get_default_custom_kwargs()
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            success_keys = expectation_kwargs_dict["success_kwargs"]
    else:
        default_kwarg_values = expectation_kwargs_dict.get(
            "default_kwarg_values", dict()
        )
        success_keys = expectation_kwargs_dict["success_kwargs"]

    domain_kwargs = self.get_domain_kwargs()
    success_kwargs = {
        key: self.kwargs.get(key, default_kwarg_values.get(key))
        for key in success_keys
    }
    success_kwargs.update(domain_kwargs)
    return success_kwargs
def get_domain_kwargs(self):
    expectation_kwargs_dict = self.kwarg_lookup_dict.get(
        self.expectation_type, None
    )
    if expectation_kwargs_dict is None:
        impl = get_expectation_impl(self.expectation_type)
        if impl is not None:
            domain_keys = impl.domain_keys
            default_kwarg_values = impl.default_kwarg_values
        else:
            expectation_kwargs_dict = self._get_default_custom_kwargs()
            default_kwarg_values = expectation_kwargs_dict.get(
                "default_kwarg_values", dict()
            )
            domain_keys = expectation_kwargs_dict["domain_kwargs"]
    else:
        default_kwarg_values = expectation_kwargs_dict.get(
            "default_kwarg_values", dict()
        )
        domain_keys = expectation_kwargs_dict["domain_kwargs"]

    domain_kwargs = {
        key: self.kwargs.get(key, default_kwarg_values.get(key))
        for key in domain_keys
    }
    missing_kwargs = set(domain_keys) - set(domain_kwargs.keys())
    if missing_kwargs:
        raise InvalidExpectationKwargsError(
            f"Missing domain kwargs: {list(missing_kwargs)}"
        )
    return domain_kwargs
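# A short commented sketch of the two accessors above (assumed standard
# usage, mirroring the methods themselves): domain kwargs locate *where*
# an expectation applies; success kwargs add the parameters that decide
# *whether* it passes, and always include the domain kwargs.
#
# config = ExpectationConfiguration(
#     expectation_type="expect_column_values_to_be_in_set",
#     kwargs={"column": "a", "value_set": [1, 2, 3], "mostly": 0.9},
# )
# assert config.get_domain_kwargs()["column"] == "a"
# assert config.get_success_kwargs()["mostly"] == 0.9
# assert config.get_success_kwargs()["column"] == "a"  # domain kwargs carry over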
def test_populate_dependencies():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )
        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    assert len(graph.edges) == 10
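# Note on the assertion above: each call to build_metric_dependency_graph()
# walks one metric's dependencies recursively and records them as edges of
# the shared ValidationGraph; for this z-score configuration the resulting
# dependency tree flattens to exactly 10 edges.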
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )
        # A nonexistent metric name must surface as a MetricProviderError
        # when the dependency graph is built.
        caught_exception = None
        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            caught_exception = e
        assert isinstance(caught_exception, ge_exceptions.MetricProviderError)
def test_resolve_validation_graph_with_bad_config_catch_exceptions_true(
    basic_datasource,
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }
        )
    )

    # The column "not_in_table" does not exist in the batch, so every
    # attempt to compute the max metric must fail.
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    # With catch_exceptions=True, failures are collected into
    # aborted_metrics_info rather than raised.
    runtime_configuration = {
        "catch_exceptions": True,
        "result_format": {"result_format": "BASIC"},
    }

    execution_engine = PandasExecutionEngine()
    validator = Validator(execution_engine=execution_engine, batches=[batch])

    expectation_impl = get_expectation_impl(expectation_configuration.expectation_type)
    validation_dependencies = expectation_impl().get_validation_dependencies(
        expectation_configuration, execution_engine, runtime_configuration
    )["metrics"]

    graph = ValidationGraph()
    for metric_configuration in validation_dependencies.values():
        validator.build_metric_dependency_graph(
            graph=graph,
            execution_engine=execution_engine,
            metric_configuration=metric_configuration,
            configuration=expectation_configuration,
            runtime_configuration=runtime_configuration,
        )

    metrics: Dict[Tuple[str, str, str], Any] = {}
    aborted_metrics_info: Dict[
        Tuple[str, str, str],
        Dict[str, Union[MetricConfiguration, Set[ExceptionInfo], int]],
    ] = validator.resolve_validation_graph(
        graph=graph,
        metrics=metrics,
        runtime_configuration=runtime_configuration,
    )

    assert len(aborted_metrics_info) == 1

    aborted_metric_info_item = list(aborted_metrics_info.values())[0]
    assert aborted_metric_info_item["num_failures"] == MAX_METRIC_COMPUTATION_RETRIES
    assert len(aborted_metric_info_item["exception_info"]) == 1

    exception_info = next(iter(aborted_metric_info_item["exception_info"]))
    assert (
        exception_info["exception_message"]
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )
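# The shape asserted above, inferred from the test's type annotation and
# assertions (not from library documentation):
#
# aborted_metrics_info = {
#     (metric_name, metric_domain_kwargs_id, metric_value_kwargs_id): {
#         "metric_configuration": <MetricConfiguration>,
#         "num_failures": MAX_METRIC_COMPUTATION_RETRIES,
#         "exception_info": {<ExceptionInfo>, ...},
#     },
# }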
def _check_linting(expectation_instance) -> ExpectationDiagnosticCheckMessage:
    """Check if linting checks pass for Expectation"""
    sub_messages: List[dict] = []
    message: str = "Passes all linting checks"
    passed: bool = False
    black_ok: bool = False
    isort_ok: bool = False
    file_and_class_names_ok: bool = False

    rx_expectation_instance_repr = re.compile(r"<.*\.([^\.]*) object at .*")

    try:
        expectation_camel_name = rx_expectation_instance_repr.match(
            repr(expectation_instance)
        ).group(1)
    except AttributeError:
        sub_messages.append(
            {
                "message": "Arg passed to _check_linting was not an instance of an Expectation, so cannot check linting",
                "passed": False,
            }
        )
        return ExpectationDiagnosticCheckMessage(
            message=message,
            passed=passed,
            sub_messages=sub_messages,
        )

    impl = get_expectation_impl(camel_to_snake(expectation_camel_name))
    try:
        source_file_path = inspect.getfile(impl)
    except TypeError:
        sub_messages.append(
            {
                "message": "inspect.getfile(impl) raised a TypeError (impl is a built-in class)",
                "passed": False,
            }
        )
        return ExpectationDiagnosticCheckMessage(
            message=message,
            passed=passed,
            sub_messages=sub_messages,
        )

    snaked_impl_name = camel_to_snake(impl.__name__)
    source_file_base_no_ext = os.path.basename(source_file_path).rsplit(".", 1)[0]

    with open(source_file_path) as fp:
        code = fp.read()

    if snaked_impl_name != source_file_base_no_ext:
        sub_messages.append(
            {
                "message": f"The snake_case of {impl.__name__} ({snaked_impl_name}) does not match filename part ({source_file_base_no_ext})",
                "passed": False,
            }
        )
    else:
        file_and_class_names_ok = True

    if black is None:
        sub_messages.append(
            {
                "message": "Could not find 'black', so cannot check linting",
                "passed": False,
            }
        )
    if isort is None:
        sub_messages.append(
            {
                "message": "Could not find 'isort', so cannot check linting",
                "passed": False,
            }
        )

    if black and isort:
        blacked_code = lint_code(code)
        if code != blacked_code:
            sub_messages.append(
                {
                    "message": "Your code would be reformatted with black",
                    "passed": False,
                }
            )
        else:
            black_ok = True

        isort_ok = isort.check_code(
            code,
            **isort.profiles.black,
            ignore_whitespace=True,
            known_local_folder=["great_expectations"],
        )
        if not isort_ok:
            sub_messages.append(
                {
                    "message": "Your code would be reformatted with isort",
                    "passed": False,
                }
            )

    passed = black_ok and isort_ok and file_and_class_names_ok
    return ExpectationDiagnosticCheckMessage(
        message=message,
        passed=passed,
        sub_messages=sub_messages,
    )
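# For reference, a minimal stand-in for the lint_code() helper used above.
# This is a sketch of an assumed implementation, not the project's own:
# it formats source with black's programmatic API and returns the input
# unchanged when black cannot parse it.
import black

def lint_code_sketch(code: str) -> str:
    try:
        # black.format_str() returns the reformatted source as a string.
        return black.format_str(code, mode=black.Mode())
    except black.InvalidInput:
        # Unparseable source: hand it back untouched so the caller's
        # `code != blacked_code` comparison simply reports no change.
        return code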
def _get_expectation_impl(self):
    return get_expectation_impl(self.expectation_type)
def test_registry_basics():
    expectation = get_expectation_impl("expect_column_values_to_be_in_set")
    assert expectation == ExpectColumnValuesToBeInSet
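# A sketch of the failure path of the registry lookup exercised above.
# The exception class is an assumption about great_expectations' public
# exceptions module; the happy path is what the test already asserts.
import pytest
from great_expectations.exceptions import ExpectationNotFoundError

def test_registry_miss_sketch():
    # Names that were never registered should not resolve to a class.
    with pytest.raises(ExpectationNotFoundError):
        get_expectation_impl("expect_nothing_in_particular")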