def test_spark_expect_column_value_z_scores_to_be_less_than_impl(
    spark_session, basic_spark_df_execution_engine
):
    """End-to-end validation of expect_column_value_z_scores_to_be_less_than on Spark.

    Builds a small pandas frame, converts it to a Spark DataFrame, loads it into
    the Spark execution engine, and asserts the expectation validates successfully.
    """
    pandas_df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",
            # This directive does not appear to have any effect.
        }
    )
    spark_df = spark.createDataFrame(pandas_df)

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)

    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=spark_df)

    validation_result = expectation.validate(Validator(execution_engine=engine))
    assert validation_result == ExpectationValidationResult(success=True)
def test_sa_expect_column_value_z_scores_to_be_less_than_impl(postgresql_engine):
    """End-to-end validation of expect_column_value_z_scores_to_be_less_than on SQLAlchemy.

    Writes a small frame to a Postgres table, wires it up as a batch on a
    SqlAlchemyExecutionEngine, and asserts the expectation validates successfully.
    """
    test_df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    test_df.to_sql(
        name="z_score_test_data",
        con=postgresql_engine,
        index=False,
        if_exists="replace",
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)

    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine)
    engine.load_batch_data(
        "my_id",
        SqlAlchemyBatchData(execution_engine=engine, table_name="z_score_test_data"),
    )

    validation_result = expectation.validate(Validator(execution_engine=engine))
    assert validation_result == ExpectationValidationResult(success=True)
def test_parse_validation_graph():
    """Check that _parse_validation_graph splits metrics into ready vs. needed.

    Builds the dependency graph for a single z-score expectation against a
    PandasExecutionEngine and verifies the ready/needed partition sizes.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)

    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )

    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict()
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
def test_populate_dependencies():
    """Check that building the metric dependency graph yields the expected edge count.

    Populates a ValidationGraph for a single z-score expectation and asserts
    the resulting number of graph edges.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    batch = Batch(data=df)

    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        for metric_configuration in validation_dependencies["metrics"].values():
            # NOTE(review): positional argument order kept as-is — verify it
            # matches the keyword order used elsewhere in this file.
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph, metric_configuration, configuration, execution_engine=engine
            )

    assert len(graph.edges) == 10
def test_expect_column_value_z_scores_to_be_less_than_impl():
    """End-to-end validation of expect_column_value_z_scores_to_be_less_than on pandas.

    Loads a small frame directly into a PandasExecutionEngine and asserts the
    expectation validates successfully.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)

    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    validation_result = expectation.validate(Validator(execution_engine=engine))
    assert validation_result == ExpectationValidationResult(success=True)
def test_graph_validate_with_runtime_config(basic_datasource):
    """Check graph_validate with a COMPLETE result_format runtime configuration.

    Validates a z-score expectation against a runtime batch and asserts the
    full (unsuccessful) validation result, including the detailed result dict.
    """
    df = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10, 2, 3], "b": [97, 332, 3, 4, 5, 6, 7, None]}
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={"column": "b", "mostly": 1, "threshold": 2, "double_sided": True},
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            batch_data=df,
            partition_request=PartitionRequest(
                partition_identifiers={
                    "pipeline_stage_name": 0,
                    "run_id": 0,
                    "custom_key_0": 0,
                }
            ),
        )
    )

    # If graph_validate raises internally, capture the AssertionError so the
    # final comparison below reports it instead of aborting the test early.
    try:
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=(batch,)
        ).graph_validate(
            configurations=[expectation_configuration],
            runtime_configuration={"result_format": "COMPLETE"},
        )
    except AssertionError as e:
        result = e

    assert result == [
        ExpectationValidationResult(
            success=False,
            meta={},
            result={
                "element_count": 8,
                "unexpected_count": 1,
                "unexpected_percent": 12.5,
                "partial_unexpected_list": [332.0],
                "missing_count": 1,
                "missing_percent": 12.5,
                "unexpected_percent_nonmissing": 14.285714285714285,
                "partial_unexpected_index_list": None,
                "partial_unexpected_counts": [{"value": 332.0, "count": 1}],
                "unexpected_list": [332.0],
                "unexpected_index_list": None,
            },
            expectation_config=None,
            exception_info=None,
        )
    ]
def test_populate_dependencies_with_incorrect_metric_name():
    """Check that an unknown metric name raises MetricProviderError.

    Attempts to build a dependency graph for a nonexistent metric
    ("column_values.not_a_metric") and asserts the expected exception type.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)

    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        # The bogus metric should make graph construction fail; capture the
        # error object so its type can be asserted below.
        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)