def test_validate_dataset(dataset, basic_expectation_suite):
    """Validate against the basic suite, then confirm that a matching
    data_asset_class succeeds while a mismatched one raises ValueError."""
    mismatch = r"The validate util method only supports validation for subtypes of the provided data_asset_type"

    res = ge.validate(dataset, basic_expectation_suite)
    assert res["success"] is True
    assert res["statistics"]["evaluated_expectations"] == 4

    if isinstance(dataset, ge.dataset.PandasDataset):
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.PandasDataset,
        )
        assert res["success"] is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                basic_expectation_suite,
                data_asset_class=ge.dataset.SqlAlchemyDataset,
            )
    elif isinstance(dataset, ge.dataset.SqlAlchemyDataset):
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.SqlAlchemyDataset,
        )
        assert res["success"] is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                expectation_suite=basic_expectation_suite,
                data_asset_class=ge.dataset.PandasDataset,
            )
    elif isinstance(dataset, ge.dataset.SparkDFDataset):
        res = ge.validate(
            dataset,
            basic_expectation_suite,
            data_asset_class=ge.dataset.SparkDFDataset,
        )
        assert res["success"] is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                expectation_suite=basic_expectation_suite,
                data_asset_class=ge.dataset.PandasDataset,
            )
def test_validate_invalid_parameters(dataset, basic_expectation_suite, data_context):
    """Calling ge.validate with neither a suite nor a context must fail fast."""
    expected = "Either an expectation suite or a DataContext is required for validation."
    with pytest.raises(ValueError, match=expected):
        ge.validate(dataset)
def test_validate_non_dataset(file_data_asset, empty_expectation_suite):
    """Non-dataset assets (e.g. FileDataAsset) are rejected by the validate util."""
    with pytest.raises(
        ValueError,
        match=r"The validate util method only supports dataset validations",
    ):
        ge.validate(
            file_data_asset,
            empty_expectation_suite,
            data_asset_type=ge.data_asset.FileDataAsset,
        )
def test_validate_non_dataset(file_data_asset, empty_expectation_suite):
    """Non-dataset assets are rejected; a missing-version warning is also emitted."""
    with pytest.raises(
        ValueError,
        match=r"The validate util method only supports dataset validations",
    ):
        with pytest.warns(
            Warning,
            match="No great_expectations version found in configuration object.",
        ):
            ge.validate(
                file_data_asset,
                empty_expectation_suite,
                data_asset_class=ge.data_asset.FileDataAsset,
            )
def test_validate_using_data_context(
    dataset, data_context_parameterized_expectation_suite
):
    """Validate via a DataContext; evaluation-parameter dependencies must
    remain uncompiled since compilation now requires an explicit action."""
    context = data_context_parameterized_expectation_suite

    # Before running, the data context should not have compiled parameters.
    assert context._evaluation_parameter_dependencies_compiled is False

    with pytest.warns(
        Warning, match=r"This configuration object was built using version"
    ):
        res = ge.validate(
            dataset,
            expectation_suite_name="my_dag_node.default",
            data_context=context,
        )

    # Handling of evaluation parameters no longer happens without an action,
    # so the context should still be uncompiled after validation.
    assert context._evaluation_parameter_dependencies_compiled is False

    # The context-provided config contains exactly two expectations.
    assert res.success is False
    assert res.statistics["evaluated_expectations"] == 2
def test_validate_using_data_context_path(dataset, data_context):
    """Resolve the named suite from a context root-directory path."""
    root = data_context.root_directory
    res = ge.validate(
        dataset,
        expectation_suite_name="my_dag_node.default",
        data_context=root,
    )
    # We should have now found the right suite with expectations to evaluate.
    assert res.success is False
    assert res["statistics"]["evaluated_expectations"] == 2
def test_validate_using_data_context_path(dataset, data_context):
    """Resolve the suite by data_asset_name from a context root-directory path."""
    root = data_context.root_directory
    res = ge.validate(
        dataset,
        data_asset_name="mydatasource/mygenerator/my_dag_node",
        data_context=root,
    )
    # We should have now found the right suite with expectations to evaluate.
    assert res["success"] is False
    assert res["statistics"]["evaluated_expectations"] == 2
def test_validate_using_data_context_path(dataset, data_context):
    """Resolve the fixture suite by data_asset_name from a context root path."""
    data_context_path = data_context.root_directory
    res = ge.validate(
        dataset,
        data_asset_name="parameterized_expectation_suite_fixture",
        data_context=data_context_path,
    )
    # We should have now found the right suite with expectations to evaluate.
    # Fixed: `== False` replaced with `is False` (PEP 8 E712), matching the
    # identity comparisons used by every sibling test in this file.
    assert res["success"] is False
    assert res["statistics"]["evaluated_expectations"] == 2
def test_validate_using_data_context(dataset, data_context):
    """Validating through a DataContext compiles its parameters as a side effect."""
    # The context starts out uncompiled.
    assert data_context._compiled is False
    res = ge.validate(
        dataset,
        data_asset_name="mydatasource/mygenerator/my_dag_node",
        data_context=data_context,
    )
    # Registering the validation result triggers compilation.
    assert data_context._compiled is True
    # The context-provided config yields exactly two evaluated expectations.
    assert res["success"] is False
    assert res["statistics"]["evaluated_expectations"] == 2
def test_validate_using_data_context(dataset, data_context):
    """Validate via a DataContext; dependencies stay uncompiled without an action."""
    # Before running, the data context should not have compiled parameters.
    assert data_context._evaluation_parameter_dependencies_compiled is False
    res = ge.validate(
        dataset,
        expectation_suite_name="my_dag_node.default",
        data_context=data_context,
    )
    # Evaluation-parameter handling no longer happens without an action, so
    # the context should still be uncompiled after validation.
    assert data_context._evaluation_parameter_dependencies_compiled is False
    # And we should have validated the right number of expectations from the
    # context-provided config.
    assert res.success is False
    assert res.statistics["evaluated_expectations"] == 2
def test_validate_using_data_context_path(
    dataset, data_context_parameterized_expectation_suite
):
    """Resolve the named suite from a context root-directory path; the stale
    configuration version emits a warning."""
    root = data_context_parameterized_expectation_suite.root_directory
    with pytest.warns(
        Warning, match=r"This configuration object was built using version"
    ):
        res = ge.validate(
            dataset,
            expectation_suite_name="my_dag_node.default",
            data_context=root,
        )
    # We should have now found the right suite with expectations to evaluate.
    assert res.success is False
    assert res["statistics"]["evaluated_expectations"] == 2
def validate(self, df: pd.DataFrame) -> "GEValidationReport":
    """
    Validate the provided dataframe against the GE expectation suite.

    Pipeline:
      1. The pandas dataframe is wrapped in a ``PandasDataset`` (GE type).
      2. Fixes are applied to the data to avoid crashes inside GE
         (see ``_prepare_dataset``).
      3. Every expectation in the ``ExpectationSuite`` instance is tested
         against the resulting dataset.

    Returns a ``GEValidationReport``, which parses great expectation's schema
    into a list of generic ValidationErrors.
    """
    prepared = _prepare_dataset(PandasDataset(df))
    results = ge.validate(
        prepared,
        expectation_suite=self.expectation_suite,
        result_format="COMPLETE",
    )
    return GEValidationReport(results)
def __main__():
    """Entry point: validate the file given on argv against the data context,
    then compute and print model parameters.

    Exit codes: -2 when no filepath argument was given, -1 when validation
    of the loaded dataframe fails.
    """
    run_id = str(uuid.uuid1())
    if len(sys.argv) <= 1:
        print("Please specify a filepath to process.")
        sys.exit(-2)
    df = load_data(sys.argv[1])
    validation_result = ge.validate(
        df,
        data_context=ge.data_context.DataContext('../'),
        data_asset_name="notable_works_by_charles_dickens",
        run_id=run_id,
    )
    # `is False` instead of `== False` (PEP 8 E712).
    if validation_result["success"] is False:
        print("Validation error for run {0:s}".format(str(run_id)))
        sys.exit(-1)
    df = add_columns(df)
    params = compute_model_parameters(df)
    # Bug fix: the original embedded ".format(run_id)" INSIDE the string
    # literal, so the un-interpolated template was printed verbatim.
    print("processed run {run_id}".format(run_id=run_id))
    print(json.dumps(params, indent=2))
def test_validate_dataset(dataset, basic_expectation_suite):
    """Validate the dataset, then exercise data_asset_class matching for each
    backend: pandas, sqlalchemy (with a mysql special case), and spark."""
    mismatch = r"The validate util method only supports validation for subtypes of the provided data_asset_type"

    res = ge.validate(dataset, basic_expectation_suite)
    # assert res.success is True  # will not be true for mysql, where "infinities" column is missing
    assert res["statistics"]["evaluated_expectations"] == 4

    if isinstance(dataset, ge.dataset.PandasDataset):
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.PandasDataset,
        )
        assert res.success is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                basic_expectation_suite,
                data_asset_class=ge.dataset.SqlAlchemyDataset,
            )
    elif isinstance(dataset, ge.dataset.SqlAlchemyDataset):
        res = ge.validate(
            dataset,
            expectation_suite=basic_expectation_suite,
            data_asset_class=ge.dataset.SqlAlchemyDataset,
        )
        if dataset.sql_engine_dialect.name.lower() == "mysql":
            # mysql cannot use the infinities column
            assert res.success is False
        else:
            assert res.success is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                expectation_suite=basic_expectation_suite,
                data_asset_class=ge.dataset.PandasDataset,
            )
    elif isinstance(dataset, ge.dataset.SparkDFDataset):
        res = ge.validate(
            dataset,
            basic_expectation_suite,
            data_asset_class=ge.dataset.SparkDFDataset,
        )
        assert res.success is True
        assert res["statistics"]["evaluated_expectations"] == 4
        with pytest.raises(ValueError, match=mismatch):
            ge.validate(
                dataset,
                expectation_suite=basic_expectation_suite,
                data_asset_class=ge.dataset.PandasDataset,
            )
def test_top_level_validate(self):
    """ge.validate on a raw dataframe with an inline suite returns the full
    validation-result structure."""
    my_df = pd.DataFrame({"x": [1, 2, 3, 4, 5]})
    suite = {
        "dataset_name": None,
        "meta": {"great_expectations.__version__": ge.__version__},
        "expectations": [
            {
                "expectation_type": "expect_column_to_exist",
                "kwargs": {"column": "x"},
            },
            {
                "expectation_type": "expect_column_values_to_be_between",
                "kwargs": {"column": "x", "min_value": 3, "max_value": 5},
            },
        ],
    }
    validation_result = ge.validate(my_df, suite)

    expected = {
        "results": [
            {
                "expectation_config": {
                    "kwargs": {"column": "x"},
                    "expectation_type": "expect_column_to_exist",
                },
                "exception_info": {
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                "success": True,
            },
            {
                "expectation_config": {
                    "expectation_type": "expect_column_values_to_be_between",
                    "kwargs": {"column": "x", "max_value": 5, "min_value": 3},
                },
                "exception_info": {
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                "success": False,
                "result": {
                    "element_count": 5,
                    "missing_count": 0,
                    "missing_percent": 0.0,
                    "unexpected_percent": 0.4,
                    "partial_unexpected_list": [1, 2],
                    "unexpected_percent_nonmissing": 0.4,
                    "unexpected_count": 2,
                },
            },
        ],
        "success": False,
        "statistics": {
            "evaluated_expectations": 2,
            "successful_expectations": 1,
            "unsuccessful_expectations": 1,
            "success_percent": 50,
        },
    }
    self.assertEqual(validation_result, expected)