def test_build_suite_when_suite_already_exists(mock_emit, cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented
    # for usage_statistics is ExpectationSuite's add_expectation(). It will not send a
    # usage_stats event when called from a Profiler. This number can change in the future
    # if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []

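# The tests in this module rely on the get_set_of_columns_and_expectations_from_suite
# helper, which is defined elsewhere in the test module. As a rough, hedged sketch of
# its contract (the real helper may differ in detail), it walks suite.expectations and
# returns the set of columns that received expectations and the set of expectation
# types present in the suite:
#
# def get_set_of_columns_and_expectations_from_suite(suite):
#     columns = {
#         expectation.kwargs.get("column")
#         for expectation in suite.expectations
#         if expectation.kwargs.get("column") is not None
#     }
#     expectation_types = {
#         expectation.expectation_type for expectation in suite.expectations
#     }
#     return columns, expectation_types
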
def test_config_with_not_null_only(
    titanic_data_context_modular_api, nulls_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    validator = nulls_validator

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations

def test_build_suite_with_config_and_no_semantic_types_dict(
    titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29

def test_profiler_all_expectation_types_spark(
    titanic_data_context_modular_api,
    taxi_validator_spark,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for spark
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_spark,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        # TODO: Add primary_or_compound_key test
        # primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        # ],
    )

    assert profiler.column_info.get("rate_code_id")

    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        suite = profiler.build_suite()

    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        results = context.run_validation_operator(
            "action_list_operator", assets_to_validate=[taxi_validator_spark]
        )

    assert results["success"]

def test_build_suite_with_config_and_no_semantic_types_dict(
    mock_emit, titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29

    assert mock_emit.call_count == 1
    assert "expectation_suite.add_expectation" not in [
        mock_emit.call_args_list[0][0][0]["event"]
    ]

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events

def test_config_with_not_null_only(
    titanic_data_context_modular_api, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )
    validator = get_pandas_runtime_validator(titanic_data_context_modular_api, df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations

def test_profiler_all_expectation_types_pandas(
    titanic_data_context_modular_api,
    taxi_validator_pandas,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for pandas
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 41
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_column_values_to_be_between",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_pandas]
    )
    assert results["success"]

def test_build_suite_when_suite_already_exists(cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_dataset,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 33

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented
    # for usage_statistics is ExpectationSuite's add_expectation(). It will not send a
    # usage_stats event when called from a Profiler. This number can change in the future
    # if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []

def test_build_suite_with_semantic_types_dict(
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

def test_error_handling_for_expect_compound_columns_to_be_unique(
    taxi_validator_pandas, taxi_data_ignored_columns, caplog
):
    # TODO: When this expectation is implemented for V3, remove this test and test for this expectation
    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()
    log_warnings = caplog.messages
    assert len(log_warnings) == 1
    assert (
        log_warnings[0]
        == "expect_compound_columns_to_be_unique is not currently available in the V3 (Batch Request) API. Specifying a compound key will not add any expectations. This will be updated when that expectation becomes available."
    )

    assert len(suite.expectations) == 2

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }
    assert expected_expectations == expectations_from_suite

def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for sqlalchemy
    """
    if taxi_validator_sqlalchemy is None:
        pytest.skip("a message")

    context = titanic_data_context_modular_api

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]

    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        # TODO: Add primary_or_compound_key test
        # primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        # ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_sqlalchemy]
    )
    assert results["success"]

def test_profiler_all_expectation_types(
    titanic_data_context, possible_expectations_set
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    context = titanic_data_context

    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_trip_data_samples/yellow_trip_data_sample_2019-01.csv",
        )
    )
    batch_df = ge.dataset.PandasDataset(df)

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch_df]
    )
    assert results["success"]

def test_notebook_execution_onboarding_data_assistant_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:
    - create a suite using User-Configurable Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
            }
        ),
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "column_set": [
                        "Age",
                        "Name",
                        "PClass",
                        "Sex",
                        "SexCode",
                        "Survived",
                        "Unnamed: 0",
                    ],
                    "exact_match": None,
                },
                "expectation_type": "expect_table_columns_to_match_set",
                "meta": {"profiler_details": {"success_ratio": 1.0}},
            }
        ),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value, (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_set",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations

def test_notebook_execution_with_pandas_backend(titanic_data_context_no_data_docs):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (Note this will raise various errors like CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is not
    # part of this test.
    context = titanic_data_context_no_data_docs
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [
        {
            "module_name": "great_expectations.datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "module_name": "great_expectations.dataset",
                "class_name": "PandasDataset",
            },
            "batch_kwargs_generators": {
                "mygenerator": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": "../data",
                }
            },
            "name": "mydatasource",
        }
    ]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(
        titanic_data_context_no_data_docs, suite, batch_kwargs
    )
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)
    assert suite.expectations
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations

def test_expect_compound_columns_to_be_unique(
    taxi_validator_spark, taxi_data_ignored_columns, caplog
):
    """
    Until all ExecutionEngine implementations for V3 are completed for this expectation:
    1) Use the "taxi_validator_" argument for this test method, corresponding to one of the
       ExecutionEngine subclasses, for which this expectation has not yet been implemented
       (and update the :param annotation below accordingly);
    2) With every additional ExecutionEngine implementation for this expectation, update the
       corresponding "test_profiler_all_expectation_types_" test method to include this
       expectation in the appropriate assertion.
    3) Once this expectation has been implemented for all ExecutionEngine subclasses, delete
       this test method entirely.

    :param taxi_validator_spark:
    :param taxi_data_ignored_columns:
    :param caplog:
    :return:
    """
    taxi_validator = taxi_validator_spark

    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warning_records = list(
        filter(lambda record: record.levelname == "WARNING", caplog.records)
    )
    assert len(log_warning_records) == 0
    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_compound_columns_to_be_unique",
    }
    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)
    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }
    assert expected_expectations == expectations_from_suite

def test_build_suite_when_suite_already_exists(
    mock_emit,
    cardinality_validator,
):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_validator,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    assert mock_emit.call_count == 2

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events

def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """
    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it
    # will not send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": True,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events