def test_notebook_execution_with_pandas_backend(titanic_data_context_no_data_docs):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (note that this will raise various errors, such as
      CellExecutionError, if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is
    # not part of this test.
    context = titanic_data_context_no_data_docs
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [
        {
            "module_name": "great_expectations.datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "module_name": "great_expectations.dataset",
                "class_name": "PandasDataset",
            },
            "batch_kwargs_generators": {
                "mygenerator": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": "../data",
                }
            },
            "name": "mydatasource",
        }
    ]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(
        titanic_data_context_no_data_docs, suite, batch_kwargs
    )
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect the executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 3,
        "successful_expectations": 3,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)
    assert suite.expectations
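
# NOTE: two of the tests below call a ``run_notebook`` helper that is defined
# elsewhere in the test suite. The sketch below is an assumption about its
# behavior, inferred from the call sites and from the inline execution pattern
# in the test above -- it is illustrative, not the actual implementation: it
# optionally patches a string out of the notebook source (e.g. to keep Data
# Docs from opening), then executes every cell.
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


def run_notebook(
    notebook_path: str,
    notebook_dir: str,
    string_to_be_replaced: str,
    replacement_string: str,
) -> None:
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Rewrite any code cell containing the target fragment, e.g. to strip out
    # ``context.open_data_docs(...)`` so no browser window opens during tests.
    for cell in nb.cells:
        if cell.cell_type == "code" and string_to_be_replaced in cell.source:
            cell.source = cell.source.replace(
                string_to_be_replaced, replacement_string
            )

    # Raises CellExecutionError if any cell in the notebook fails.
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": notebook_dir}})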

def test_notebook_execution_rule_based_profiler_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
    bobby_columnar_table_multi_batch,
):
    """
    To set this test up we:
    - create a suite using the Rule-Based Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (note that this will raise various errors, such as
      CellExecutionError, if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": [
                        "pipeline_stage_name",
                        "airflow_run_id",
                    ],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]
    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Load the Rule-Based Profiler config from the fixture
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or
    # restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through
    # `instantiate_class_from_config`, so we need to manually remove those
    # values if we wish to use the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=context,
    )

    profiler_name: str = "bobby_user_workflow"
    context.save_profiler(
        profiler=profiler,
        name=profiler_name,
    )

    # Create notebook
    # We do not want to actually send a usage message, since this function
    # call is not the result of actual usage.
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=profiler_name,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 13,
        "successful_expectations": 13,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "expectation_type": "expect_table_row_count_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Unnamed: 0"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Unnamed: 0"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1313,
                    "min_value": 1313,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Age"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 0.17,
                    "min_value": 0.17,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Age"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 71.0,
                    "min_value": 71.0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Survived"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "Survived"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "SexCode"},
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {"column": "SexCode"},
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "PClass",
                    "value_set": [
                        "*",
                        "1st",
                        "2nd",
                        "3rd",
                    ],
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "Sex", "value_set": ["female", "male"]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "Survived", "value_set": [0, 1]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {"column": "SexCode", "value_set": [0, 1]},
                "expectation_type": "expect_column_values_to_be_in_set",
            }
        ),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value, (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations
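
# The comparison above depends on normalizing list-like expectation kwargs
# (e.g. ``value_set``) to sorted lists, so that ordering differences between
# profiler output and the expected fixtures do not cause spurious mismatches.
# A standalone sketch of that normalization (illustrative only):
from typing import Any, Dict


def normalize_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    # Sort any list/set/tuple value into a plain list; leave scalars alone.
    return {
        key: sorted(value) if isinstance(value, (list, set, tuple)) else value
        for key, value in kwargs.items()
    }


assert normalize_kwargs({"value_set": {1, 0}, "column": "Sex"}) == {
    "value_set": [0, 1],
    "column": "Sex",
}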

def test_cli_datasource_profile_with_additional_batch_kwargs(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--additional-batch-kwargs",
            '{"reader_options": {"sep": ",", "parse_dates": [0]}}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0

    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1

    expected_suite_name = "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert suites[0].expectation_suite_name == expected_suite_name

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert validation.meta["expectation_suite_name"] == expected_suite_name
    assert validation.success is False
    assert len(validation.results) == 9

    batch_id = validation_keys[0].batch_identifier
    evr = context.get_validation_result(
        expectation_suite_name=expected_suite_name, batch_identifier=batch_id
    )
    reader_options = evr.meta["batch_kwargs"]["reader_options"]
    assert reader_options["parse_dates"] == [0]
    assert reader_options["sep"] == ","

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
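
# Illustration of what ``reader_options`` mean for a pandas-backed datasource:
# they are forwarded to the pandas reader, so the JSON passed via
# --additional-batch-kwargs above ends up as ``pd.read_csv`` keyword
# arguments. This is a hedged, self-contained sketch (the inline CSV data is
# made up for the example), not a call into Great Expectations itself.
import io

import pandas as pd

reader_options = {"sep": ",", "parse_dates": [0]}
csv_data = io.StringIO("date,value\n2021-01-01,1\n2021-01-02,2\n")
df = pd.read_csv(csv_data, **reader_options)

# ``parse_dates=[0]`` parsed column 0 into datetimes.
assert pd.api.types.is_datetime64_any_dtype(df["date"])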

def test_notebook_execution_onboarding_data_assistant_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:
    - create a suite using the Onboarding Data Assistant
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (note that this will raise various errors, such as
      CellExecutionError, if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"],
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": [
                        "pipeline_stage_name",
                        "airflow_run_id",
                    ],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]
    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Create notebook
    # We do not want to actually send a usage message, since this function
    # call is not the result of actual usage.
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
            }
        ),
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "column_set": [
                        "Age",
                        "Name",
                        "PClass",
                        "Sex",
                        "SexCode",
                        "Survived",
                        "Unnamed: 0",
                    ],
                    "exact_match": None,
                },
                "expectation_type": "expect_table_columns_to_match_set",
                "meta": {"profiler_details": {"success_ratio": 1.0}},
            }
        ),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value, (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_set",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations