def test_config_variables_in_test_yaml_config(
    mock_emit, empty_data_context_stats_enabled, sa
):
    context: DataContext = empty_data_context_stats_enabled

    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    context.save_config_variable("db_file", db_file)
    context.save_config_variable(
        "data_connector_name", "my_very_awesome_data_connector"
    )
    context.save_config_variable("suffix", "__whole_table")
    context.save_config_variable("sampling_n", "10")

    print(context.config_variables)

    first_config = """
class_name: SimpleSqlalchemyDatasource
connection_string: sqlite:///${db_file}
introspection:
    ${data_connector_name}:
        data_asset_name_suffix: ${suffix}
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: ${sampling_n}
"""
    my_datasource = context.test_yaml_config(first_config)
    assert (
        "test_cases_for_sql_data_connector.db"
        in my_datasource.execution_engine.connection_string
    )

    assert mock_emit.call_count == 1
    # Substitute anonymized names, since they change on every run
    anonymized_datasource_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]
    anonymized_data_connector_name = mock_emit.call_args_list[0][0][0][
        "event_payload"
    ]["anonymized_data_connectors"][0]["anonymized_name"]
    expected_test_yaml_config_event = mock.call(
        {
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_datasource_name,
                "parent_class": "SimpleSqlalchemyDatasource",
                "anonymized_execution_engine": {
                    "parent_class": "SqlAlchemyExecutionEngine"
                },
                "anonymized_data_connectors": [
                    {
                        "anonymized_name": anonymized_data_connector_name,
                        "parent_class": "InferredAssetSqlDataConnector",
                    }
                ],
            },
            "success": True,
        }
    )
    expected_call_args_list = [expected_test_yaml_config_event]
    assert mock_emit.call_args_list == expected_call_args_list

    report_object = context.test_yaml_config(first_config, return_mode="report_object")
    print(json.dumps(report_object, indent=2))
    assert report_object["data_connectors"]["count"] == 1
    assert set(report_object["data_connectors"].keys()) == {
        "count",
        "my_very_awesome_data_connector",
    }

    # The second test_yaml_config call emits an identical usage-stats event.
    assert mock_emit.call_count == 2
    expected_call_args_list.append(expected_test_yaml_config_event)
    assert mock_emit.call_args_list == expected_call_args_list
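
# A minimal sketch (not part of the test suite) of how the variables saved via
# save_config_variable above can be inspected on disk. The
# uncommitted/config_variables.yml path follows Great Expectations' default
# project layout; the helper name is ours.
def print_saved_config_variables(context: DataContext) -> None:
    config_variables_path = os.path.join(
        context.root_directory, "uncommitted", "config_variables.yml"
    )
    with open(config_variables_path) as f:
        # Should list db_file, data_connector_name, suffix, and sampling_n.
        print(f.read())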
def test_DataContext_raises_error_on_missing_config_version_aka_version_zero_with_v2_config():
    local_dir = file_relative_path(
        __file__, os.path.join(BASE_DIR, "version_2-0_but_no_version_defined")
    )
    with pytest.raises(ge_exceptions.InvalidDataContextConfigError):
        DataContext(local_dir)
def test_project_upgrade_with_exception(v10_project_directory, caplog):
    # test project upgrade that requires manual steps

    # copy v2 yml
    shutil.copy(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/great_expectations_v1_basic_with_exception.yml",
        ),
        os.path.join(v10_project_directory, "great_expectations.yml"),
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["project", "upgrade", "-d", v10_project_directory],
        input="\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/test_project_upgrade_with_exception_expected_stdout.fixture",
        )
    ) as f:
        expected_stdout = f.read()
    expected_stdout = expected_stdout.replace("GE_PROJECT_DIR", v10_project_directory)
    assert stdout == expected_stdout

    expected_project_tree_str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
        validations/
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str = gen_directory_tree_str(v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number not incremented
    assert (
        DataContext.get_ge_config_version(context_root_dir=v10_project_directory) == 1
    )

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/UpgradeHelperV11_basic_upgrade_with_exception_log.json",
        )
    ) as f:
        expected_upgrade_log_dict = json.load(f)
    expected_upgrade_log_str = json.dumps(expected_upgrade_log_dict)
    expected_upgrade_log_str = expected_upgrade_log_str.replace(
        "GE_PROJECT_DIR", v10_project_directory
    )
    expected_upgrade_log_str = expected_upgrade_log_str.replace(
        "GE_PATH", os.path.split(great_expectations.__file__)[0]
    )
    expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
        f"{v10_project_directory}/uncommitted/logs/project_upgrades/"
        f"UpgradeHelperV11_20190926T134241.000000Z.json"
    ) as f:
        obs_upgrade_log_dict = json.load(f)
    obs_upgrade_log_dict["exceptions"][0]["exception_message"] = ""

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
def test_cli_init_on_existing_project_with_no_uncommitted_dirs_answering_yes_to_fixing_them(
    mock_webbrowser,
    caplog,
    tmp_path_factory,
):
    """
    This test walks through the onboarding experience.

    The user just checked an existing project out of source control and does
    not yet have an uncommitted directory.
    """
    root_dir = tmp_path_factory.mktemp("hiya")
    root_dir = str(root_dir)
    os.makedirs(os.path.join(root_dir, "data"))
    data_folder_path = os.path.join(root_dir, "data")
    data_path = os.path.join(root_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(
        __file__, os.path.join("..", "test_sets", "Titanic.csv")
    )
    shutil.copy(fixture_path, data_path)

    # Create a new project from scratch that we will use for the test in the next step
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", root_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(data_folder_path, data_path),
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            root_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
    assert "Great Expectations is now set up." in stdout

    context = DataContext(os.path.join(root_dir, DataContext.GE_DIR))
    uncommitted_dir = os.path.join(context.root_directory, "uncommitted")
    shutil.rmtree(uncommitted_dir)
    assert not os.path.isdir(uncommitted_dir)

    # Test the second invocation of init
    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli, ["init", "-d", root_dir], input="Y\nn\n", catch_exceptions=False
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Great Expectations added some missing files required to run." in stdout
    assert "You may see new files in" in stdout
    assert "OK. You must run" not in stdout
    assert "great_expectations init" not in stdout
    assert "to fix the missing files!" not in stdout
    assert "Would you like to build & view this project's Data Docs!?" in stdout
    assert os.path.isdir(uncommitted_dir)
    config_var_path = os.path.join(uncommitted_dir, "config_variables.yml")
    assert os.path.isfile(config_var_path)
    with open(config_var_path) as f:
        assert f.read() == CONFIG_VARIABLES_TEMPLATE

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_DataContext_raises_error_on_config_not_found():
    local_dir = file_relative_path(__file__, os.path.join(BASE_DIR, ""))
    with pytest.raises(ge_exceptions.ConfigNotFoundError):
        DataContext(local_dir)
def test_cli_init_on_new_project(
    mock_emit, mock_webbrowser, caplog, tmp_path_factory, monkeypatch
):
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    os.makedirs(os.path.join(project_dir, "data"))
    data_folder_path = os.path.join(project_dir, "data")
    data_path = os.path.join(project_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(data_folder_path, data_path),
        catch_exceptions=False,
    )
    stdout = result.output

    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert len(stdout) < 6000, "CLI output is unreasonably long."
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "What are you processing your files with" in stdout
    assert (
        "Enter the path of a data file (relative or absolute, s3a:// and gs:// paths are ok too)"
        in stdout
    )
    assert "Name the new Expectation Suite [Titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Done generating example Expectation Suite" in stdout
    assert "Great Expectations is now set up" in stdout

    assert os.path.isdir(os.path.join(project_dir, "great_expectations"))
    config_path = os.path.join(
        project_dir, "great_expectations/great_expectations.yml"
    )
    assert os.path.isfile(config_path)
    with open(config_path) as f:
        config = yaml.load(f)
    data_source_class = config["datasources"]["data__dir"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "PandasDataset"

    obs_tree = gen_directory_tree_str(os.path.join(project_dir, "great_expectations"))

    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", obs_tree
    )
    # print(guid_safe_obs_tree)
    assert (
        guid_safe_obs_tree
        == """great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
    expectations/
        Titanic/
            warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    Titanic/
                        warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    Titanic/
                        warning/
                            20190926T134241.000000Z/
                                20190926T134241.000000Z/
                                    foobarbazguid.html
        validations/
            Titanic/
                warning/
                    20190926T134241.000000Z/
                        20190926T134241.000000Z/
                            foobarbazguid.json
"""
    )

    assert mock_emit.call_count == 9
    assert mock_emit.call_args_list[1] == mock.call(
        {"event_payload": {}, "event": "cli.init.create", "success": True}
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)
def titanic_dataset():
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    batch_df = PandasDataset(df)
    return batch_df
def taxicab_context():
    return DataContext(
        context_root_dir=file_relative_path(
            __file__, "./configs/great_expectations_taxicab_context.yml"
        )
    )
def test_file_format_map_output():
    incomplete_file_path = file_relative_path(
        __file__, "../test_sets/toy_data_incomplete.csv"
    )
    incomplete_file_dat = ge.data_asset.FileDataAsset(incomplete_file_path)
    null_file_path = file_relative_path(__file__, "../test_sets/null_file.csv")
    null_file_dat = ge.data_asset.FileDataAsset(null_file_path)
    white_space_path = file_relative_path(__file__, "../test_sets/white_space.txt")
    white_space_dat = ge.data_asset.FileDataAsset(white_space_path)

    # Boolean Expectation Output
    expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BOOLEAN_ONLY",
        include_config=False,
    )
    expected_result = ExpectationValidationResult(success=False)
    assert expected_result == expectation

    # Empty File Expectations
    expectation = null_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        include_config=False,
    )
    expected_result = ExpectationValidationResult(
        success=None,
        result={
            "element_count": 0,
            "missing_count": 0,
            "missing_percent": None,
            "unexpected_count": 0,
            "unexpected_percent": None,
            "unexpected_percent_nonmissing": None,
            "partial_unexpected_list": [],
        },
    )
    assert expected_result == expectation

    # White Space File
    expectation = white_space_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S", expected_count=3, result_format="BASIC", include_config=False
    )
    expected_result = ExpectationValidationResult(
        success=None,
        result={
            "element_count": 11,
            "missing_count": 11,
            "missing_percent": 100.0,
            "unexpected_count": 0,
            "unexpected_percent": 0,
            "unexpected_percent_nonmissing": None,
            "partial_unexpected_list": [],
        },
    )
    assert expected_result == expectation

    # Complete Result Format
    expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="COMPLETE",
        include_config=False,
    )
    expected_result = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 9,
            "missing_count": 2,
            "missing_percent": (2 / 9 * 100),
            "unexpected_count": 3,
            "unexpected_percent": (3 / 9 * 100),
            "unexpected_percent_nonmissing": (3 / 7 * 100),
            "partial_unexpected_list": ["A,C,1\n", "B,1,4\n", "A,1,4\n"],
            "partial_unexpected_counts": [
                {"value": "A,1,4\n", "count": 1},
                {"value": "A,C,1\n", "count": 1},
                {"value": "B,1,4\n", "count": 1},
            ],
            "partial_unexpected_index_list": [0, 3, 5],
            "unexpected_list": ["A,C,1\n", "B,1,4\n", "A,1,4\n"],
            "unexpected_index_list": [0, 3, 5],
        },
    )
    assert expected_result == expectation

    # Invalid Result Format
    with pytest.raises(ValueError):
        expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
            regex=r",\S",
            expected_count=3,
            skip=1,
            result_format="JOKE",
            include_config=False,
        )
def bobby_columnar_table_multi_batch():
    """
    Multi-batch fixture for the "bobby" user workflow: loads the verbose
    Rule-Based Profiler configuration and builds the expectation suite that the
    oneshot sampling method is expected to produce for the taxi data.
    """
    verbose_profiler_config_file_path: str = file_relative_path(
        __file__, "bobby_user_workflow_verbose_profiler_config.yml"
    )
    verbose_profiler_config: str
    with open(verbose_profiler_config_file_path) as f:
        verbose_profiler_config = f.read()

    my_row_count_range_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"min_value": 7505, "max_value": 8495},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "table.row_count",
                            "domain_kwargs": {},
                        },
                        "num_batches": 2,
                    },
                },
            },
        ),
    ]

    # Bounds for the column-range rule, as
    # (column, min-expectation bounds, max-expectation bounds),
    # where each pair of bounds is (min_value, max_value).
    column_range_bounds: List[tuple] = [
        ("VendorID", (1, 1), (4, 4)),
        ("passenger_count", (0, 1), (6, 6)),
        ("trip_distance", (0.0, 0.0), (37.62, 57.85)),
        ("RatecodeID", (1, 1), (5, 6)),
        ("PULocationID", (1, 1), (265, 265)),
        ("DOLocationID", (1, 1), (265, 265)),
        ("payment_type", (1, 1), (4, 4)),
        ("fare_amount", (-51.84, -21.16), (228.94, 2990.05)),
        ("extra", (-36.53, -1.18), (4.51, 6.99)),
        ("mta_tax", (-0.5, -0.5), (0.69, 37.32)),
        ("tip_amount", (0.0, 0.0), (46.84, 74.86)),
        ("tolls_amount", (0.0, 0.0), (26.4, 497.67)),
        ("improvement_surcharge", (-0.3, -0.3), (0.3, 0.3)),
        ("total_amount", (-52.66, -24.44), (550.18, 2992.47)),
        ("congestion_surcharge", (-2.49, -0.01), (0.01, 2.49)),
    ]
    # Generate one expect_column_min_to_be_between and one
    # expect_column_max_to_be_between configuration per column, in order.
    my_column_ranges_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            expectation_type=expectation_type,
            meta={
                "profiler_details": {
                    "metric_configuration": {
                        "metric_name": metric_name,
                        "domain_kwargs": {"column": column},
                    },
                    "num_batches": 2,
                }
            },
            kwargs={
                "column": column,
                "min_value": min_value,
                "max_value": max_value,
                "mostly": 1.0,
            },
        )
        for column, min_bounds, max_bounds in column_range_bounds
        for expectation_type, metric_name, (min_value, max_value) in (
            ("expect_column_min_to_be_between", "column.min", min_bounds),
            ("expect_column_max_to_be_between", "column.max", max_bounds),
        )
    ]

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configurations.extend(
        my_row_count_range_rule_expectation_configurations_oneshot_sampling_method
    )
    expectation_configurations.extend(
        my_column_ranges_rule_expectation_configurations_oneshot_sampling_method
    )

    expectation_suite_name_oneshot_sampling_method: str = (
        "bobby_columnar_table_multi_batch_oneshot_sampling_method"
    )
    expected_expectation_suite_oneshot_sampling_method: ExpectationSuite = (
        ExpectationSuite(
            expectation_suite_name=expectation_suite_name_oneshot_sampling_method
        )
    )
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in expectation_configurations:
        expected_expectation_suite_oneshot_sampling_method.add_expectation(
            expectation_configuration
        )

    yaml = YAML()
    profiler_config: dict = yaml.load(verbose_profiler_config)
    expected_expectation_suite_oneshot_sampling_method.add_citation(
        comment="Suite created by Rule-Based Profiler with the configuration included.",
        profiler_config=profiler_config,
    )

    return {
        "profiler_config": verbose_profiler_config,
        "test_configuration_oneshot_sampling_method": {
            "expectation_suite_name": expectation_suite_name_oneshot_sampling_method,
            "expected_expectation_suite": expected_expectation_suite_oneshot_sampling_method,
        },
    }
def test_validate():
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        results = my_df.validate(catch_exceptions=False)

    with open(
        file_relative_path(
            __file__, "./test_sets/titanic_expected_data_asset_validate_results.json"
        )
    ) as f:
        expected_results = expectationSuiteValidationResultSchema.loads(f.read())

    del results.meta["great_expectations.__version__"]
    assert expected_results == results

    # Now, change the results and ensure they are no longer equal
    results.results[0] = ExpectationValidationResult()
    assert expected_results != results

    # Finally, confirm that only_return_failures works
    # and does not affect the "statistics" field.
    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        validation_results = my_df.validate(only_return_failures=True)
    del validation_results.meta["great_expectations.__version__"]
    expected_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "titanic",
            "run_id": "19551105T000000.000000Z",
            "batch_kwargs": {"ge_batch_id": "1234"},
            "batch_markers": {},
            "batch_parameters": {},
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_in_set",
                    kwargs={"column": "PClass", "value_set": ["1st", "2nd", "3rd"]},
                ),
                success=False,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "partial_unexpected_index_list": [456],
                    "unexpected_count": 1,
                    "unexpected_list": ["*"],
                    "unexpected_percent": 0.07616146230007616,
                    "element_count": 1313,
                    "missing_percent": 0.0,
                    "partial_unexpected_counts": [{"count": 1, "value": "*"}],
                    "partial_unexpected_list": ["*"],
                    "unexpected_percent_nonmissing": 0.07616146230007616,
                    "missing_count": 0,
                    "unexpected_index_list": [456],
                },
            )
        ],
        success=expected_results.success,  # unaffected
        statistics=expected_results["statistics"],  # unaffected
    )
    assert expected_results == validation_results
def __init__(self, requirements_relative_base_dir: str = "../../../") -> None:
    self._requirements_relative_base_dir = file_relative_path(
        __file__, requirements_relative_base_dir
    )
    self._dev_requirements_prefix: str = "requirements-dev"
import os
import shutil
import subprocess
import sys

import pytest
from assets.scripts.build_gallery import execute_shell_command

from great_expectations.data_context.util import file_relative_path

integration_test_matrix = [
    {
        "name": "pandas_two_batch_requests_two_validators",
        "base_dir": file_relative_path(__file__, "../../"),
        "data_context_dir": "tests/integration/fixtures/yellow_trip_data_pandas_fixture/great_expectations",
        "data_dir": "tests/test_sets/taxi_yellow_trip_data_samples",
        "user_flow_script": "tests/integration/fixtures/yellow_trip_data_pandas_fixture/two_batch_requests_two_validators.py",
        "expected_stderrs": "",
        "expected_stdouts": "",
    },
]


def idfn(test_configuration):
    return test_configuration.get("name")


@pytest.mark.docs
@pytest.mark.integration
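# The parametrized runner that these marks decorate is not included in this
# excerpt. A hedged sketch of its likely shape (body and names illustrative
# only); idfn is passed as the ids callable so each case is labeled by "name":
@pytest.mark.parametrize("test_configuration", integration_test_matrix, ids=idfn)
def test_integration_matrix(test_configuration, tmp_path):
    # A real implementation would copy data_context_dir and data_dir into a
    # scratch directory, run user_flow_script via execute_shell_command, and
    # compare output against expected_stdouts / expected_stderrs.
    ...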
def test_snowflake_key_pair_credentials(mock_prompt, basic_sqlalchemy_datasource):
    database_key_path_pass = file_relative_path(
        __file__, "../../test_fixtures/database_key_test.p8"
    )

    mock_prompt.side_effect = [
        "3",
        "user",
        "ABCD.us-east-1",
        "default_db",
        "default_schema",
        "xsmall",
        "public",
        database_key_path_pass,
        "test123",
    ]
    credentials = _collect_snowflake_credentials(None)

    assert credentials == {
        "drivername": "snowflake",
        "database": "default_db",
        "host": "ABCD.us-east-1",
        "private_key_path": database_key_path_pass,
        "private_key_passphrase": "test123",
        "query": {
            "role": "public",
            "schema": "default_schema",
            "warehouse": "xsmall",
        },
        "username": "user",
    }

    # making sure with the correct params the key is read correctly
    basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
        "snowflake", deepcopy(credentials)
    )

    # check that with a bad passphrase an informative message is returned to the user
    credentials["private_key_passphrase"] = "bad_pass"
    with pytest.raises(DatasourceKeyPairAuthBadPassphraseError) as e:
        basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
            "snowflake", deepcopy(credentials)
        )
    assert "passphrase incorrect" in e.value.message

    # check that with no passphrase the key is read correctly
    database_key_path_no_pass = file_relative_path(
        __file__, "../../test_fixtures/database_key_test_no_pass.p8"
    )
    credentials["private_key_path"] = database_key_path_no_pass
    credentials["private_key_passphrase"] = ""
    (
        sqlalchemy_uri,
        create_engine_kwargs,
    ) = basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
        "snowflake", deepcopy(credentials)
    )
    assert (
        str(sqlalchemy_uri)
        == "snowflake://user@ABCD.us-east-1/default_db?role=public&schema=default_schema&warehouse=xsmall"
    )
    # check that the private_key is not empty
    assert create_engine_kwargs.get("connect_args", {}).get("private_key", "")
def test_v2_to_v3_project_upgrade_without_manual_steps(
    v20_project_directory_with_v30_configuration_and_no_checkpoints, caplog
):
    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        [
            "-c",
            v20_project_directory_with_v30_configuration_and_no_checkpoints,
            "--v3-api",
            "project",
            "upgrade",
        ],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/test_v2_to_v3_project_upgrade_without_manual_steps_expected_stdout.fixture",
        )
    ) as f:
        expected_stdout: str = f.read()
    expected_stdout = expected_stdout.replace(
        "GE_PROJECT_DIR",
        v20_project_directory_with_v30_configuration_and_no_checkpoints,
    )
    assert stdout == expected_stdout

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            styles/
                data_docs_custom_styles.css
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV13_20210119T132639.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(
        startpath=v20_project_directory_with_v30_configuration_and_no_checkpoints
    )
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number incremented
    assert (
        DataContext.get_ge_config_version(
            context_root_dir=v20_project_directory_with_v30_configuration_and_no_checkpoints
        )
        == 3.0
    )

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/UpgradeHelperV13_upgrade_without_manual_steps_log.json",
        )
    ) as f:
        expected_upgrade_log_dict: dict = json.load(f)
    expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
    expected_upgrade_log_str = expected_upgrade_log_str.replace(
        "GE_PROJECT_DIR",
        v20_project_directory_with_v30_configuration_and_no_checkpoints,
    )
    expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
        f"{v20_project_directory_with_v30_configuration_and_no_checkpoints}/uncommitted/logs/project_upgrades/UpgradeHelperV13_20210119T132639.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
def test_expectation_suite_filedata_asset():
    # Load in data files
    file_path = file_relative_path(__file__, "../test_sets/toy_data_complete.csv")

    # Create FileDataAsset objects
    f_dat = ge.data_asset.FileDataAsset(file_path)

    # Set up expectations
    f_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        catch_exceptions=True,
    )
    f_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S",
        expected_max_count=2,
        skip=1,
        result_format="SUMMARY",
        include_config=True,
    )

    # Test basic config output
    complete_config = f_dat.get_expectation_suite()
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs=ExpectationKwargs(expected_count=3, regex=",\\S", skip=1),
        )
    ] == complete_config.expectations

    # Include result format kwargs
    complete_config2 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=False
    )
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_be_between",
            kwargs={
                "expected_max_count": 2,
                "regex": ",\\S",
                "result_format": "SUMMARY",
                "skip": 1,
            },
        ),
    ] == complete_config2.expectations

    # Discard Failing Expectations
    complete_config3 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=True
    )
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        )
    ] == complete_config3.expectations
def _load_script_template() -> str:
    with open(file_relative_path(__file__, "checkpoint_script_template.py")) as f:
        template = f.read()
    return template
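
# A hedged sketch (helper name ours) of the kind of caller that uses the
# template loaded above: write it out verbatim as a runnable checkpoint script.
def _write_checkpoint_script(script_path: str) -> None:
    template = _load_script_template()
    with open(script_path, "w") as f:
        f.write(template)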
def test_pandas_source_read_csv(
    data_context_parameterized_expectation_suite, tmp_path_factory
):
    basedir = tmp_path_factory.mktemp("test_create_pandas_datasource")
    shutil.copy(file_relative_path(__file__, "../test_sets/unicode.csv"), basedir)
    data_context_parameterized_expectation_suite.add_datasource(
        "mysource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        reader_options={"encoding": "utf-8"},
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
            }
        },
    )

    data_context_parameterized_expectation_suite.create_expectation_suite(
        expectation_suite_name="unicode"
    )
    batch = data_context_parameterized_expectation_suite.get_batch(
        data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource", "subdir_reader", "unicode"
        ),
        "unicode",
    )
    assert len(batch["Μ"]) == 1
    assert "😁" in list(batch["Μ"])

    data_context_parameterized_expectation_suite.add_datasource(
        "mysource2",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
            }
        },
    )
    batch = data_context_parameterized_expectation_suite.get_batch(
        data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource2", "subdir_reader", "unicode"
        ),
        "unicode",
    )
    assert "😁" in list(batch["Μ"])

    data_context_parameterized_expectation_suite.add_datasource(
        "mysource3",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
                "reader_options": {"encoding": "utf-16"},
            }
        },
    )
    with pytest.raises(UnicodeError, match="UTF-16 stream does not start with BOM"):
        batch = data_context_parameterized_expectation_suite.get_batch(
            data_context_parameterized_expectation_suite.build_batch_kwargs(
                "mysource3", "subdir_reader", "unicode"
            ),
            "unicode",
        )

    with pytest.raises(LookupError, match="unknown encoding: blarg"):
        batch_kwargs = data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource3", "subdir_reader", "unicode"
        )
        batch_kwargs.update({"reader_options": {"encoding": "blarg"}})
        batch = data_context_parameterized_expectation_suite.get_batch(
            batch_kwargs=batch_kwargs, expectation_suite_name="unicode"
        )

    with pytest.raises(LookupError, match="unknown encoding: blarg"):
        batch = data_context_parameterized_expectation_suite.get_batch(
            expectation_suite_name="unicode",
            batch_kwargs=data_context_parameterized_expectation_suite.build_batch_kwargs(
                "mysource",
                "subdir_reader",
                "unicode",
                reader_options={"encoding": "blarg"},
            ),
        )

    batch = data_context_parameterized_expectation_suite.get_batch(
        batch_kwargs=data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource2",
            "subdir_reader",
            "unicode",
            reader_options={"encoding": "utf-8"},
        ),
        expectation_suite_name="unicode",
    )
    assert "😁" in list(batch["Μ"])
def copy_relative_path(relative_src, dest):
    shutil.copy(file_relative_path(__file__, relative_src), dest)
def test_expect_file_line_regex_match_count_to_be_between():
    # Invalid File Path
    joke_file_path = "joke.txt"
    assert not os.path.isfile(joke_file_path)
    joke_dat = ge.data_asset.FileDataAsset(joke_file_path)
    with pytest.raises(IOError):
        joke_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=4, skip=1
        )

    complete_file_path = file_relative_path(
        __file__, "../test_sets/toy_data_complete.csv"
    )
    file_dat = ge.data_asset.FileDataAsset(complete_file_path)

    # Invalid Skip Parameter
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=4, skip=2.4
        )

    # Invalid Regex
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=2, expected_min_count=1, expected_max_count=8, skip=2
        )

    # Non-integer min value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=1.3, expected_max_count=8, skip=1
        )

    # Negative min value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=-2, expected_max_count=8, skip=1
        )

    # Non-integer max value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count="foo", skip=1
        )

    # Negative max value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=-1, skip=1
        )

    # Min count more than max count
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=4, expected_max_count=3, skip=1
        )

    # Count does not fall in range
    fail_trial = file_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S", expected_min_count=9, expected_max_count=12, skip=1
    )
    assert not fail_trial.success
    assert fail_trial.result["unexpected_percent"] == 100
    assert fail_trial.result["missing_percent"] == 0

    # Count does fall in range
    success_trial = file_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S", expected_min_count=0, expected_max_count=4, skip=1
    )
    assert success_trial.success
    assert success_trial.result["unexpected_percent"] == 0
    assert success_trial.result["missing_percent"] == 0
def test_profiler_all_expectation_types(
    titanic_data_context, possible_expectations_set
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected.
    """
    context = titanic_data_context
    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01.csv",
        )
    )
    batch_df = ge.dataset.PandasDataset(df)

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch_df]
    )
    assert results["success"]
def test_validate_distribution_parameters(self):
    D = ge.read_csv(
        file_relative_path(
            __file__, "../test_sets/fixed_distributional_test_dataset.csv"
        )
    )

    # Each case is (description, column, expectation kwargs); every one of them
    # passes invalid distribution parameters and must raise a ValueError.
    invalid_parameter_cases = [
        # ------ p_value ------
        ("p_value is 0", "norm",
         dict(distribution="norm", params=[0, 1], p_value=0)),
        ("p_value is negative", "norm",
         dict(distribution="norm", params=[0, 1], p_value=-0.1)),
        ("p_value is 1", "norm",
         dict(distribution="norm", params=[0, 1], p_value=1)),
        ("p_value is greater than 1", "norm",
         dict(distribution="norm", params=[0, 1], p_value=1.1)),
        ("params is None", "norm",
         dict(distribution="norm", params=None)),
        # ------ std_dev ------
        ("std_dev is 0, dict", "norm",
         dict(distribution="norm", params={"mean": 0, "std_dev": 0})),
        ("std_dev is negative, dict", "norm",
         dict(distribution="norm", params={"mean": 0, "std_dev": -1})),
        ("std_dev is 0, list", "norm",
         dict(distribution="norm", params=[0, 0])),
        ("std_dev is negative, list", "norm",
         dict(distribution="norm", params=[0, -1])),
        # ------ beta ------
        ("beta, alpha is 0, dict params", "beta",
         dict(distribution="beta", params={"alpha": 0, "beta": 0.1})),
        ("beta, alpha is negative, dict params", "beta",
         dict(distribution="beta", params={"alpha": -1, "beta": 0.1})),
        ("beta, beta is 0, dict params", "beta",
         dict(distribution="beta", params={"alpha": 0.1, "beta": 0})),
        ("beta, beta is negative, dict params", "beta",
         dict(distribution="beta", params={"alpha": 0, "beta": -1})),
        ("beta, alpha is 0, list params", "beta",
         dict(distribution="beta", params=[0, 0.1])),
        ("beta, alpha is negative, list params", "beta",
         dict(distribution="beta", params=[-1, 0.1])),
        ("beta, beta is 0, list params", "beta",
         dict(distribution="beta", params=[0.1, 0])),
        ("beta, beta is negative, list params", "beta",
         dict(distribution="beta", params=[0.1, -1])),
        ("beta, missing alpha, dict", "beta",
         dict(distribution="beta", params={"beta": 0.1})),
        ("beta, missing beta, dict", "beta",
         dict(distribution="beta", params={"alpha": 0.1})),
        ("beta, missing beta, list", "beta",
         dict(distribution="beta", params=[1])),
        ("beta, too many arguments, list", "beta",
         dict(distribution="beta", params=[1, 1, 1, 1, 1])),
        # ------ gamma ------
        ("gamma, alpha is 0, dict", "gamma",
         dict(distribution="gamma", params={"alpha": 0})),
        ("gamma, alpha is negative, dict", "gamma",
         dict(distribution="gamma", params={"alpha": -1})),
        ("gamma, alpha is 0, dict", "gamma",
         dict(distribution="gamma", params={"alpha": 0})),
        ("gamma, alpha is missing, dict", "gamma",
         dict(distribution="gamma", params={})),
        ("gamma, alpha is missing, list", "gamma",
         dict(distribution="gamma", params=[])),
        ("gamma, alpha is 0, list", "gamma",
         dict(distribution="gamma", params=[0])),
        ("gamma, alpha is negative, list", "gamma",
         dict(distribution="gamma", params=[-1])),
        ("gamma, too many arguments, list", "gamma",
         dict(distribution="gamma", params=[1, 1, 1, 1])),
        # ------ chi2 ------
        ("chi2, df is 0, dict", "chi2",
         dict(distribution="chi2", params={"df": 0})),
        ("chi2, df is negative, dict", "chi2",
         dict(distribution="chi2", params={"df": -1})),
        ("chi2, df is missing, dict", "chi2",
         dict(distribution="chi2", params={})),
        ("chi2, df is 0, list", "chi2",
         dict(distribution="chi2", params=[0])),
        ("chi2, df is negative, list", "chi2",
         dict(distribution="chi2", params=[-1])),
        ("chi2, df is missing, list", "chi2",
         dict(distribution="chi2", params=[])),
        ("chi2, too many parameters, list", "chi2",
         dict(distribution="chi2", params=[1, 1, 1, 5])),
        # ------ norm ------
        ("norm, too many arguments, list", "norm",
         dict(distribution="norm", params=[0, 1, 500])),
        # ------ uniform ------
        ("uniform, scale is 0, list", "uniform",
         dict(distribution="uniform", params=[0, 0])),
        ("uniform, scale is negative, list", "uniform",
         dict(distribution="uniform", params=[0, -1])),
        ("uniform, scale is negative, dict", "uniform",
         dict(distribution="uniform", params={"loc": 0, "scale": -1})),
        ("uniform, scale is 0, dict", "uniform",
         dict(distribution="uniform", params={"loc": 0, "scale": 0})),
        ("uniform, too many parameters, list", "uniform",
         dict(distribution="uniform", params=[0, 1, 500])),
        # ------ expon ------
        ("expon, scale is 0, list", "exponential",
         dict(distribution="expon", params=[0, 0])),
        ("expon, scale is negative, list", "exponential",
         dict(distribution="expon", params=[0, -1])),
        ("expon, scale is 0, dict", "exponential",
         dict(distribution="expon", params={"loc": 0, "scale": 0})),
        ("expon, scale is negative, dict", "exponential",
         dict(distribution="expon", params={"loc": 0, "scale": -1})),
        ("expon, too many parameters, list", "exponential",
         dict(distribution="expon", params=[0, 1, 500])),
    ]
    for description, column, expectation_kwargs in invalid_parameter_cases:
        with self.assertRaises(ValueError, msg=description):
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                column, **expectation_kwargs
            )

    # ------ misc ------
    with self.assertRaises(AttributeError):
        # non-supported distribution
        D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
            "exponential", distribution="fakedistribution", params=[0, 1]
        )
def test_get_batch_list_from_new_style_datasource_with_file_system_datasource_configured_assets(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_get_batch_list_from_new_style_datasource_with_file_system_datasource_configured_assets"
        )
    )
    titanic_asset_base_directory_path: str = os.path.join(base_directory, "data")
    os.makedirs(titanic_asset_base_directory_path)

    titanic_csv_source_file_path: str = file_relative_path(
        __file__, "../test_sets/Titanic.csv"
    )
    titanic_csv_destination_file_path: str = str(
        os.path.join(base_directory, "data/Titanic_19120414_1313.csv")
    )
    shutil.copy(titanic_csv_source_file_path, titanic_csv_destination_file_path)

    config = yaml.load(
        f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_data_connector:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: {base_directory}
        glob_directive: "*.csv"

        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - name

        assets:
            Titanic:
                base_directory: {titanic_asset_base_directory_path}
                pattern: (.+)_(\\d+)_(\\d+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - size
""",
    )

    context.add_datasource(
        "my_datasource",
        **config,
    )

    batch_request: Union[dict, BatchRequest] = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_data_connector",
        "data_asset_name": "Titanic",
        "partition_request": {
            "batch_identifiers": {
                "name": "Titanic",
                "timestamp": "19120414",
                "size": "1313",
            }
        },
    }
    batch_list: List[Batch] = context.get_batch_list(**batch_request)
    assert len(batch_list) == 1

    batch: Batch = batch_list[0]
    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == "Titanic"
    assert batch.batch_definition["partition_definition"] == {
        "name": "Titanic",
        "timestamp": "19120414",
        "size": "1313",
    }
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.data.dataframe.shape == (1313, 7)
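
# For reference, the plain-dict batch_request above could equivalently be
# expressed with the BatchRequest class annotated in the test. A sketch,
# assuming the 0.13-era keyword names this test already uses:
equivalent_batch_request = BatchRequest(
    datasource_name="my_datasource",
    data_connector_name="my_data_connector",
    data_asset_name="Titanic",
    partition_request={
        "batch_identifiers": {
            "name": "Titanic",
            "timestamp": "19120414",
            "size": "1313",
        }
    },
)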
def test_infer_distribution_parameters(self):
    D = ge.read_csv(
        file_relative_path(
            __file__, "../test_sets/fixed_distributional_test_dataset.csv"
        )
    )

    with self.assertRaises(TypeError):
        ge.dataset.util.infer_distribution_parameters(
            data=D.norm, distribution="norm", params=["wrong_param_format"]
        )

    # norm
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.norm_std, distribution="norm", params=None
    )
    self.assertEqual(t["mean"], D.norm_std.mean())
    self.assertEqual(t["std_dev"], D.norm_std.std())
    self.assertEqual(t["loc"], 0)
    self.assertEqual(t["scale"], 1)

    # beta
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.beta, distribution="beta"
    )
    self.assertEqual(
        t["alpha"],
        (t["mean"] ** 2) * (((1 - t["mean"]) / t["std_dev"] ** 2) - (1 / t["mean"])),
        "beta dist, alpha infer",
    )
    self.assertEqual(
        t["beta"], t["alpha"] * ((1 / t["mean"]) - 1), "beta dist, beta infer"
    )

    # gamma
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.gamma, distribution="gamma"
    )
    self.assertEqual(t["alpha"], D.gamma.mean())

    # uniform distributions
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.uniform, distribution="uniform"
    )
    self.assertEqual(t["min"], min(D.uniform), "uniform, min infer")
    self.assertEqual(
        t["max"], max(D.uniform) - min(D.uniform), "uniform, max infer"
    )

    uni_loc = 5
    uni_scale = 10
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.uniform,
        distribution="uniform",
        params={"loc": uni_loc, "scale": uni_scale},
    )
    self.assertEqual(t["min"], uni_loc, "uniform, min infer")
    self.assertEqual(t["max"], uni_scale, "uniform, max infer")

    # unsupported distribution
    with self.assertRaises(AttributeError):
        ge.dataset.util.infer_distribution_parameters(
            data=D.norm, distribution="fakedistribution"
        )

    # chi2
    t = ge.dataset.util.infer_distribution_parameters(
        data=D.chi2, distribution="chi2"
    )
    self.assertEqual(t["df"], D.chi2.mean())
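
# A standalone numeric sanity check (synthetic data, not part of the test
# suite) of the method-of-moments formulas asserted above for the beta
# distribution: alpha = mean^2 * ((1 - mean) / var - 1 / mean) and
# beta = alpha * (1 / mean - 1).
import numpy as np

_rng = np.random.default_rng(0)
_sample = _rng.beta(a=2.0, b=5.0, size=100_000)
_mean, _var = _sample.mean(), _sample.var()
_alpha = (_mean**2) * (((1 - _mean) / _var) - (1 / _mean))
_beta = _alpha * ((1 / _mean) - 1)
print(round(_alpha, 1), round(_beta, 1))  # expect approximately 2.0 and 5.0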
def test_DataContext_raises_error_on_unparsable_yaml_file():
    local_dir = file_relative_path(__file__, os.path.join(BASE_DIR, "bad_yml"))
    with pytest.raises(ge_exceptions.InvalidConfigurationYamlError):
        DataContext(local_dir)
def test_basic_project_upgrade(v10_project_directory, caplog):
    # test project upgrade that requires no manual steps
    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        ["-c", v10_project_directory, "--v3-api", "project", "upgrade"],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/test_basic_project_upgrade_expected_stdout.fixture",
        )
    ) as f:
        expected_stdout: str = f.read()
    expected_stdout = expected_stdout.replace("GE_PROJECT_DIR", v10_project_directory)
    assert stdout == expected_stdout

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
                UpgradeHelperV13_20190926T134241.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(startpath=v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number incremented
    assert (
        DataContext.get_ge_config_version(context_root_dir=v10_project_directory)
        == 3.0
    )

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/UpgradeHelperV11_basic_upgrade_log.json",
        )
    ) as f:
        expected_upgrade_log_dict: dict = json.load(f)
    expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
    expected_upgrade_log_str = expected_upgrade_log_str.replace(
        "GE_PROJECT_DIR", v10_project_directory
    )
    expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
        f"{v10_project_directory}/uncommitted/logs/project_upgrades/UpgradeHelperV11_20190926T134241.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
def test_cli_init_on_new_project_with_broken_excel_file_try_again_with_different_file(
    mock_webbrowser, caplog, tmp_path_factory
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    os.makedirs(os.path.join(project_dir, "data"))

    data_path = os.path.join(project_dir, "data", "broken_excel_file.xls")
    fixture_path = file_relative_path(__file__, "../test_sets/broken_excel_file.xls")
    data_path_2 = os.path.join(project_dir, "data", "Titanic.csv")
    fixture_path_2 = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)
    shutil.copy(fixture_path_2, data_path_2)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n1\n1\n{}\n\n{}\n".format(data_path, data_path_2),
        catch_exceptions=False,
    )
    stdout = result.output

    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert len(stdout) < 3000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "What are you processing your files with" in stdout
    assert "Enter the path (relative or absolute) of a data file" in stdout
    assert "Cannot load file." in stdout
    assert (
        "- Please check the file and try again or select a different data file."
        in stdout
    )
    assert (
        "- Error: Unsupported format, or corrupt file: Expected BOF record; found b'PRODUCTI'"
        in stdout
    )
    assert "Try again? [Y/n]:" in stdout
    assert "[{}]:".format(data_path) in stdout
    assert "Name the new expectation suite [Titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert (
        "A new Expectation suite 'Titanic.warning' was added to your project" in stdout
    )
    assert "Great Expectations is now set up" in stdout

    assert os.path.isdir(os.path.join(project_dir, "great_expectations"))
    config_path = os.path.join(
        project_dir, "great_expectations/great_expectations.yml"
    )
    assert os.path.isfile(config_path)

    config = yaml.load(open(config_path, "r"))
    data_source_class = config["datasources"]["files_datasource"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "PandasDataset"

    obs_tree = gen_directory_tree_str(os.path.join(project_dir, "great_expectations"))

    # Instead of monkey patching datetime, just regex out the time directories
    date_safe_obs_tree = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", obs_tree)
    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", date_safe_obs_tree
    )
    assert (
        guid_safe_obs_tree
        == """great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        Titanic/
            warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    Titanic/
                        warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    Titanic/
                        warning/
                            9999.9999/
                                foobarbazguid.html
        validations/
            Titanic/
                warning/
                    9999.9999/
                        foobarbazguid.json
"""
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)
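
# The two substitutions above normalize run-specific names so the directory
# tree can be compared verbatim. A standalone sketch of the same normalization
# (hypothetical helper and sample path, using the exact patterns from the test):
def _normalize_tree_fragment(fragment: str) -> str:
    fragment = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", fragment)  # run timestamps
    return re.sub(r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", fragment)  # guids


# _normalize_tree_fragment(
#     "warning/20200430T191246.763896Z/c3b4c5df224fef4b1a056a0f3b93aba5.html"
# ) == "warning/9999.9999/foobarbazguid.html"
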
def test_project_upgrade_with_manual_steps(
    v10_project_directory, caplog, sa, postgresql_engine
):
    # Test a project upgrade that requires manual steps. This test requires
    # sqlalchemy because the project config includes database store backends.

    # copy v2 yml
    shutil.copy(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/great_expectations_v1_needs_manual_upgrade.yml",
        ),
        os.path.join(v10_project_directory, "great_expectations.yml"),
    )

    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        ["-c", v10_project_directory, "--v3-api", "project", "upgrade"],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/test_project_upgrade_with_manual_steps_expected_stdout.fixture",
        )
    ) as f:
        expected_stdout: str = f.read()
    expected_stdout = expected_stdout.replace("GE_PROJECT_DIR", v10_project_directory)
    assert stdout == expected_stdout

    pycache_dir_path: str = os.path.join(
        v10_project_directory, "plugins", "custom_store_backends", "__pycache__"
    )
    try:
        shutil.rmtree(pycache_dir_path)
    except FileNotFoundError:
        pass

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(startpath=v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number not incremented
    assert (
        DataContext.get_ge_config_version(context_root_dir=v10_project_directory)
        == 1.0
    )

    with open(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/UpgradeHelperV11_manual_steps_upgrade_log.json",
        )
    ) as f:
        expected_upgrade_log_dict: dict = json.load(f)
    expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
    expected_upgrade_log_str = expected_upgrade_log_str.replace(
        "GE_PROJECT_DIR", v10_project_directory
    )
    expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
        f"{v10_project_directory}/uncommitted/logs/project_upgrades/UpgradeHelperV11_20190926T134241.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
def copy_static_assets(self, static_assets_source_dir=None):
    """
    Copies static assets, using a special "static_assets" backend store that accepts
    variable-length tuples as keys, with no filepath_template.
    """
    file_exclusions = [".DS_Store"]
    dir_exclusions = []
    if not static_assets_source_dir:
        static_assets_source_dir = file_relative_path(
            __file__, os.path.join("..", "..", "render", "view", "static")
        )

    # If `static_assets_source_dir` contains the string ".zip", try to extract
    # (unzip) the static files. If the unzipping succeeds, Great Expectations is
    # installed into a zip file (see PEP 273), and this function must be run
    # again against the extracted directory.
    if ".zip" in static_assets_source_dir.lower():
        unzip_destdir = tempfile.mkdtemp()
        unzipped_ok = self._unzip_assets(static_assets_source_dir, unzip_destdir)
        if unzipped_ok:
            return self.copy_static_assets(unzip_destdir)

    for item in os.listdir(static_assets_source_dir):
        # Directory
        if os.path.isdir(os.path.join(static_assets_source_dir, item)):
            if item in dir_exclusions:
                continue
            # Recurse
            new_source_dir = os.path.join(static_assets_source_dir, item)
            self.copy_static_assets(new_source_dir)
        # File
        else:
            # Copy the file over using the static assets store backend
            if item in file_exclusions:
                continue
            source_name = os.path.join(static_assets_source_dir, item)
            with open(source_name, "rb") as f:
                # Only use path elements starting from static/ for the key
                store_key = tuple(os.path.normpath(source_name).split(os.sep))
                store_key = store_key[store_key.index("static") :]

                content_type, content_encoding = guess_type(item, strict=False)

                if content_type is None:
                    # Use a GE-known content type if possible
                    if source_name.endswith(".otf"):
                        content_type = "font/opentype"
                    else:
                        # fallback
                        logger.warning(
                            "Unable to automatically determine content_type for {}".format(
                                source_name
                            )
                        )
                        content_type = "text/html; charset=utf8"

                if not isinstance(
                    self.store_backends["static_assets"], GeCloudStoreBackend
                ):
                    self.store_backends["static_assets"].set(
                        store_key,
                        f.read(),
                        content_encoding=content_encoding,
                        content_type=content_type,
                    )
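
# The store key built above is simply the path tuple rooted at the "static"
# directory, which keeps keys stable regardless of where the package is
# installed. A standalone sketch (hypothetical helper, not part of the store API):
def _static_asset_store_key(source_name: str) -> tuple:
    parts = tuple(os.path.normpath(source_name).split(os.sep))
    return parts[parts.index("static") :]


# _static_asset_store_key("/opt/ge/render/view/static/styles/data_docs.css")
# -> ("static", "styles", "data_docs.css")  # on a POSIX filesystem
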
def test_requirements_files():
    """requirements.txt should be a subset of requirements-dev.txt"""
    with open(file_relative_path(__file__, "../requirements.txt")) as req:
        requirements = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev.txt")) as req:
        requirements_dev = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-util.txt")) as req:
        requirements_dev_util = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-spark.txt")) as req:
        requirements_dev_spark = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(
        file_relative_path(__file__, "../requirements-dev-sqlalchemy.txt")
    ) as req:
        requirements_dev_sqlalchemy = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-test.txt")) as req:
        requirements_dev_test = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-build.txt")) as req:
        requirements_dev_build = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-publish.txt")) as req:
        requirements_dev_publish = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    assert requirements <= requirements_dev

    assert requirements_dev_util.intersection(requirements_dev_spark) == set()
    assert requirements_dev_util.intersection(requirements_dev_sqlalchemy) == set()
    assert requirements_dev_util.intersection(requirements_dev_test) == set()
    assert requirements_dev_util.intersection(requirements_dev_build) == set()

    assert requirements_dev_spark.intersection(requirements_dev_sqlalchemy) == set()
    assert requirements_dev_spark.intersection(requirements_dev_test) == set()
    assert requirements_dev_spark.intersection(requirements_dev_build) == set()

    assert requirements_dev_sqlalchemy.intersection(requirements_dev_test) == set()
    assert requirements_dev_sqlalchemy.intersection(requirements_dev_build) == set()

    assert requirements_dev_test.intersection(requirements_dev_build) == set()

    assert requirements_dev_publish.intersection(requirements_dev_test) == set()
    assert requirements_dev_publish.intersection(requirements_dev_build) == set()

    assert (
        requirements_dev
        - (
            requirements
            | requirements_dev_util
            | requirements_dev_sqlalchemy
            | requirements_dev_spark
            | requirements_dev_test
            | requirements_dev_build
            | requirements_dev_publish
        )
        == set()
    )
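
# The repeated open-parse-set pattern above could be collapsed into a single
# helper; a minimal sketch (hypothetical, assuming the same `rp.parse` interface
# and pin format used in the test):
def _parse_pins(filename: str) -> set:
    with open(file_relative_path(__file__, filename)) as req:
        return {f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)}


# requirements = _parse_pins("../requirements.txt")
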