def test_subdir_reader_file_partitioning(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.csv",
        "20190102__asset_1.csv",
        "20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv",
    ]
    for file in mock_files:
        if "/" in file:
            safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    # If we have files, we should see them as individual assets
    subdir_reader_generator = SubdirReaderGenerator("test_generator", base_directory=base_directory)
    known_assets = subdir_reader_generator.get_available_data_asset_names()
    assert set(known_assets) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
        "asset_2",
    }

    # SubdirReaderGenerator uses the filename as partition name for root files
    known_partitions = subdir_reader_generator.get_available_partition_ids("20190101__asset_1")
    assert set(known_partitions) == {"20190101__asset_1"}
    kwargs = subdir_reader_generator.build_batch_kwargs_from_partition_id(
        "20190101__asset_1", "20190101__asset_1")
    assert kwargs["path"] == os.path.join(base_directory, "20190101__asset_1.csv")
def data_context(tmp_path_factory):
    # This data_context is *manually* created to have the config we want, vs created with DataContext.create
    project_path = str(tmp_path_factory.mktemp('data_context'))
    context_path = os.path.join(project_path, "great_expectations")
    asset_config_path = os.path.join(context_path, "expectations")
    safe_mmkdir(os.path.join(asset_config_path, "mydatasource/mygenerator/my_dag_node"), exist_ok=True)
    shutil.copy("./tests/test_fixtures/great_expectations_basic.yml",
                str(os.path.join(context_path, "great_expectations.yml")))
    shutil.copy(
        "./tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json",
        os.path.join(asset_config_path, "mydatasource/mygenerator/my_dag_node/default.json"))
    safe_mmkdir(os.path.join(context_path, "plugins"))
    shutil.copy(
        "./tests/test_fixtures/custom_pandas_dataset.py",
        str(os.path.join(context_path, "plugins", "custom_pandas_dataset.py")))
    shutil.copy(
        "./tests/test_fixtures/custom_sqlalchemy_dataset.py",
        str(os.path.join(context_path, "plugins", "custom_sqlalchemy_dataset.py")))
    shutil.copy(
        "./tests/test_fixtures/custom_sparkdf_dataset.py",
        str(os.path.join(context_path, "plugins", "custom_sparkdf_dataset.py")))
    return ge.data_context.DataContext(context_path)
def empty_data_context(tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('empty_data_context'))
    context = ge.data_context.DataContext.create(project_path)
    context_path = os.path.join(project_path, "great_expectations")
    asset_config_path = os.path.join(context_path, "expectations")
    safe_mmkdir(asset_config_path, exist_ok=True)
    return context
def __init__(self, base_directory, filepath_template=None, filepath_prefix=None,
             filepath_suffix=None, forbidden_substrings=None,
             platform_specific_separator=True, root_directory=None,
             fixed_length_key=False):
    super().__init__(
        filepath_template=filepath_template,
        filepath_prefix=filepath_prefix,
        filepath_suffix=filepath_suffix,
        forbidden_substrings=forbidden_substrings,
        platform_specific_separator=platform_specific_separator,
        fixed_length_key=fixed_length_key)

    if os.path.isabs(base_directory):
        self.full_base_directory = base_directory
    else:
        if root_directory is None:
            raise ValueError(
                "base_directory must be an absolute path if root_directory is not provided")
        elif not os.path.isabs(root_directory):
            raise ValueError(
                "root_directory must be an absolute path. Got {0} instead.".format(root_directory))
        else:
            self.full_base_directory = os.path.join(root_directory, base_directory)

    safe_mmkdir(str(os.path.dirname(self.full_base_directory)))
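# Hypothetical standalone sketch of the path-resolution rule the __init__ above
# implements (resolve_full_base_directory is illustrative, not a GE API): an
# absolute base_directory is used as-is; a relative one must be joined onto an
# absolute root_directory.
def resolve_full_base_directory(base_directory, root_directory=None):
    """Return the absolute directory the store backend will write under."""
    if os.path.isabs(base_directory):
        return base_directory
    if root_directory is None:
        raise ValueError(
            "base_directory must be an absolute path if root_directory is not provided")
    if not os.path.isabs(root_directory):
        raise ValueError(
            "root_directory must be an absolute path. Got {0} instead.".format(root_directory))
    return os.path.join(root_directory, base_directory)

assert resolve_full_base_directory("/tmp/store") == "/tmp/store"
assert resolve_full_base_directory(
    "uncommitted/validations", "/project/great_expectations"
) == "/project/great_expectations/uncommitted/validations"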
def test_load_config_variables_file(basic_data_context_config, tmp_path_factory):
    # Setup:
    base_path = str(tmp_path_factory.mktemp('test_load_config_variables_file'))
    safe_mmkdir(os.path.join(base_path, "uncommitted"))
    with open(os.path.join(base_path, "uncommitted", "dev_variables.yml"), "w") as outfile:
        yaml.dump({'env': 'dev'}, outfile)
    with open(os.path.join(base_path, "uncommitted", "prod_variables.yml"), "w") as outfile:
        yaml.dump({'env': 'prod'}, outfile)
    basic_data_context_config["config_variables_file_path"] = \
        "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml"

    try:
        # We should be able to load different files based on an environment variable
        os.environ["TEST_CONFIG_FILE_ENV"] = "dev"
        context = BaseDataContext(basic_data_context_config, context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars['env'] == 'dev'
        os.environ["TEST_CONFIG_FILE_ENV"] = "prod"
        context = BaseDataContext(basic_data_context_config, context_root_dir=base_path)
        config_vars = context._load_config_variables_file()
        assert config_vars['env'] == 'prod'
    except Exception:
        raise
    finally:
        # Make sure we unset the environment variable we're using
        del os.environ["TEST_CONFIG_FILE_ENV"]
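# The test above depends on GE substituting ${VAR} references in
# config_variables_file_path from the environment. A minimal sketch of that
# substitution pattern (substitute_env_vars is illustrative, not the actual
# GE helper):
import re

def substitute_env_vars(template):
    """Replace each ${VAR} occurrence with the value of environment variable VAR."""
    return re.sub(r"\$\{(\w+)\}", lambda match: os.environ[match.group(1)], template)

os.environ["TEST_CONFIG_FILE_ENV"] = "dev"
assert substitute_env_vars(
    "uncommitted/${TEST_CONFIG_FILE_ENV}_variables.yml") == "uncommitted/dev_variables.yml"
del os.environ["TEST_CONFIG_FILE_ENV"]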
def titanic_data_context(tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('titanic_data_context'))
    context_path = os.path.join(project_path, "great_expectations")
    safe_mmkdir(os.path.join(context_path, "expectations"), exist_ok=True)
    data_path = os.path.join(context_path, "../data")
    safe_mmkdir(os.path.join(data_path), exist_ok=True)
    shutil.copy("./tests/test_fixtures/great_expectations_titanic.yml",
                str(os.path.join(context_path, "great_expectations.yml")))
    shutil.copy("./tests/test_sets/Titanic.csv",
                str(os.path.join(context_path, "../data/Titanic.csv")))
    return ge.data_context.DataContext(context_path)
def test_get_available_data_asset_names_for_query_path(empty_data_context):
    # create queries path
    context_path = empty_data_context.root_directory
    query_path = os.path.join(context_path, "datasources/mydatasource/generators/mygenerator")
    safe_mmkdir(query_path)
    shutil.copy(file_relative_path(__file__, "../../test_fixtures/dummy.sql"), query_path)

    data_source = Datasource(name="mydatasource", data_context=empty_data_context)
    generator = QueryBatchKwargsGenerator(name="mygenerator", datasource=data_source)
    sql_list = generator.get_available_data_asset_names()
    assert ("dummy", "query") in sql_list["names"]
def test_opt_out_yml(tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    safe_mmkdir(context_path, exist_ok=True)
    fixture_dir = file_relative_path(__file__, "../../test_fixtures")
    shutil.copy(
        os.path.join(fixture_dir, "great_expectations_basic_with_usage_stats_disabled.yml"),
        str(os.path.join(context_path, "great_expectations.yml")),
    )
    assert DataContext(
        context_root_dir=context_path
    )._project_config.anonymous_usage_statistics.enabled is False
def filesystem_csv(tmp_path_factory):
    base_dir = tmp_path_factory.mktemp('filesystem_csv')
    base_dir = str(base_dir)

    # Put a few files in the directory
    with open(os.path.join(base_dir, "f1.csv"), "w") as outfile:
        outfile.writelines(["a,b,c\n"])
    with open(os.path.join(base_dir, "f2.csv"), "w") as outfile:
        outfile.writelines(["a,b,c\n"])
    safe_mmkdir(os.path.join(base_dir, "f3"))
    with open(os.path.join(base_dir, "f3", "f3_20190101.csv"), "w") as outfile:
        outfile.writelines(["a,b,c\n"])
    with open(os.path.join(base_dir, "f3", "f3_20190102.csv"), "w") as outfile:
        outfile.writelines(["a,b,c\n"])
    return base_dir
def test_load_data_context_from_environment_variables(tmp_path_factory):
    try:
        project_path = str(tmp_path_factory.mktemp('data_context'))
        context_path = os.path.join(project_path, "great_expectations")
        safe_mmkdir(context_path)
        shutil.copy("./tests/test_fixtures/great_expectations_basic.yml",
                    str(os.path.join(context_path, "great_expectations.yml")))
        with pytest.raises(DataContextError) as err:
            DataContext.find_context_root_dir()
        assert "Unable to locate context root directory." in str(err.value)

        os.environ["GE_HOME"] = context_path
        assert DataContext.find_context_root_dir() == context_path
    except Exception:
        raise
    finally:
        # Make sure we unset the environment variable we're using
        if "GE_HOME" in os.environ:
            del os.environ["GE_HOME"]
def _set(self, key, value, **kwargs):
    if not isinstance(key, tuple):
        key = key.to_tuple()
    filepath = os.path.join(self.full_base_directory, self._convert_key_to_filepath(key))
    path, filename = os.path.split(filepath)

    safe_mmkdir(str(path))
    with open(filepath, "wb") as outfile:
        if isinstance(value, string_types):
            # Following try/except is to support py2, since both str and bytes objects pass above condition
            try:
                outfile.write(value.encode("utf-8"))
            except UnicodeDecodeError:
                outfile.write(value)
        else:
            outfile.write(value)
    return filepath
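# The string_types branch above exists because under Python 2 both str and
# unicode pass the isinstance check, and a py2 str holding non-ASCII bytes
# raises UnicodeDecodeError when re-encoded. A self-contained sketch of the
# same write-as-UTF-8 pattern (to_utf8_bytes is illustrative):
from six import string_types

def to_utf8_bytes(value):
    """Return value as bytes, encoding text and passing through raw bytes."""
    if isinstance(value, string_types):
        try:
            return value.encode("utf-8")
        except UnicodeDecodeError:
            # py2 str that is already encoded: write it unchanged
            return value
    return value

assert to_utf8_bytes(u"caf\xe9") == b"caf\xc3\xa9"
assert to_utf8_bytes(b"raw bytes") == b"raw bytes"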
def test_subdir_reader_file_partitioning(basic_pandas_datasource, tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.csv",
        "20190102__asset_1.csv",
        "20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv",
    ]
    for file in mock_files:
        if "/" in file:
            safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    # If we have files, we should see them as individual assets
    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory)
    known_assets = subdir_reader_generator.get_available_data_asset_names()["names"]
    assert set(known_assets) == {
        ("20190101__asset_1", "file"),
        ("20190102__asset_1", "file"),
        ("20190103__asset_1", "file"),
        ("asset_2", "directory"),
    }

    # SubdirReaderBatchKwargsGenerator uses the filename as partition name for root files
    known_partitions = subdir_reader_generator.get_available_partition_ids("20190101__asset_1")
    assert set(known_partitions) == {"20190101__asset_1"}
    kwargs = subdir_reader_generator.build_batch_kwargs("20190101__asset_1", "20190101__asset_1")
    assert kwargs["path"] == os.path.join(base_directory, "20190101__asset_1.csv")

    # We should also be able to pass a limit
    kwargs = subdir_reader_generator.build_batch_kwargs(
        "20190101__asset_1", "20190101__asset_1", limit=10)
    assert kwargs["path"] == os.path.join(base_directory, "20190101__asset_1.csv")
    assert kwargs["reader_options"]["nrows"] == 10
def test_get_available_data_asset_names_for_query_path(empty_data_context):
    # create queries path
    context_path = empty_data_context.root_directory
    safe_mmkdir(os.path.join(
        context_path, "datasources/mydatasource/generators/mygenerator/queries"))
    shutil.copy(
        "./tests/test_fixtures/dummy.sql",
        str(os.path.join(context_path, "datasources", "mydatasource",
                         "generators", "mygenerator", "queries")))

    data_source = Datasource(name="mydatasource", data_context=empty_data_context)
    generator = QueryGenerator(name="mygenerator", datasource=data_source)
    sql_list = generator.get_available_data_asset_names()
    assert "dummy" in sql_list
def test_subdir_reader_configurable_reader_method(basic_pandas_datasource, tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "20190101__asset_1.dat",
        "20190102__asset_1.dat",
        "20190103__asset_1.dat",
        "asset_2/20190101__asset_2.dat",
        "asset_2/20190102__asset_2.dat",
    ]
    for file in mock_files:
        if "/" in file:
            safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    # If we have files, we should see them as individual assets
    subdir_reader_generator = SubdirReaderBatchKwargsGenerator(
        "test_generator",
        datasource=basic_pandas_datasource,
        base_directory=base_directory,
        reader_method='csv',
        known_extensions=['.dat'])
    batch_kwargs = next(subdir_reader_generator.get_iterator('asset_2'))
    assert batch_kwargs['reader_method'] == 'csv'
def test_opt_out_env_var_overrides_yml(tmp_path_factory, monkeypatch):
    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    safe_mmkdir(context_path, exist_ok=True)
    fixture_dir = file_relative_path(__file__, "../../test_fixtures")
    shutil.copy(
        os.path.join(fixture_dir, "great_expectations_basic_with_usage_stats_enabled.yml"),
        str(os.path.join(context_path, "great_expectations.yml")),
    )
    assert DataContext(
        context_root_dir=context_path
    )._project_config.anonymous_usage_statistics.enabled is True

    monkeypatch.setenv("GE_USAGE_STATS", "False")
    context = DataContext(context_root_dir=context_path)
    project_config = context._project_config
    assert project_config.anonymous_usage_statistics.enabled is False
def scaffold_directories_and_notebooks(base_dir):
    """Add basic directories for an initial, opinionated GE project."""
    safe_mmkdir(base_dir, exist_ok=True)
    notebook_dir_name = "notebooks"

    open(os.path.join(base_dir, ".gitignore"), 'w').write("uncommitted/")

    for directory in [notebook_dir_name, "expectations", "datasources",
                      "uncommitted", "plugins", "fixtures"]:
        safe_mmkdir(os.path.join(base_dir, directory), exist_ok=True)

    for uncommitted_directory in ["validations", "credentials", "documentation", "samples"]:
        safe_mmkdir(os.path.join(base_dir, "uncommitted", uncommitted_directory), exist_ok=True)

    for notebook in glob.glob(script_relative_path("../init_notebooks/*.ipynb")):
        notebook_name = os.path.basename(notebook)
        shutil.copyfile(notebook, os.path.join(base_dir, notebook_dir_name, notebook_name))
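# Example use of the scaffold above on a throwaway directory; the assertions
# follow directly from the directory lists it iterates over (the tempfile
# usage here is illustrative):
import tempfile

scaffold_base = tempfile.mkdtemp()
scaffold_directories_and_notebooks(scaffold_base)
assert os.path.isdir(os.path.join(scaffold_base, "uncommitted", "validations"))
assert open(os.path.join(scaffold_base, ".gitignore")).read() == "uncommitted/"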
def test_load_data_context_from_environment_variables(tmp_path_factory):
    curdir = os.path.abspath(os.getcwd())
    try:
        project_path = str(tmp_path_factory.mktemp('data_context'))
        context_path = os.path.join(project_path, "great_expectations")
        safe_mmkdir(context_path)
        os.chdir(context_path)
        with pytest.raises(DataContextError) as err:
            DataContext.find_context_root_dir()
        assert isinstance(err.value, ConfigNotFoundError)

        shutil.copy(file_relative_path(__file__, "../test_fixtures/great_expectations_basic.yml"),
                    str(os.path.join(context_path, "great_expectations.yml")))
        os.environ["GE_HOME"] = context_path
        assert DataContext.find_context_root_dir() == context_path
    except Exception:
        raise
    finally:
        # Make sure we unset the environment variable we're using
        if "GE_HOME" in os.environ:
            del os.environ["GE_HOME"]
        os.chdir(curdir)
def test_opt_out_etc_overrides_yml(tmp_path_factory):
    home_config_dir = tmp_path_factory.mktemp("home_dir")
    home_config_dir = str(home_config_dir)
    etc_config_dir = tmp_path_factory.mktemp("etc")
    etc_config_dir = str(etc_config_dir)
    config_dirs = [home_config_dir, etc_config_dir]
    config_dirs = [os.path.join(config_dir, "great_expectations.conf")
                   for config_dir in config_dirs]

    disabled_config = configparser.ConfigParser()
    disabled_config["anonymous_usage_statistics"] = {"enabled": False}
    with open(os.path.join(etc_config_dir, "great_expectations.conf"), 'w') as configfile:
        disabled_config.write(configfile)

    project_path = str(tmp_path_factory.mktemp("data_context"))
    context_path = os.path.join(project_path, "great_expectations")
    safe_mmkdir(context_path, exist_ok=True)
    fixture_dir = file_relative_path(__file__, "../../test_fixtures")
    shutil.copy(
        os.path.join(fixture_dir, "great_expectations_basic_with_usage_stats_enabled.yml"),
        str(os.path.join(context_path, "great_expectations.yml")),
    )
    assert DataContext(
        context_root_dir=context_path
    )._project_config.anonymous_usage_statistics.enabled is True

    with mock.patch(
            "great_expectations.data_context.BaseDataContext.GLOBAL_CONFIG_PATHS",
            config_dirs):
        context = DataContext(context_root_dir=context_path)
        project_config = context._project_config
        assert project_config.anonymous_usage_statistics.enabled is False
def test_safe_mmkdir(tmp_path_factory):
    project_path = str(tmp_path_factory.mktemp('empty_dir'))
    first_path = os.path.join(project_path, "first_path")

    safe_mmkdir(first_path)
    assert os.path.isdir(first_path)

    with pytest.raises(TypeError):
        safe_mmkdir(1)

    # This should only trigger under python 2
    if six.PY2:
        with pytest.raises(TypeError) as e:
            next_project_path = tmp_path_factory.mktemp('test_safe_mmkdir__dir_b')
            safe_mmkdir(next_project_path)
        assert e.value.message == "directory must be of type str, not {'directory_type': \"<class 'pathlib2.PosixPath'>\"}"
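# The test above pins down safe_mmkdir's contract: create the directory if it is
# missing, tolerate it already existing, and reject non-str arguments. A minimal
# sketch consistent with that contract (the real implementation may differ):
import errno

def safe_mmkdir_sketch(directory, exist_ok=True):
    """mkdir -p that insists on a str path; matches the TypeError message tested above."""
    if not isinstance(directory, six.string_types):
        raise TypeError("directory must be of type str, not {0}".format(
            {"directory_type": str(type(directory))}))
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise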
def test_configuration_driven_site_builder(site_builder_data_context_with_html_store_titanic_random):
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [{
                "name": "store_validation_result",
                "action": {
                    "class_name": "StoreAction",
                    "target_store_name": "validations_store",
                }
            }, {
                "name": "extract_and_store_eval_parameters",
                "action": {
                    "class_name": "ExtractAndStoreEvaluationParamsAction",
                    "target_store_name": "evaluation_parameter_store",
                }
            }]
        }
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    context.profile_datasource(context.list_datasources()[0]["name"])

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_id_filter that helps filtering validation results to be included in
    # the profiling and the validation sections.
    batch = context.get_batch('Titanic',
                              expectation_suite_name='BasicDatasetProfiler',
                              batch_kwargs=context.yield_batch_kwargs('Titanic'))
    run_id = "test_run_id_12345"
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.get('data_docs_sites')
    local_site_config = data_docs_config['local_site']
    # local_site_config.pop('module_name')  # This isn't necessary
    local_site_config.pop('class_name')

    # set datasource_whitelist
    local_site_config['datasource_whitelist'] = ['titanic']

    keys_as_strings = [x.to_string() for x in context.stores["validations_store"].list_keys()]
    assert set(keys_as_strings) == set([
        "ValidationResultIdentifier.titanic.default.Titanic.BasicDatasetProfiler.test_run_id_12345",
        "ValidationResultIdentifier.titanic.default.Titanic.BasicDatasetProfiler.profiling",
        "ValidationResultIdentifier.random.default.f2.BasicDatasetProfiler.profiling",
        "ValidationResultIdentifier.random.default.f1.BasicDatasetProfiler.profiling",
    ])

    site_builder = SiteBuilder(
        data_context=context,
        **local_site_config
    )
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    print(json.dumps(index_page_locator_info, indent=2))
    assert index_page_locator_info == context.root_directory + '/uncommitted/data_docs/local_site/index.html'

    print(json.dumps(index_links_dict, indent=2))
    assert json.loads(json.dumps(index_links_dict)) == json.loads("""\
{
  "titanic": {
    "default": {
      "Titanic": {
        "profiling_links": [
          {
            "full_data_asset_name": "titanic/default/Titanic",
            "expectation_suite_name": "BasicDatasetProfiler",
            "filepath": "validations/profiling/titanic/default/Titanic/BasicDatasetProfiler.html",
            "source": "titanic",
            "generator": "default",
            "asset": "Titanic",
            "run_id": "profiling",
            "validation_success": false
          }
        ],
        "validations_links": [
          {
            "full_data_asset_name": "titanic/default/Titanic",
            "expectation_suite_name": "BasicDatasetProfiler",
            "filepath": "validations/test_run_id_12345/titanic/default/Titanic/BasicDatasetProfiler.html",
            "source": "titanic",
            "generator": "default",
            "asset": "Titanic",
            "run_id": "test_run_id_12345",
            "validation_success": false
          }
        ],
        "expectations_links": [
          {
            "full_data_asset_name": "titanic/default/Titanic",
            "expectation_suite_name": "BasicDatasetProfiler",
            "filepath": "expectations/titanic/default/Titanic/BasicDatasetProfiler.html",
            "source": "titanic",
            "generator": "default",
            "asset": "Titanic",
            "run_id": null,
            "validation_success": null
          }
        ]
      }
    }
  }
}
""")

    assert "random" not in index_links_dict, \
        """`random` must not appear in this documentation,
        because `datasource_whitelist` config option specifies only `titanic`"""

    assert len(index_links_dict['titanic']['default']['Titanic']['validations_links']) == 1, \
        """The only rendered validation should be the one not generated by the profiler"""

    # save documentation locally
    safe_mmkdir("./tests/render/output")
    safe_mmkdir("./tests/render/output/documentation")

    if os.path.isdir("./tests/render/output/documentation"):
        shutil.rmtree("./tests/render/output/documentation")
    shutil.copytree(
        os.path.join(
            site_builder_data_context_with_html_store_titanic_random.root_directory,
            "uncommitted/data_docs/"),
        "./tests/render/output/documentation")

    # let's create another validation result and run the site builder to add it
    # to the data docs
    # the operator does not have a StoreAction action configured, so the site
    # will not be updated without our call to site builder
    ts_last_mod_0 = os.path.getmtime(os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier].full_base_directory,
        "validations/test_run_id_12345/titanic/default/Titanic/BasicDatasetProfiler.html"))

    run_id = "test_run_id_12346"
    operator_result = context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    validation_result_id = ValidationResultIdentifier(
        expectation_suite_identifier=[key for key in operator_result["details"].keys()][0],
        run_id=run_id)
    res = site_builder.build(resource_identifiers=[validation_result_id])

    index_links_dict = res[1]

    # verify that an additional validation result HTML file was generated
    assert len(index_links_dict["titanic"]["default"]["Titanic"]["validations_links"]) == 2

    # verify that the validation result HTML file rendered in the previous run was NOT updated
    ts_last_mod_1 = os.path.getmtime(os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier].full_base_directory,
        "validations/test_run_id_12345/titanic/default/Titanic/BasicDatasetProfiler.html"))
    assert ts_last_mod_0 == ts_last_mod_1
def test_render_full_static_site(tmp_path_factory, filesystem_csv_3):
    project_dir = str(tmp_path_factory.mktemp("project_dir"))

    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "data/titanic"))
    shutil.copy("./tests/test_sets/Titanic.csv",
                str(os.path.join(project_dir, "data/titanic/Titanic.csv")))

    os.makedirs(os.path.join(project_dir, "data/random"))
    shutil.copy(os.path.join(filesystem_csv_3, "f1.csv"),
                str(os.path.join(project_dir, "data/random/f1.csv")))
    shutil.copy(os.path.join(filesystem_csv_3, "f2.csv"),
                str(os.path.join(project_dir, "data/random/f2.csv")))

    context = DataContext.create(project_dir)
    ge_directory = os.path.join(project_dir, "great_expectations")
    scaffold_directories_and_notebooks(ge_directory)
    context.add_datasource("titanic", "pandas",
                           base_directory=os.path.join(project_dir, "data/titanic/"))
    context.add_datasource("random", "pandas",
                           base_directory=os.path.join(project_dir, "data/random/"))

    context.profile_datasource("titanic")
    context.profile_datasource("random")
    context.build_data_documentation()

    # Titanic
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/validations/profiling/titanic/default/Titanic/BasicDatasetProfiler.json"))
    # profiling results HTML
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/profiling/titanic/default/Titanic/BasicDatasetProfiler.html"))
    # profiling expectations HTML
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/expectations/titanic/default/Titanic/BasicDatasetProfiler.html"))

    # f1
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/validations/profiling/random/default/f1/BasicDatasetProfiler.json"))
    # profiling results HTML
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/profiling/random/default/f1/BasicDatasetProfiler.html"))
    # profiling expectations HTML
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/expectations/random/default/f1/BasicDatasetProfiler.html"))

    # f2
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/validations/profiling/random/default/f2/BasicDatasetProfiler.json"))
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/profiling/random/default/f2/BasicDatasetProfiler.html"))
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/local_site/expectations/random/default/f2/BasicDatasetProfiler.html"))

    # local_site index.html
    assert os.path.exists(os.path.join(
        ge_directory, "uncommitted/documentation/local_site/index.html"))

    # team_site index.html
    assert os.path.exists(os.path.join(
        ge_directory, "uncommitted/documentation/team_site/index.html"))

    # save documentation locally
    safe_mmkdir("./tests/data_context/output")
    safe_mmkdir("./tests/data_context/output/documentation")

    if os.path.isdir("./tests/data_context/output/documentation"):
        shutil.rmtree("./tests/data_context/output/documentation")
    shutil.copytree(os.path.join(ge_directory, "uncommitted/documentation/"),
                    "./tests/data_context/output/documentation")
def test_configuration_driven_site_builder(site_builder_data_context_with_html_store_titanic_random):
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [{
                "name": "store_validation_result",
                "action": {
                    "class_name": "StoreValidationResultAction",
                    "target_store_name": "validations_store",
                }
            }, {
                "name": "extract_and_store_eval_parameters",
                "action": {
                    "class_name": "StoreEvaluationParametersAction",
                    "target_store_name": "evaluation_parameter_store",
                }
            }]
        })

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = 'titanic'
    data_asset_name = "Titanic"
    profiler_name = 'BasicDatasetProfiler'
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # creating another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_id_filter that helps filtering validation results to be included in
    # the profiling and the validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        generator=generator_name,
        name=data_asset_name)
    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name)
    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = "test_run_id_12345"
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config['local_site']
    # local_site_config.pop('module_name')  # This isn't necessary
    local_site_config.pop('class_name')

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 4
    assert ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="test_run_id_12345",
        batch_identifier=batch.batch_id) in validations_set
    assert ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name),
        run_id="profiling",
        batch_identifier=batch.batch_id) in validations_set

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config)
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    # assert that how-to buttons and related elements are rendered (default behavior)
    assert_how_to_buttons(context, index_page_locator_info, index_links_dict)

    print(json.dumps(index_page_locator_info, indent=2))
    assert index_page_locator_info == context.root_directory + '/uncommitted/data_docs/local_site/index.html'

    print(json.dumps(index_links_dict, indent=2))
    assert "site_name" in index_links_dict

    assert "expectations_links" in index_links_dict
    assert len(index_links_dict["expectations_links"]) == 3

    assert "validations_links" in index_links_dict
    assert len(index_links_dict["validations_links"]) == 1, \
        """The only rendered validation should be the one not generated by the profiler"""

    assert "profiling_links" in index_links_dict
    assert len(index_links_dict["profiling_links"]) == 3

    # save documentation locally
    safe_mmkdir("./tests/render/output")
    safe_mmkdir("./tests/render/output/documentation")

    if os.path.isdir("./tests/render/output/documentation"):
        shutil.rmtree("./tests/render/output/documentation")
    shutil.copytree(
        os.path.join(
            site_builder_data_context_with_html_store_titanic_random.root_directory,
            "uncommitted/data_docs/"),
        "./tests/render/output/documentation")

    # let's create another validation result and run the site builder to add it
    # to the data docs
    # the operator does not have a StoreValidationResultAction action configured, so the site
    # will not be updated without our call to site builder
    expectation_suite_path_component = expectation_suite_name.replace('.', '/')
    validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id,
        batch.batch_id + ".html")

    ts_last_mod_0 = os.path.getmtime(validation_result_page_path)

    run_id = "test_run_id_12346"
    operator_result = context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    validation_result_id = ValidationResultIdentifier(
        expectation_suite_identifier=[key for key in operator_result["details"].keys()][0],
        run_id=run_id,
        batch_identifier=batch.batch_id)
    res = site_builder.build(resource_identifiers=[validation_result_id])

    index_links_dict = res[1]

    # verify that an additional validation result HTML file was generated
    assert len(index_links_dict["validations_links"]) == 2

    # verify that the validation result HTML file rendered in the previous run was NOT updated
    ts_last_mod_1 = os.path.getmtime(validation_result_page_path)
    assert ts_last_mod_0 == ts_last_mod_1

    # verify the site builder method that returns the URL of the HTML file that renders a resource
    new_validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id,
        batch.batch_id + ".html")
    html_url = site_builder.get_resource_url(resource_identifier=validation_result_id)
    assert "file://" + new_validation_result_page_path == html_url

    html_url = site_builder.get_resource_url()
    assert "file://" + os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier].full_base_directory,
        "index.html") == html_url
def test_subdir_reader_path_partitioning(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_folder_connection_path"))
    mock_files = [
        "asset_1/20190101__asset_1.csv",
        "asset_1/20190102__asset_1.csv",
        "asset_1/20190103__asset_1.csv",
        "asset_2/20190101__asset_2.csv",
        "asset_2/20190102__asset_2.csv",
    ]
    for file in mock_files:
        safe_mmkdir(os.path.join(base_directory, file.split("/")[0]))
        open(os.path.join(base_directory, file), "w").close()

    subdir_reader_generator = SubdirReaderGenerator("test_generator", base_directory=base_directory)

    # We should see two assets
    known_assets = subdir_reader_generator.get_available_data_asset_names()
    # Use set in test to avoid order issues
    assert set(known_assets) == {"asset_1", "asset_2"}

    # We should see three partitions for the first:
    known_partitions = subdir_reader_generator.get_available_partition_ids("asset_1")
    assert set(known_partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
    }

    asset_1_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("asset_1")]
    asset_2_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("asset_2")]
    with pytest.raises(BatchKwargsError):
        not_an_asset_kwargs = [kwargs for kwargs in subdir_reader_generator.get_iterator("not_an_asset")]

    assert len(asset_1_kwargs) == 3
    paths = [kwargs["path"] for kwargs in asset_1_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_1/20190101__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190102__asset_1.csv"),
        os.path.join(base_directory, "asset_1/20190103__asset_1.csv"),
    }
    partitions = [kwargs["partition_id"] for kwargs in asset_1_kwargs]
    # SubdirReaderGenerator uses filenames from subdirectories to generate partition names
    assert set(partitions) == {
        "20190101__asset_1",
        "20190102__asset_1",
        "20190103__asset_1",
    }
    assert len(asset_1_kwargs[0].keys()) == 3

    assert len(asset_2_kwargs) == 2
    paths = [kwargs["path"] for kwargs in asset_2_kwargs]
    assert set(paths) == {
        os.path.join(base_directory, "asset_2/20190101__asset_2.csv"),
        os.path.join(base_directory, "asset_2/20190102__asset_2.csv"),
    }
    partitions = [kwargs["partition_id"] for kwargs in asset_2_kwargs]
    assert set(partitions) == {"20190101__asset_2", "20190102__asset_2"}
    assert len(asset_2_kwargs[0].keys()) == 3
def test_render_full_static_site(tmp_path_factory, filesystem_csv_3):
    project_dir = str(tmp_path_factory.mktemp("project_dir"))

    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "data/titanic"))
    shutil.copy("./tests/test_sets/Titanic.csv",
                str(os.path.join(project_dir, "data/titanic/Titanic.csv")))

    os.makedirs(os.path.join(project_dir, "data/random"))
    shutil.copy(os.path.join(filesystem_csv_3, "f1.csv"),
                str(os.path.join(project_dir, "data/random/f1.csv")))
    shutil.copy(os.path.join(filesystem_csv_3, "f2.csv"),
                str(os.path.join(project_dir, "data/random/f2.csv")))

    context = DataContext.create(project_dir)
    ge_directory = os.path.join(project_dir, "great_expectations")
    scaffold_directories_and_notebooks(ge_directory)
    context.add_datasource("titanic", "pandas",
                           base_directory=os.path.join(project_dir, "data/titanic/"))
    context.add_datasource("random", "pandas",
                           base_directory=os.path.join(project_dir, "data/random/"))

    context.profile_datasource("titanic")
    glob_str = os.path.join(
        ge_directory,
        "uncommitted/validations/*/titanic/default/Titanic/BasicDatasetProfiler.json")
    glob_result = glob(glob_str)
    os.mkdir(os.path.join(ge_directory, "fixtures/validations"))
    os.mkdir(os.path.join(ge_directory, "fixtures/validations/titanic"))
    os.mkdir(os.path.join(ge_directory, "fixtures/validations/titanic/default"))
    full_fixture_path = os.path.join(ge_directory, "fixtures/validations/titanic/default/Titanic/")
    os.mkdir(full_fixture_path)
    shutil.copy(glob_result[0], full_fixture_path + "BasicDatasetProfiler.json")

    context.profile_datasource("random")
    os.mkdir(os.path.join(ge_directory, "fixtures/validations/random"))
    os.mkdir(os.path.join(ge_directory, "fixtures/validations/random/default"))
    glob_str = os.path.join(
        ge_directory,
        "uncommitted/validations/*/random/default/f*/BasicDatasetProfiler.json")
    glob_result = glob(glob_str)

    full_fixture_path = os.path.join(ge_directory, "fixtures/validations/random/default/f1/")
    os.mkdir(full_fixture_path)
    shutil.copy(
        glob_result[0],  # !!! This might switch the f1 and f2 files...
        full_fixture_path + "BasicDatasetProfiler.json")

    full_fixture_path = os.path.join(ge_directory, "fixtures/validations/random/default/f2/")
    os.mkdir(full_fixture_path)
    shutil.copy(
        glob_result[1],  # !!! This might switch the f1 and f2 files...
        full_fixture_path + "BasicDatasetProfiler.json")

    context.render_full_static_site()

    # Titanic
    assert os.path.exists(os.path.join(
        ge_directory, "fixtures/validations/titanic/default/Titanic/BasicDatasetProfiler.json"))
    assert os.path.exists(os.path.join(
        ge_directory, "uncommitted/documentation/titanic/default/Titanic/BasicDatasetProfiler.html"))

    with open(os.path.join(
            ge_directory,
            "fixtures/validations/titanic/default/Titanic/BasicDatasetProfiler.json"), "r") as infile:
        titanic_validation = json.load(infile)
    titanic_run_id = titanic_validation['meta']['run_id']
    titanic_validation_html_filename = "{run_id}-BasicDatasetProfiler.html".format(
        run_id=titanic_run_id.replace(':', ''))
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/titanic/default/Titanic/{filename}".format(
            filename=titanic_validation_html_filename)))

    # f1
    assert os.path.exists(os.path.join(
        ge_directory, "fixtures/validations/random/default/f1/BasicDatasetProfiler.json"))
    assert os.path.exists(os.path.join(
        ge_directory, "uncommitted/documentation/random/default/f1/BasicDatasetProfiler.html"))

    with open(os.path.join(
            ge_directory,
            "fixtures/validations/random/default/f1/BasicDatasetProfiler.json"), "r") as infile:
        f1_validation = json.load(infile)
    f1_run_id = f1_validation['meta']['run_id']
    f1_validation_html_filename = "{run_id}-BasicDatasetProfiler.html".format(
        run_id=f1_run_id.replace(':', ''))
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/random/default/f1/{filename}".format(
            filename=f1_validation_html_filename)))

    # f2
    assert os.path.exists(os.path.join(
        ge_directory, "fixtures/validations/random/default/f2/BasicDatasetProfiler.json"))
    assert os.path.exists(os.path.join(
        ge_directory, "uncommitted/documentation/random/default/f2/BasicDatasetProfiler.html"))

    with open(os.path.join(
            ge_directory,
            "fixtures/validations/random/default/f2/BasicDatasetProfiler.json"), "r") as infile:
        f2_validation = json.load(infile)
    f2_run_id = f2_validation['meta']['run_id']
    f2_validation_html_filename = "{run_id}-BasicDatasetProfiler.html".format(
        run_id=f2_run_id.replace(':', ''))
    assert os.path.exists(os.path.join(
        ge_directory,
        "uncommitted/documentation/random/default/f2/{filename}".format(
            filename=f2_validation_html_filename)))

    # full site
    assert os.path.exists(os.path.join(ge_directory, "uncommitted/documentation/index.html"))

    # save documentation locally
    safe_mmkdir("./tests/data_context/output")
    safe_mmkdir("./tests/data_context/output/documentation")

    safe_mmkdir("./tests/data_context/output/documentation/titanic")
    try:
        shutil.copytree(
            os.path.join(ge_directory, "uncommitted/documentation/titanic/default"),
            "./tests/data_context/output/documentation/titanic/default")
    except FileExistsError:
        shutil.rmtree("./tests/data_context/output/documentation/titanic/default")
        shutil.copytree(
            os.path.join(ge_directory, "uncommitted/documentation/titanic/default"),
            "./tests/data_context/output/documentation/titanic/default")

    safe_mmkdir("./tests/data_context/output/documentation/random")
    try:
        shutil.copytree(
            os.path.join(ge_directory, "uncommitted/documentation/random/default"),
            "./tests/data_context/output/documentation/random/default")
    except FileExistsError:
        shutil.rmtree("./tests/data_context/output/documentation/random/default")
        shutil.copytree(
            os.path.join(ge_directory, "uncommitted/documentation/random/default"),
            "./tests/data_context/output/documentation/random/default")

    shutil.copy(os.path.join(ge_directory, "uncommitted/documentation/index.html"),
                "./tests/data_context/output/documentation")
def test_render_full_static_site_from_empty_project(tmp_path_factory, filesystem_csv_3):
    # TODO : Use a standard test fixture
    # TODO : Have that test fixture copy a directory, rather than building a new one from scratch
    base_dir = str(tmp_path_factory.mktemp("project_dir"))
    project_dir = os.path.join(base_dir, "project_path")
    os.mkdir(project_dir)

    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "data/titanic"))
    shutil.copy(file_relative_path(__file__, "../test_sets/Titanic.csv"),
                str(os.path.join(project_dir, "data/titanic/Titanic.csv")))

    os.makedirs(os.path.join(project_dir, "data/random"))
    shutil.copy(os.path.join(filesystem_csv_3, "f1.csv"),
                str(os.path.join(project_dir, "data/random/f1.csv")))
    shutil.copy(os.path.join(filesystem_csv_3, "f2.csv"),
                str(os.path.join(project_dir, "data/random/f2.csv")))

    assert gen_directory_tree_str(project_dir) == """\
project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
"""

    context = DataContext.create(project_dir)
    ge_directory = os.path.join(project_dir, "great_expectations")
    context.add_datasource("titanic",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           generators={
                               "subdir_reader": {
                                   "class_name": "SubdirReaderBatchKwargsGenerator",
                                   "base_directory": os.path.join(project_dir, "data/titanic/")
                               }
                           })
    context.add_datasource("random",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           generators={
                               "subdir_reader": {
                                   "class_name": "SubdirReaderBatchKwargsGenerator",
                                   "base_directory": os.path.join(project_dir, "data/random/")
                               }
                           })

    context.profile_datasource("titanic")

    # Replicate the batch id of the batch that will be profiled in order to generate the file path of the
    # validation result
    titanic_profiled_batch_id = PathBatchKwargs({
        'path': os.path.join(project_dir, 'data/titanic/Titanic.csv'),
        'datasource': 'titanic'
    }).to_id()

    tree_str = gen_directory_tree_str(project_dir)
    assert tree_str == """project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
    great_expectations/
        .gitignore
        great_expectations.yml
        expectations/
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler.json
        notebooks/
            pandas/
                validation_playground.ipynb
            spark/
                validation_playground.ipynb
            sql/
                validation_playground.ipynb
        plugins/
            custom_data_docs/
                renderers/
                styles/
                    data_docs_custom_styles.css
                views/
        uncommitted/
            config_variables.yml
            data_docs/
            samples/
            validations/
                titanic/
                    subdir_reader/
                        Titanic/
                            BasicDatasetProfiler/
                                profiling/
                                    {}.json
""".format(titanic_profiled_batch_id)

    context.profile_datasource("random")
    context.build_data_docs()

    f1_profiled_batch_id = PathBatchKwargs({
        'path': os.path.join(project_dir, 'data/random/f1.csv'),
        'datasource': 'random'
    }).to_id()
    f2_profiled_batch_id = PathBatchKwargs({
        'path': os.path.join(project_dir, 'data/random/f2.csv'),
        'datasource': 'random'
    }).to_id()

    data_docs_dir = os.path.join(project_dir, "great_expectations/uncommitted/data_docs")
    observed = gen_directory_tree_str(data_docs_dir)
    assert observed == """\
data_docs/
    local_site/
        index.html
        expectations/
            random/
                subdir_reader/
                    f1/
                        BasicDatasetProfiler.html
                    f2/
                        BasicDatasetProfiler.html
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler.html
        static/
            fonts/
                HKGrotesk/
                    HKGrotesk-Bold.otf
                    HKGrotesk-BoldItalic.otf
                    HKGrotesk-Italic.otf
                    HKGrotesk-Light.otf
                    HKGrotesk-LightItalic.otf
                    HKGrotesk-Medium.otf
                    HKGrotesk-MediumItalic.otf
                    HKGrotesk-Regular.otf
                    HKGrotesk-SemiBold.otf
                    HKGrotesk-SemiBoldItalic.otf
            images/
                favicon.ico
                glossary_scroller.gif
                iterative-dev-loop.png
                logo-long-vector.svg
                logo-long.png
                short-logo-vector.svg
                short-logo.png
                validation_failed_unexpected_values.gif
            styles/
                data_docs_custom_styles_template.css
                data_docs_default_styles.css
        validations/
            random/
                subdir_reader/
                    f1/
                        BasicDatasetProfiler/
                            profiling/
                                {0:s}.html
                    f2/
                        BasicDatasetProfiler/
                            profiling/
                                {1:s}.html
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler/
                            profiling/
                                {2:s}.html
""".format(f1_profiled_batch_id, f2_profiled_batch_id, titanic_profiled_batch_id)

    # save data_docs locally
    safe_mmkdir("./tests/data_context/output")
    safe_mmkdir("./tests/data_context/output/data_docs")

    if os.path.isdir("./tests/data_context/output/data_docs"):
        shutil.rmtree("./tests/data_context/output/data_docs")
    shutil.copytree(os.path.join(ge_directory, "uncommitted/data_docs/"),
                    "./tests/data_context/output/data_docs")
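# gen_directory_tree_str renders a directory as the indented listing these tests
# compare against: one line per entry, four spaces per level, in sorted walk
# order with a directory's files listed before its subdirectories. A minimal
# sketch consistent with the expected strings (the actual test utility may
# differ in details):
def gen_directory_tree_str_sketch(startpath):
    output_str = ""
    for root, dirs, files in sorted(os.walk(startpath)):
        level = root.replace(startpath, "").count(os.sep)
        indent = " " * 4 * level
        output_str += "{}{}/\n".format(indent, os.path.basename(root))
        subindent = " " * 4 * (level + 1)
        for f in sorted(files):
            output_str += "{}{}\n".format(subindent, f)
    return output_str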
def test_render_full_static_site_from_empty_project(tmp_path_factory, filesystem_csv_3):
    # TODO : Use a standard test fixture
    # TODO : Have that test fixture copy a directory, rather than building a new one from scratch
    base_dir = str(tmp_path_factory.mktemp("project_dir"))
    project_dir = os.path.join(base_dir, "project_path")
    os.mkdir(project_dir)

    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "data/titanic"))
    shutil.copy("./tests/test_sets/Titanic.csv",
                str(os.path.join(project_dir, "data/titanic/Titanic.csv")))

    os.makedirs(os.path.join(project_dir, "data/random"))
    shutil.copy(os.path.join(filesystem_csv_3, "f1.csv"),
                str(os.path.join(project_dir, "data/random/f1.csv")))
    shutil.copy(os.path.join(filesystem_csv_3, "f2.csv"),
                str(os.path.join(project_dir, "data/random/f2.csv")))

    assert gen_directory_tree_str(project_dir) == """\
project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
"""

    context = DataContext.create(project_dir)
    ge_directory = os.path.join(project_dir, "great_expectations")
    context.add_datasource("titanic",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           base_directory=os.path.join(project_dir, "data/titanic/"))
    context.add_datasource("random",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           base_directory=os.path.join(project_dir, "data/random/"))

    context.profile_datasource("titanic")
    assert gen_directory_tree_str(project_dir) == """\
project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
    great_expectations/
        .gitignore
        great_expectations.yml
        datasources/
        expectations/
            titanic/
                default/
                    Titanic/
                        BasicDatasetProfiler.json
        notebooks/
            create_expectations.ipynb
            integrate_validation_into_pipeline.ipynb
        plugins/
        uncommitted/
            config_variables.yml
            data_docs/
            samples/
            validations/
                profiling/
                    titanic/
                        default/
                            Titanic/
                                BasicDatasetProfiler.json
"""

    context.profile_datasource("random")
    context.build_data_docs()

    data_docs_dir = os.path.join(project_dir, "great_expectations/uncommitted/data_docs")
    observed = gen_directory_tree_str(data_docs_dir)
    print(observed)
    assert observed == """\
data_docs/
    local_site/
        index.html
        expectations/
            random/
                default/
                    f1/
                        BasicDatasetProfiler.html
                    f2/
                        BasicDatasetProfiler.html
            titanic/
                default/
                    Titanic/
                        BasicDatasetProfiler.html
        validations/
            profiling/
                random/
                    default/
                        f1/
                            BasicDatasetProfiler.html
                        f2/
                            BasicDatasetProfiler.html
                titanic/
                    default/
                        Titanic/
                            BasicDatasetProfiler.html
"""

    # save data_docs locally
    safe_mmkdir("./tests/data_context/output")
    safe_mmkdir("./tests/data_context/output/data_docs")

    if os.path.isdir("./tests/data_context/output/data_docs"):
        shutil.rmtree("./tests/data_context/output/data_docs")
    shutil.copytree(os.path.join(ge_directory, "uncommitted/data_docs/"),
                    "./tests/data_context/output/data_docs")