def test_validation_multiple_workers(s3, monkeypatch):
    """
    Walk through a Data Linter run that is split across four workers.

    Stages: [init] -> [worker] x 4 -> [closedown]
    """
    # Patch the S3 filesystem with the mock first; the imports below run
    # after the patch is in place.
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    worker_count = 4
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests.
    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # [init]
    validation.para_run_init(worker_count, config)

    # [worker] - run sequentially here, although these calls can be run in
    # parallel.
    for worker_id in range(worker_count):
        validation.para_run_validation(worker_id, config)

    # [closedown]
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Files should have moved from land -> pass, and nothing should have
    # failed.
    remaining = {
        stage: get_filepaths_from_s3_folder(config[f"{stage}-base-path"])
        for stage in ("land", "pass", "fail")
    }
    assert not remaining["land"]
    assert not remaining["fail"]
    assert remaining["pass"]
def test_validation_single_worker(s3, monkeypatch):
    """
    Walk through a Data Linter run handled by a single worker.

    Stages: [init] -> [worker] x 1 -> [closedown]
    """
    # The S3 read used by pyarrow has to be mocked (testing only).
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests.
    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # [init] -> [worker 0] -> [closedown]
    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Files should have moved from land -> pass, and nothing should have
    # failed.
    in_land = get_filepaths_from_s3_folder(config["land-base-path"])
    in_pass = get_filepaths_from_s3_folder(config["pass-base-path"])
    in_fail = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert not (in_land or in_fail) and in_pass
def test_end_to_end_single_file_config(s3, monkeypatch):
    """
    Run the single-worker pipeline with a config built inline as a dict.

    The config declares one required table, ``table1``, matched to the single
    file ``s3://land/table1.csv``. The test makes no assertions of its own: it
    passes when every stage returns without raising.
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    table1_spec = {
        "required": True,
        "metadata": "tests/data/end_to_end1/meta_data/table1.json",
        "expect-header": True,
        "matched-files": ["s3://land/table1.csv"],
    }
    config = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
        "compress-data": True,
        "remove-tables-on-pass": True,
        "all-must-pass": True,
        "tables": {"table1": table1_spec},
    }

    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # [init] -> [worker 0] -> [closedown]
    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)
def test_bin_count(s3, monkeypatch, max_bin_count):
    """
    Run the init / worker / closedown stages with ``max_bin_count`` workers.

    The parsed YAML config is only used to prepare the mocked S3 state; the
    validation stages are given the path to the config file instead. The test
    makes no assertions of its own: it passes when every stage returns
    without raising.
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    config_path = os.path.join("tests/data/end_to_end1/", "config.yaml")
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)

    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # [init]
    validation.para_run_init(max_bin_count, config_path)

    # [worker] - one validation call per bin.
    for bin_index in range(max_bin_count):
        validation.para_run_validation(bin_index, config_path)

    # [closedown]
    validation.para_collect_all_status(config_path)
    validation.para_collect_all_logs(config_path)
def test_end_to_end_full_path_spectrum_parallel(
    s3,
    monkeypatch,
    tmpdir_factory,
    land_path,
    fail_path,
    pass_path,
    log_path,
):
    """
    Run the three-worker pipeline for one combination of base paths.

    Any of ``fail_path``, ``pass_path`` and ``log_path`` that does not start
    with ``s3://`` is replaced by a directory created through
    ``tmpdir_factory.mktemp``. The test makes no assertions of its own: it
    passes when every stage returns without raising.
    """
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    worker_count = 3
    config_path = os.path.join("tests/data/end_to_end1/", "config.yaml")
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)

    # Resolve every non-S3 destination to a temporary directory before any
    # override is written to the config.
    overrides = {"land_path": land_path}
    for key, location in (
        ("fail_path", fail_path),
        ("pass_path", pass_path),
        ("log_path", log_path),
    ):
        if not location.startswith("s3://"):
            location = tmpdir_factory.mktemp(location)
        overrides[key] = location

    # NOTE(review): these overrides live only on the in-memory config, which
    # is handed to set_up_s3; the validation stages below are given
    # config_path. Confirm that the path arguments are meant to reach
    # validation, and that these key names are the ones it reads.
    config.update(overrides)

    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # [init]
    validation.para_run_init(worker_count, config_path)

    # [worker]
    for worker_id in range(worker_count):
        validation.para_run_validation(worker_id, config_path)

    # [closedown]
    validation.para_collect_all_status(config_path)
    validation.para_collect_all_logs(config_path)