def test_validation_multiple_workers(s3, monkeypatch):
    """
    Example of a data linter run that is shared between several workers.

    [init] -> [worker]x4 -> [closedown]
    """
    # Swap the S3 filesystem for the mock reader (only needed for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    worker_count = 4
    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    validation.para_run_init(worker_count, config)

    # The workers are run one after another here, but each call is
    # independent and could be run in parallel
    for worker_id in range(worker_count):
        validation.para_run_validation(worker_id, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Everything should have moved from land -> pass, with nothing in fail
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])

    assert not land_files
    assert not fail_files
    assert pass_files
def test_validation_single_worker(s3, monkeypatch):
    """
    Example of a data linter run that uses the parallel entry points with
    one worker only.

    [init] -> [worker]x1 -> [closedown]
    """
    # The S3 read used by pyarrow has to be mocked (only for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, "tests/data/end_to_end1/land/", config)

    # One bin, so the single worker (index 0) handles everything
    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Everything should have moved from land -> pass, with nothing in fail
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])

    assert not land_files
    assert not fail_files
    assert pass_files
def test_get_filepaths_from_s3_folder(s3):
    """
    Check get_filepaths_from_s3_folder against a small mocked bucket:
    folder matching, zero-byte file handling and extension filtering.
    """
    bucket_name = "test"

    # (folder, key, body) - an empty body creates a zero-byte object
    objects = [
        ("f1", "my_file.json", "test"),
        ("f1", "df.first.py", "test"),
        ("f1", "otherfile.json", ""),
        ("f", "ffile.json", "test"),
        ("f.2", "otherfile.json", "test"),
    ]

    s3.meta.client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
    )
    for folder, key, body in objects:
        s3.Object(bucket_name, folder + "/" + key).put(Body=body)

    # Zero-byte files are left out unless asked for
    fps = get_filepaths_from_s3_folder("s3://test/f1")
    assert fps == ["s3://test/f1/df.first.py", "s3://test/f1/my_file.json"]

    fps = get_filepaths_from_s3_folder(
        "s3://test/f1/", exclude_zero_byte_files=False
    )
    assert fps == [
        "s3://test/f1/df.first.py",
        "s3://test/f1/my_file.json",
        "s3://test/f1/otherfile.json",
    ]

    # Folder "f" must not pick up files from "f1" or "f.2"
    fps = get_filepaths_from_s3_folder("s3://test/f")
    assert fps == ["s3://test/f/ffile.json"]

    fps = get_filepaths_from_s3_folder(
        "s3://test/f1", file_extension="json", exclude_zero_byte_files=False
    )
    assert fps == ["s3://test/f1/my_file.json", "s3://test/f1/otherfile.json"]
def validate_from_chunked_configs(config: dict, config_num: int) -> bool:
    """
    Run validate_data on every chunked config stored for one worker.

    Args:
        config: the overall config. Its "land-base-path" must be an s3 path.
        config_num: name of the folder, inside "configs" under the
            temporary log basepath, that holds this worker's configs.

    Returns:
        False when no config files are found in that folder, True once
        every config found has been passed to validate_data.

    Raises:
        ValueError: if "land-base-path" is not an s3 path.
    """
    if not config["land-base-path"].startswith("s3://"):
        raise ValueError("Local land path not supported for parrallel running")

    chunk_folder = os.path.join(
        get_temp_log_basepath(config), "configs", str(config_num)
    )
    chunk_paths = get_filepaths_from_s3_folder(chunk_folder)
    if not chunk_paths:
        return False

    # Load every chunk before validating any of them
    s3_client = boto3.client("s3")
    chunk_configs = []
    for chunk_path in chunk_paths:
        bucket, key = s3_path_to_bucket_key(chunk_path)
        response = s3_client.get_object(Bucket=bucket, Key=key)
        chunk_configs.append(yaml.safe_load(response["Body"].read()))

    for chunk_config in chunk_configs:
        validate_data(chunk_config)

    return True
def para_run_init(max_bin_count: int, config: Union[str, dict] = "config.yaml"):
    """
    Initialisation step of a parallel run.

    Loads and validates the config, clears any files left in the temporary
    log path, matches the files in land to the config and bin packs the
    result.

    Args:
        max_bin_count: passed to bin_pack_configs.
        config: the config, given to load_and_validate_config.

    Raises:
        Exception: any error raised by the steps above is logged, the log
            is uploaded to the main log path (None if the error happened
            before that path was worked out) and the error is re-raised.
    """
    log.info("Loading config for paralellisation")

    # Stays None if an error is raised before the main log path is known
    log_path = None

    try:
        config = load_and_validate_config(config)
        temp_log_path = get_temp_log_path_from_config(config)

        leftover_logs = get_filepaths_from_s3_folder(temp_log_path)
        if leftover_logs:
            log.info(
                f"Found temp logs in {temp_log_path}."
                "Deleting data in folder before run."
            )
            delete_s3_folder_contents(temp_log_path)

        log_path = get_main_log_path_from_config(config)
        config = match_files_in_land_to_config(config)
        bin_pack_configs(config, max_bin_count)
        log.info("Running validation")
    except Exception as err:
        upload_notice = (
            f"Unexpected error. Uploading log to {log_path} before raising error."
        )
        error_text = str(err)
        log.error(upload_notice)
        log.error(error_text)
        upload_log(log, log_stringio, log_path)
        raise err.with_traceback(err.__traceback__)
    else:
        # On success the init log goes to the temporary log path
        upload_log(log, log_stringio, temp_log_path)
def match_files_in_land_to_config(config: dict) -> dict:
    """
    Adds a "matched_files" list to every table in the config, holding the
    files in the land path that belong to that table.

    A file's path, with land-base-path removed, is matched against the
    table's "pattern" (using re.match) when one is set; otherwise the
    file is matched when that path starts with the table name.

    Args:
        config: the config, with "land-base-path" and "tables".

    Returns:
        The same config dict, with "matched_files" set on each table.

    Raises:
        FileNotFoundError: if no files are found and "fail-no-files" is
            set, or if a "required" table has no matched files.
        FileExistsError: if a file is matched to more than one table, or
            if "fail-unknown-files" is in the config and there are
            unmatched files that are not listed in its "exceptions".
    """
    land_base_path = config["land-base-path"]

    list_files = (
        get_filepaths_from_s3_folder
        if land_base_path.startswith("s3://")
        else get_filepaths_from_local_folder
    )
    land_files = list_files(land_base_path)

    if not land_files and config.get("fail-no-files", False):
        raise FileNotFoundError(
            f"No files found in the path: {land_base_path}")

    log.info(f"Found {len(land_files)} in {land_base_path}")

    # Match files to each table and check for required tables
    all_matched = []
    for table_name, table_params in config["tables"].items():
        pattern = table_params.get("pattern")

        matched = []
        for land_file in land_files:
            trimmed_path = land_file.replace(land_base_path, "")
            if pattern:
                is_match = re.match(pattern, trimmed_path)
            else:
                is_match = trimmed_path.startswith(table_name)
            if is_match:
                matched.append(land_file)
        table_params["matched_files"] = matched

        if not matched and table_params.get("required"):
            raise FileNotFoundError(
                f"Config states file for {table_name} must exist but no files matched."
            )

        all_matched.extend(matched)

    # A repeated entry means one file was matched by more than one table
    if len(all_matched) != len(set(all_matched)):
        large_error_traceback = "".join(
            f"{table_name}: {table_params['matched_files']} \n"
            for table_name, table_params in config["tables"].items()
        )
        raise FileExistsError(
            f"We matched the same files to multiple tables.\n{large_error_traceback}"
        )

    # Fail if expecting no unknown files
    if "fail-unknown-files" in config:
        allowed_files = config["fail-unknown-files"].get("exceptions", [])
        land_diff = (
            set(land_files).difference(all_matched).difference(allowed_files)
        )
        if land_diff:
            raise FileExistsError("Config states no unknown should exist. "
                                  f"The following were unmatched: {land_diff}")

    return config
def para_collect_all_logs(config: Union[str, dict] = "config.yaml"):
    """
    Closedown step for the logs of a parallel run.

    Reads every log file in the "init", "val" and "status" folders under
    the temporary log basepath (in that order), uploads their combined
    content to the main log path, then deletes the
    "data_linter_temporary_fs" folder under log-base-path.

    Args:
        config: the config, given to load_and_validate_config.
    """
    config = load_and_validate_config(config)

    log_base_path = config["log-base-path"]
    final_log_path = get_main_log_path_from_config(config)
    logs_on_s3 = log_base_path.startswith("s3://")
    tmp_log_base_path = get_temp_log_basepath(config)

    list_files = (
        get_filepaths_from_s3_folder
        if logs_on_s3
        else get_filepaths_from_local_folder
    )

    # List all three folders before reading any file
    stage_folders = [
        os.path.join(tmp_log_base_path, stage)
        for stage in ("init", "val", "status")
    ]
    files_per_stage = [list_files(folder) for folder in stage_folders]

    combined_log = io.StringIO()
    for stage_files in files_per_stage:
        for log_file in stage_files:
            combined_log.write(read_all_file_body(log_file))

    upload_log(log, combined_log, final_log_path)

    # Clear the temporary folder now its logs have been collected
    temp_folder = os.path.join(log_base_path, "data_linter_temporary_fs")
    if logs_on_s3:
        delete_s3_folder_contents(temp_folder)
    else:
        shutil.rmtree(temp_folder, ignore_errors=True)
def test_end_to_end(setup_env_and_s3, overwrite_json):
    """
    Send the raw lookup data with LookupTableSync and check that the
    files land under s3://<bucket>/<repo>/<release>/.

    When overwrite_json is set, a database_overwrite.json naming another
    bucket is written first and the files are expected in that bucket.
    """
    # Set before etl.constants is imported below
    # NOTE(review): presumably the constants are read from the environment
    # at import time - confirm
    os.environ["GITHUB_REPO"] = "dummy_repo"
    os.environ["BUCKET_NAME"] = "moj-analytics-lookup-tables"

    print(os.listdir("data/lookup1/"))

    if overwrite_json:
        overwrite_param = {
            "description": "test new desc",
            "bucket": "alpha-lookup-overwrite-bucket",
        }
        with open("data/database_overwrite.json", "w") as overwrite_file:
            json.dump(overwrite_param, overwrite_file)

    from etl.constants import (
        BUCKET_NAME,
        DATA_DIR,
        GITHUB_REPO,
        RELEASE,
    )
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from etl.lookup_sync import LookupTableSync

    LookupTableSync(BUCKET_NAME, DATA_DIR, GITHUB_REPO, RELEASE).send_raw()

    # Check files uploaded to the correct place
    if overwrite_json:
        target_bucket = "alpha-lookup-overwrite-bucket"
    else:
        target_bucket = BUCKET_NAME
    expected_s3_basepath = f"s3://{target_bucket}/{GITHUB_REPO}/{RELEASE}/"

    uploaded = sorted(
        fp.replace(expected_s3_basepath, "")
        for fp in get_filepaths_from_s3_folder(expected_s3_basepath)
    )
    expected = sorted([
        "data/lookup1/data.csv",
        "data/lookup1/meta.json",
        "data/lookup2/lookup2.csv",
        "data/lookup2/lookup2.json",
    ])
    assert uploaded == expected
def test_bin_pack_configs(s3, max_bin_count):
    """
    Check that bin_pack_configs writes the expected config for each bin.

    Each file found in the temporary configs folder is compared with the
    pre-computed "bin_pack/config_<max_bin_count>_<i>.yml" file in the
    test data folder.
    """
    from data_linter import validation
    from data_linter.utils import read_all_file_body
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from botocore.exceptions import ClientError

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config_matched_files.yml")

    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    set_up_s3(s3, land_folder, config)

    validation.bin_pack_configs(config, max_bin_count)

    land_base_path = config["land-base-path"]

    all_bin_packed_configs = get_filepaths_from_s3_folder(
        f"{land_base_path}/data_linter_temporary_storage/configs")

    for i, file_path in enumerate(all_bin_packed_configs):
        bin_pack_path = os.path.join(
            test_folder, f"bin_pack/config_{max_bin_count}_{i}.yml")
        with open(bin_pack_path) as yml:
            pre_bin_packed = yaml.safe_load(yml)

        try:
            actual_bin_pack = yaml.safe_load(read_all_file_body(file_path))
        except ClientError as e:
            # Only a missing key is an expected outcome. Any other S3
            # error must fail the test rather than be silently ignored.
            if e.response["Error"]["Code"] != "NoSuchKey":
                raise
            assert pre_bin_packed is None
        else:
            assert actual_bin_pack == pre_bin_packed
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Example of a multi-worker data linter run that skips the init step.

    Instead of letting init share out the work, a config is written for
    each worker by hand, which lets you choose what each worker
    validates. Here there is one worker per table.

    [worker]x2 -> [closedown]
    """
    # Swap the S3 filesystem for the mock reader (only needed for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath
    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    s3_client = boto3.client("s3")

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    # Worker configs are read from <temp log basepath>/configs/<worker>/
    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Worker 0 only processes table1 and worker 1 only processes table2,
    # so each worker's config has the other table dropped before it is
    # written to s3
    table_to_drop = {0: "table2", 1: "table1"}
    for worker_id, dropped_table in table_to_drop.items():
        worker_conf = deepcopy(config)
        del worker_conf["tables"][dropped_table]
        s3_client.put_object(
            Body=yaml.dump(worker_conf).encode("utf-8"),
            Bucket=log_bucket,
            Key=f"{worker_base_key}/{worker_id}/config.yml",
        )

    for worker_id in (0, 1):
        validation.para_run_validation(worker_id, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Everything should have moved from land -> pass, with nothing in fail
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])

    assert not land_files
    assert not fail_files
    assert pass_files
def collect_all_status(config: dict):
    """
    Collects the status files saved by the validation step, works out
    whether the linting was a success, and copies (or compresses) each
    file to the pass or fail path, removing it from land when asked to.
    A log is written for every table and the temporary log folder is
    deleted at the end.

    If "all-must-pass" is set and any table failed, every file is sent to
    the fail path.

    Args:
        config: the config as given at the beginning, with the paths of
            where to collect and save data from as well as compress-data,
            remove-tables-on-pass etc.

    Raises:
        ValueError: if "all-must-pass" is set and at least one table
            failed. The files are archived, the logs written and the
            temporary log folder deleted before it is raised.
    """
    # Imported here so the fix below does not rely on the file's imports
    from datetime import timezone

    # Seconds since the epoch. An aware datetime is needed: .timestamp() on
    # the naive value returned by datetime.utcnow() is read as local time,
    # which shifts the result on any machine that is not set to UTC.
    utc_ts = int(datetime.now(timezone.utc).timestamp())

    land_base_path = config["land-base-path"]
    all_must_pass = config.get("all-must-pass", False)
    pass_base_path = config["pass-base-path"]
    log_base_path = config["log-base-path"]
    # Optional: None is handed to get_out_path if it is not in the config
    fail_base_path = config.get("fail-base-path")
    remove_on_pass = config.get("remove-tables-on-pass")
    compress = config.get("compress-data")
    timestamp_partition_name = config.get("timestamp-partition-name")

    land_base_path_is_s3 = land_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")

    # Read every status file (one json document each)
    temp_status_basepath = os.path.join(get_temp_log_basepath(config), "status")
    all_table_response = []
    if log_base_path_is_s3:
        status_file_paths = get_filepaths_from_s3_folder(temp_status_basepath)
        s3_client = boto3.client("s3")
        for status_file_path in status_file_paths:
            bucket, key = s3_path_to_bucket_key(status_file_path)
            status_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_table_response.append(
                json.loads(status_file_obj["Body"].read()))
    else:
        status_file_paths = get_filepaths_from_local_folder(
            temp_status_basepath)
        for status_file_path in status_file_paths:
            with open(status_file_path) as json_in:
                all_table_response.append(json.load(json_in))

    pass_count = sum(response["valid"] for response in all_table_response)
    all_tables_passed = pass_count == len(all_table_response)

    # With all-must-pass a single failure sends every file to the fail
    # path; otherwise each file goes to pass or fail on its own result
    all_tables_to_fail = bool(all_must_pass) and not all_tables_passed
    there_was_a_fail = False

    for i, table_response in enumerate(all_table_response):
        table_name = table_response.get("table-name")
        matched_file = table_response.get("original-path")
        file_basename = os.path.basename(matched_file)

        send_to_fail = all_tables_to_fail or not table_response["valid"]
        if send_to_fail:
            there_was_a_fail = True

        final_outpath = get_out_path(
            fail_base_path if send_to_fail else pass_base_path,
            table_name,
            utc_ts,
            file_basename,
            compress=compress,
            filenum=i,
            timestamp_partition_name=timestamp_partition_name,
        )

        if compress:
            log.info(
                f"Compressing file from {matched_file} to {final_outpath}")
            compress_data(matched_file, final_outpath)
        else:
            log.info(
                f"Copying file from {matched_file} to {final_outpath}")
            copy_data(matched_file, final_outpath)

        # Only files that went to the pass path are removed from land
        if remove_on_pass and not send_to_fail:
            log.info(f"Removing data in land: {matched_file}")
            if land_base_path_is_s3:
                delete_s3_object(matched_file)
            else:
                os.remove(matched_file)

        table_response["archived-path"] = final_outpath

        # write (table specific) log
        log_outpath = get_table_log_path(log_base_path,
                                         table_name,
                                         utc_ts,
                                         filenum=i)
        if log_base_path_is_s3:
            write_json_to_s3(table_response, log_outpath)
        else:
            os.makedirs(os.path.dirname(log_outpath), exist_ok=True)
            with open(log_outpath, "w") as json_out:
                json.dump(table_response, json_out)
        log.info(f"log for {matched_file} uploaded to {log_outpath}")

    if there_was_a_fail and all_must_pass:
        log.info("The following tables have failed: ")
        for failed_table in all_table_response:
            if failed_table["valid"]:
                continue
            log.info(f"{failed_table['table-name']} failed")
            log.info(f"...original path: {failed_table['original-path']}")
            log.info(f"...out path: {failed_table['archived-path']}")
        # The temporary folder is cleared before the error is raised
        _del_path(get_temp_log_basepath(config))
        raise ValueError("Tables did not pass linter")

    if not all_must_pass and there_was_a_fail:
        msg6 = "Some tables failed but all_must_pass set to false."
        msg6 += " Check logs for details"
        log.info(msg6)

    _del_path(get_temp_log_basepath(config))