def test_validation_multiple_workers(s3, monkeypatch):
    """
    Simple example of how to run DL for multiple workers.

    [init] -> [worker]x4 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    test_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, test_folder, config)

    validation.para_run_init(4, config)

    # although run sequentially here, this can be run in parallel (see the sketch after this test)
    for i in range(4):
        validation.para_run_validation(i, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files
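
# A minimal sketch (not from the library itself) of how the sequential worker loop
# above could be dispatched in parallel using only the standard library. It assumes
# the config is a plain, picklable dict; the helper name run_workers_in_parallel is
# illustrative only.
def run_workers_in_parallel(config: dict, n_workers: int = 4):
    from multiprocessing import Pool

    from data_linter import validation

    validation.para_run_init(n_workers, config)
    with Pool(n_workers) as pool:
        # each worker validates the bin-packed config matching its index
        pool.starmap(validation.para_run_validation,
                     [(i, config) for i in range(n_workers)])
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)
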
def test_validation_single_worker(s3, monkeypatch):
    """
    Simple example of how to run DL for a single worker.

    [init] -> [worker]x1 -> [closedown]
    """

    # Need to mock S3 read for pyarrow (only for testing)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    validation.para_run_init(1, config)
    validation.para_run_validation(0, config)
    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files

def test_get_filepaths_from_s3_folder(s3):

    bucket_name = "test"

    files = [
        {
            "folder": "f1",
            "key": "my_file.json",
            "body": "test"
        },
        {
            "folder": "f1",
            "key": "df.first.py",
            "body": "test"
        },
        {
            "folder": "f1",
            "key": "otherfile.json",
            "body": ""
        },
        {
            "folder": "f",
            "key": "ffile.json",
            "body": "test"
        },
        {
            "folder": "f.2",
            "key": "otherfile.json",
            "body": "test"
        },
    ]

    s3.meta.client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
    )

    for f in files:
        s3.Object(bucket_name,
                  f["folder"] + "/" + f["key"]).put(Body=f["body"])

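    # get_filepaths_from_s3_folder returns full "s3://..." paths under the given
    # prefix; as exercised below it also accepts file_extension and
    # exclude_zero_byte_files arguments, and zero-byte files are excluded by
    # default (hence otherfile.json, uploaded with an empty body, is missing
    # from the first result).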
    fps = get_filepaths_from_s3_folder("s3://test/f1")
    assert fps == ["s3://test/f1/df.first.py", "s3://test/f1/my_file.json"]

    fps = get_filepaths_from_s3_folder("s3://test/f1/",
                                       exclude_zero_byte_files=False)
    assert fps == [
        "s3://test/f1/df.first.py",
        "s3://test/f1/my_file.json",
        "s3://test/f1/otherfile.json",
    ]

    fps = get_filepaths_from_s3_folder("s3://test/f")
    assert fps == ["s3://test/f/ffile.json"]

    fps = get_filepaths_from_s3_folder("s3://test/f1",
                                       file_extension="json",
                                       exclude_zero_byte_files=False)
    assert fps == ["s3://test/f1/my_file.json", "s3://test/f1/otherfile.json"]
Example #4
def validate_from_chunked_configs(config: dict, config_num: int) -> bool:

    land_base_path = config["land-base-path"]
    land_base_path_is_s3 = land_base_path.startswith("s3://")

    if land_base_path_is_s3:
        tmp_log_bp = get_temp_log_basepath(config)
        s3_temp_path = os.path.join(tmp_log_bp, "configs", str(config_num))

        config_file_paths = get_filepaths_from_s3_folder(s3_temp_path)
        if not config_file_paths:
            return False

        s3_client = boto3.client("s3")

        all_configs = []
        for config_file_path in config_file_paths:
            bucket, key = s3_path_to_bucket_key(config_file_path)
            config_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_configs.append(yaml.safe_load(config_file_obj["Body"].read()))

        for config in all_configs:
            validate_data(config)

        return True

    else:
        raise ValueError("Local land path not supported for parallel running")
Example #5
def para_run_init(max_bin_count: int, config: Union[str, dict] = "config.yaml"):

    log.info("Loading config for parallelisation")
    log_path = None
    try:
        config = load_and_validate_config(config)
        temp_log_path = get_temp_log_path_from_config(config)
        if get_filepaths_from_s3_folder(temp_log_path):
            log.info(
                f"Found temp logs in {temp_log_path}. "
                "Deleting data in folder before run."
            )
            delete_s3_folder_contents(temp_log_path)

        log_path = get_main_log_path_from_config(config)

        config = match_files_in_land_to_config(config)

        bin_pack_configs(config, max_bin_count)

        log.info("Running validation")

    except Exception as e:
        log_msg = f"Unexpected error. Uploading log to {log_path} before raising error."
        error_msg = str(e)

        log.error(log_msg)
        log.error(error_msg)

        upload_log(log, log_stringio, log_path)

        raise e.with_traceback(e.__traceback__)
    else:
        upload_log(log, log_stringio, temp_log_path)
Example #6
def match_files_in_land_to_config(config: dict) -> dict:
    """
    Takes config and matches files in S3 to the corresponding table list in config.
    Checks against other config parameters and raises an error if config params are not met.
    """

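    # Illustrative (assumed) shape of config["tables"] that this function fills in:
    #   {"table1": {"pattern": "^table1", "required": True},  # regex vs path relative to land-base-path
    #    "table2": {}}                                         # no pattern: match by table-name prefix
    # After matching, each table's params gain a "matched_files" list of land file paths.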
    land_base_path = config["land-base-path"]
    if land_base_path.startswith("s3://"):
        land_files = get_filepaths_from_s3_folder(land_base_path)
    else:
        land_files = get_filepaths_from_local_folder(land_base_path)

    if not land_files and config.get("fail-no-files", False):
        raise FileNotFoundError(
            f"No files found in the path: {land_base_path}")
    else:
        total_files = len(land_files)
        log.info(f"Found {total_files} files in {land_base_path}")

    # Check for required tables
    all_matched = []
    for table_name, table_params in config["tables"].items():
        if table_params.get("pattern"):
            table_params["matched_files"] = [
                land_file for land_file in land_files
                if re.match(table_params.get("pattern"),
                            land_file.replace(land_base_path, ""))
            ]
        else:
            table_params["matched_files"] = [
                land_file for land_file in land_files
                if land_file.replace(land_base_path, "").startswith(table_name)
            ]

        if not table_params["matched_files"] and table_params.get("required"):
            raise FileNotFoundError(
                f"Config states file for {table_name} must exist but no files matched."
            )

        all_matched.extend(table_params["matched_files"])

    if len(all_matched) != len(set(all_matched)):
        large_error_traceback = ""
        for table_name, table_params in config["tables"].items():
            large_error_traceback += f"{table_name}: {table_params['matched_files']} \n"
        raise FileExistsError(
            f"We matched the same files to multiple tables.\n{large_error_traceback}"
        )

    # Fail if expecting no unknown files
    if "fail-unknown-files" in config:
        file_exceptions = config["fail-unknown-files"].get("exceptions", [])
        land_diff = set(land_files).difference(all_matched)
        land_diff = land_diff.difference(file_exceptions)
        if land_diff:
            raise FileExistsError("Config states no unknown files should exist. "
                                  f"The following were unmatched: {land_diff}")

    return config
Example #7
def para_collect_all_logs(config: Union[str, dict] = "config.yaml"):

    config = load_and_validate_config(config)

    log_base_path = config["log-base-path"]
    log_path_fin = get_main_log_path_from_config(config)
    log_base_path_is_s3 = log_base_path.startswith("s3://")

    tmp_log_base_path = get_temp_log_basepath(config)
    init_log_path = os.path.join(tmp_log_base_path, "init")
    val_log_path = os.path.join(tmp_log_base_path, "val")
    status_log_path = os.path.join(tmp_log_base_path, "status")

    if log_base_path_is_s3:
        init_log_paths = get_filepaths_from_s3_folder(init_log_path)
        val_log_paths = get_filepaths_from_s3_folder(val_log_path)
        status_log_paths = get_filepaths_from_s3_folder(status_log_path)
    else:
        init_log_paths = get_filepaths_from_local_folder(init_log_path)
        val_log_paths = get_filepaths_from_local_folder(val_log_path)
        status_log_paths = get_filepaths_from_local_folder(status_log_path)

    log_string_list = []
    for init_log_path in init_log_paths:
        log_string_list.append(read_all_file_body(init_log_path))
    for val_log_path in val_log_paths:
        log_string_list.append(read_all_file_body(val_log_path))
    for status_log_path in status_log_paths:
        log_string_list.append(read_all_file_body(status_log_path))

    log_io = io.StringIO()
    for log_str in log_string_list:
        log_io.write(log_str)
    upload_log(log, log_io, log_path_fin)

    log_path_del = os.path.join(log_base_path, "data_linter_temporary_fs")

    if log_base_path_is_s3:
        delete_s3_folder_contents(log_path_del)
    else:
        shutil.rmtree(log_path_del, ignore_errors=True)
Example #8
def test_end_to_end(setup_env_and_s3, overwrite_json):
    os.environ["GITHUB_REPO"] = "dummy_repo"
    os.environ["BUCKET_NAME"] = "moj-analytics-lookup-tables"

    print(os.listdir("data/lookup1/"))
    if overwrite_json:
        with open("data/database_overwrite.json", 'w') as f:
            overwrite_param = {
                "description": "test new desc",
                "bucket": "alpha-lookup-overwrite-bucket"
            }
            json.dump(overwrite_param, f)

    from etl.constants import (
        BUCKET_NAME,
        DATA_DIR,
        GITHUB_REPO,
        RELEASE,
    )
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from etl.lookup_sync import LookupTableSync

    lookup_table_sync = LookupTableSync(BUCKET_NAME, DATA_DIR, GITHUB_REPO,
                                        RELEASE)
    lookup_table_sync.send_raw()

    # Check files uploaded to the correct place
    b = "alpha-lookup-overwrite-bucket" if overwrite_json else BUCKET_NAME
    expected_s3_basepath = f"s3://{b}/{GITHUB_REPO}/{RELEASE}/"
    fps = get_filepaths_from_s3_folder(expected_s3_basepath)
    fps = [fp.replace(expected_s3_basepath, "") for fp in fps]

    expected_fps = [
        "data/lookup1/data.csv", "data/lookup1/meta.json",
        "data/lookup2/lookup2.csv", "data/lookup2/lookup2.json"
    ]

    assert sorted(fps) == sorted(expected_fps)

def test_bin_pack_configs(s3, max_bin_count):

    from data_linter import validation
    from data_linter.utils import read_all_file_body
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder
    from botocore.exceptions import ClientError

    test_folder = "tests/data/end_to_end1/"
    land_folder = "tests/data/end_to_end1/land/"
    config_path = os.path.join(test_folder, "config_matched_files.yml")

    with open(config_path) as yml:
        config = yaml.safe_load(yml)

    set_up_s3(s3, land_folder, config)

    validation.bin_pack_configs(config, max_bin_count)

    land_base_path = config["land-base-path"]

    all_bin_packed_configs = get_filepaths_from_s3_folder(
        f"{land_base_path}/data_linter_temporary_storage/configs")

    for i, file_path in enumerate(all_bin_packed_configs):
        bin_pack_path = os.path.join(
            test_folder, f"bin_pack/config_{max_bin_count}_{i}.yml")
        with open(bin_pack_path) as yml:
            pre_bin_packed = yaml.safe_load(yml)

        try:
            actual_bin_pack = yaml.safe_load(read_all_file_body(file_path))
        except ClientError as e:
            # only a missing key means the bin should be empty; re-raise anything else
            if e.response["Error"]["Code"] == "NoSuchKey":
                assert pre_bin_packed is None
            else:
                raise
        else:
            assert actual_bin_pack == pre_bin_packed

def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example of how to run DL for multiple workers, but without using
    the init step. You would do this if you want to specify which worker
    works on which specific dataset. In the example below we run one worker
    per table validation (a generalised sketch follows this test).

    [worker]x2 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath

    from dataengineeringutils3.s3 import (
        s3_path_to_bucket_key,
        get_filepaths_from_s3_folder,
    )

    s3_client = boto3.client("s3")

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Create a config for worker 0 to only process table1
    # (aka drop other tables in config)
    # and write worker 0's config to S3
    worker0_conf = deepcopy(config)
    del worker0_conf["tables"]["table2"]
    s3_client.put_object(
        Body=yaml.dump(worker0_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/0/config.yml",
    )

    # Create a config for worker 1 to only process table2
    # and write worker 1's config to S3
    worker1_conf = deepcopy(config)
    del worker1_conf["tables"]["table1"]
    s3_client.put_object(
        Body=yaml.dump(worker1_conf).encode("utf-8"),
        Bucket=log_bucket,
        Key=f"{worker_base_key}/1/config.yml",
    )

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)

    validation.para_collect_all_status(config)
    validation.para_collect_all_logs(config)

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files
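
# A minimal sketch (not part of the test above) that generalises the hard-coded
# worker 0 / worker 1 setup: write one worker config per table to the temporary
# config location on S3. It assumes the same config structure and an S3 log base
# path; the helper name write_one_worker_config_per_table is illustrative only.
def write_one_worker_config_per_table(config: dict):
    import os
    from copy import deepcopy

    import boto3
    import yaml

    from data_linter.logging_functions import get_temp_log_basepath
    from dataengineeringutils3.s3 import s3_path_to_bucket_key

    s3_client = boto3.client("s3")
    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    for worker_id, table_name in enumerate(config["tables"]):
        worker_conf = deepcopy(config)
        # keep only the table this worker should validate
        worker_conf["tables"] = {table_name: config["tables"][table_name]}
        s3_client.put_object(
            Body=yaml.dump(worker_conf).encode("utf-8"),
            Bucket=log_bucket,
            Key=f"{worker_base_key}/{worker_id}/config.yml",
        )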
Example #11
def collect_all_status(config: dict):
    """
    Collects the saved status files, determines whether the linting was a success,
    and copies/removes/compresses the files to and from the correct places.

    Args:
        config: the config as given at the beginning, with the paths of where to
            collect and save data from, as well as compression, remove-tables-on-pass
            etc.
    """

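    # Illustrative (assumed) example of the config keys read below:
    #   land-base-path: "s3://bucket/land/"    pass-base-path: "s3://bucket/pass/"
    #   fail-base-path: "s3://bucket/fail/"    log-base-path: "s3://bucket/logs/"
    #   all-must-pass: true                    remove-tables-on-pass: true
    #   compress-data: false                   timestamp-partition-name: <optional>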
    utc_ts = int(datetime.utcnow().timestamp())
    land_base_path = config["land-base-path"]
    all_must_pass = config.get("all-must-pass", False)
    pass_base_path = config["pass-base-path"]
    log_base_path = config["log-base-path"]
    fail_base_path = config.get("fail-base-path")
    remove_on_pass = config.get("remove-tables-on-pass")
    compress = config.get("compress-data")
    timestamp_partition_name = config.get("timestamp-partition-name")

    land_base_path_is_s3 = land_base_path.startswith("s3://")
    log_base_path_is_s3 = log_base_path.startswith("s3://")
    temp_status_basepath = os.path.join(get_temp_log_basepath(config),
                                        "status")
    if log_base_path_is_s3:
        status_file_paths = get_filepaths_from_s3_folder(temp_status_basepath)

        s3_client = boto3.client("s3")

        all_table_response = []
        for status_file_path in status_file_paths:
            bucket, key = s3_path_to_bucket_key(status_file_path)
            status_file_obj = s3_client.get_object(Bucket=bucket, Key=key)
            all_table_response.append(
                json.loads(status_file_obj["Body"].read()))

    else:
        status_file_paths = get_filepaths_from_local_folder(
            temp_status_basepath)

        all_table_response = []
        for status_file_path in status_file_paths:
            with open(status_file_path) as json_in:
                all_table_response.append(json.load(json_in))

    all_tables_passed = all(i["valid"] for i in all_table_response)

    there_was_a_fail = False
    all_tables_to_fail = False
    all_tables_to_respective = False

    if all_tables_passed:
        all_tables_to_respective = True
    else:
        if all_must_pass:
            all_tables_to_fail = True
        else:
            all_tables_to_respective = True

    for i, table_response in enumerate(all_table_response):
        table_name = table_response.get("table-name")
        matched_file = table_response.get("original-path")
        file_basename = os.path.basename(matched_file)

        if all_tables_to_fail:
            there_was_a_fail = True
            final_outpath = get_out_path(
                fail_base_path,
                table_name,
                utc_ts,
                file_basename,
                compress=compress,
                filenum=i,
                timestamp_partition_name=timestamp_partition_name,
            )
            if compress:
                log.info(
                    f"Compressing file from {matched_file} to {final_outpath}")
                compress_data(matched_file, final_outpath)
            else:
                log.info(
                    f"Copying file from {matched_file} to {final_outpath}")
                copy_data(matched_file, final_outpath)
        elif all_tables_to_respective:
            if table_response["valid"]:
                final_outpath = get_out_path(
                    pass_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
                if remove_on_pass:
                    log.info(f"Removing data in land: {matched_file}")
                    if land_base_path_is_s3:
                        delete_s3_object(matched_file)
                    else:
                        os.remove(matched_file)

            else:
                there_was_a_fail = True
                final_outpath = get_out_path(
                    fail_base_path,
                    table_name,
                    utc_ts,
                    file_basename,
                    compress=compress,
                    filenum=i,
                    timestamp_partition_name=timestamp_partition_name,
                )
                if compress:
                    log.info(
                        f"Compressing file from {matched_file} to {final_outpath}"
                    )
                    compress_data(matched_file, final_outpath)
                else:
                    log.info(
                        f"Copying file from {matched_file} to {final_outpath}")
                    copy_data(matched_file, final_outpath)
        table_response["archived-path"] = final_outpath

        # write (table specific) log
        log_outpath = get_table_log_path(log_base_path,
                                         table_name,
                                         utc_ts,
                                         filenum=i)
        if log_base_path_is_s3:
            write_json_to_s3(table_response, log_outpath)
        else:
            path_name = os.path.dirname(log_outpath)
            os.makedirs(path_name, exist_ok=True)
            with open(log_outpath, "w") as json_out:
                json.dump(table_response, json_out)
        log.info(f"log for {matched_file} uploaded to {log_outpath}")

    if there_was_a_fail and all_must_pass:
        log.info("The following tables have failed: ")
        for failed_table in [i for i in all_table_response if not i["valid"]]:
            log.info(f"{failed_table['table-name']} failed")
            log.info(f"...original path: {failed_table['original-path']}")
            log.info(f"...out path: {failed_table['archived-path']}")
        _del_path(get_temp_log_basepath(config))
        raise ValueError("Tables did not pass linter")

    if not all_must_pass and there_was_a_fail:
        log.info(
            "Some tables failed but all-must-pass is set to false. "
            "Check logs for details."
        )

    _del_path(get_temp_log_basepath(config))