def test_validation_multiple_workers(s3, monkeypatch):
    """
    Example of running Data Linter split across multiple workers.

    [init] -> [worker]x4 -> [closedown]
    """

    # Replace the S3 filesystem class on `fs` with the test mock
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    worker_count = 4
    cfg = yaml.safe_load(simple_yaml_config)

    # Only needed because S3 is mocked in this test
    set_up_s3(s3, "tests/data/end_to_end1/land/", cfg)

    validation.para_run_init(worker_count, cfg)

    # Run sequentially here, although each worker could be run in parallel
    for worker_index in range(worker_count):
        validation.para_run_validation(worker_index, cfg)

    validation.para_collect_all_status(cfg)
    validation.para_collect_all_logs(cfg)

    # Files should have moved from land -> pass, with nothing failing
    land_files, pass_files, fail_files = (
        get_filepaths_from_s3_folder(cfg[key])
        for key in ("land-base-path", "pass-base-path", "fail-base-path")
    )
    assert not land_files
    assert not fail_files
    assert pass_files
def test_validation_single_worker(s3, monkeypatch):
    """
    Example of running Data Linter with just one worker.

    [init] -> [worker]x1 -> [closedown]
    """

    # pyarrow's S3 reads must be mocked (test-only requirement)
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation
    from dataengineeringutils3.s3 import get_filepaths_from_s3_folder

    cfg = yaml.safe_load(simple_yaml_config)

    # Only needed because S3 is mocked in this test
    set_up_s3(s3, "tests/data/end_to_end1/land/", cfg)

    # One bin in total, processed by worker index 0
    validation.para_run_init(1, cfg)
    validation.para_run_validation(0, cfg)

    # Closedown: gather statuses, then logs
    validation.para_collect_all_status(cfg)
    validation.para_collect_all_logs(cfg)

    # Files should have moved from land -> pass, with nothing failing
    left_in_land = get_filepaths_from_s3_folder(cfg["land-base-path"])
    passed = get_filepaths_from_s3_folder(cfg["pass-base-path"])
    failed = get_filepaths_from_s3_folder(cfg["fail-base-path"])
    assert passed and not (left_in_land or failed)
def test_end_to_end_single_file_config(s3, monkeypatch):
    """
    Run a single-worker pipeline from an inline config with one table.

    The config is built in the test rather than loaded from YAML, and it
    explicitly lists the matched file for ``table1``. No assertions are made
    beyond the run completing without raising.
    """

    # Replace the S3 filesystem class on `fs` with the test mock
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    # Settings for the only table in this config
    table1_spec = {
        "required": True,
        "metadata": "tests/data/end_to_end1/meta_data/table1.json",
        "expect-header": True,
        "matched-files": ["s3://land/table1.csv"],
    }

    cfg = {
        "land-base-path": "s3://land/",
        "fail-base-path": "s3://fail/",
        "pass-base-path": "s3://pass/",
        "log-base-path": "s3://log/",
        "compress-data": True,
        "remove-tables-on-pass": True,
        "all-must-pass": True,
        "tables": {"table1": table1_spec},
    }

    set_up_s3(s3, "tests/data/end_to_end1/land/", cfg)

    # [init] -> [worker]x1 -> [closedown]
    validation.para_run_init(1, cfg)
    validation.para_run_validation(0, cfg)
    validation.para_collect_all_status(cfg)
    validation.para_collect_all_logs(cfg)
def test_bin_count(s3, monkeypatch, max_bin_count):
    """
    Run the full pipeline with ``max_bin_count`` workers.

    The validation functions receive the path of the YAML config file; the
    parsed config is used only to prepare the mocked S3 state. No assertions
    are made beyond the run completing without raising.

    ``max_bin_count`` is presumably supplied by a parametrize decorator or
    fixture defined outside this function - verify against the full file.
    """

    # Replace the S3 filesystem class on `fs` with the test mock
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    base_dir = "tests/data/end_to_end1/"
    config_file = os.path.join(base_dir, "config.yaml")

    with open(config_file) as handle:
        parsed_config = yaml.safe_load(handle)

    set_up_s3(s3, "tests/data/end_to_end1/land/", parsed_config)

    # [init] -> [worker] x max_bin_count -> [closedown]
    validation.para_run_init(max_bin_count, config_file)
    for worker_index in range(max_bin_count):
        validation.para_run_validation(worker_index, config_file)
    validation.para_collect_all_status(config_file)
    validation.para_collect_all_logs(config_file)
def test_end_to_end_full_path_spectrum_parallel(
    s3,
    monkeypatch,
    tmpdir_factory,
    land_path,
    fail_path,
    pass_path,
    log_path,
):
    """
    Run the three-worker pipeline for a given set of base paths.

    Any of the fail/pass/log paths that is not an ``s3://`` URI is replaced
    by a temporary directory created through ``tmpdir_factory``. No
    assertions are made beyond the run completing without raising.

    The path arguments are presumably supplied by a parametrize decorator
    defined outside this function - verify against the full file.
    """

    def _resolve(path):
        # S3 URIs are kept as given; anything else becomes a temp directory
        if path.startswith("s3://"):
            return path
        return tmpdir_factory.mktemp(path)

    # Replace the S3 filesystem class on `fs` with the test mock
    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    from data_linter import validation

    worker_count = 3
    config_file = os.path.join("tests/data/end_to_end1/", "config.yaml")

    with open(config_file) as handle:
        parsed_config = yaml.safe_load(handle)

    fail_path = _resolve(fail_path)
    pass_path = _resolve(pass_path)
    log_path = _resolve(log_path)

    # NOTE(review): these keys differ from the "*-base-path" keys used by
    # the other configs in this file - confirm they are the intended ones.
    parsed_config.update(
        {
            "land_path": land_path,
            "fail_path": fail_path,
            "pass_path": pass_path,
            "log_path": log_path,
        }
    )

    set_up_s3(s3, "tests/data/end_to_end1/land/", parsed_config)

    # NOTE(review): validation is given the config file path, not the dict
    # modified above, so the overrides may never reach it - confirm intent.
    validation.para_run_init(worker_count, config_file)
    for worker_index in range(worker_count):
        validation.para_run_validation(worker_index, config_file)
    validation.para_collect_all_status(config_file)
    validation.para_collect_all_logs(config_file)