Exemplo n.º 1
def get_all_errors_for_file(config_path: str, file_path: str):
    """Render every logged lint result for one file as markdown tables.

    Reads the jsonl logs under ``<log-base-path>/tables``, keeps the rows whose
    ``original-path`` equals ``file_path`` and builds one markdown table per
    row, ordered by run timestamp with the most recent first.

    Args:
        config_path: passed to ``load_and_validate_config``.
        file_path: value matched against the ``original-path`` log column.

    Returns:
        A ``Markdown`` object containing one table per matching log entry
        (empty when no entry matches).
    """
    config = load_and_validate_config(config_path)
    pull_logs_from = os.path.join(config["log-base-path"], "tables")
    logs_df = reader.read(pull_logs_from, file_format="jsonl")

    # Copy the filtered rows so adding the "ts" column below writes to our own
    # frame rather than to a slice of logs_df.
    file_logs = logs_df[logs_df["original-path"] == file_path].copy()

    # More than one entry means the logs cover more than one lint run.
    if len(file_logs) > 1:
        print(
            "More than one log for file, output may contain duplicate entries\n\n"
            "Entries show most recent first")

    # The run timestamp is whatever follows the last "-" in the archived
    # file's name once the extension is stripped.
    file_logs["ts"] = file_logs["archived-path"].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0].rsplit("-", 1)[1])
    file_logs = file_logs.sort_values(by="ts", ascending=False)

    list_of_markdown_tables = []
    # Walk the rows positionally: after filtering and sorting, the index labels
    # are neither contiguous nor in display order, so label lookups such as
    # file_logs["response"][i] would pick the wrong row or raise KeyError.
    for original_path, ts, response in zip(
            file_logs["original-path"], file_logs["ts"], file_logs["response"]):
        parts = [
            f"**file:** {original_path}\n"
            f"**timestamp of run:** {ts}\n\n"
            "column | test name | test result | percentage error | traceback/error\n"
            "--- | --- | --- | --- | ---\n"
        ]
        for col, tests in response.items():
            # "valid" is skipped at both levels; it looks like a summary flag
            # rather than a column/test entry (NOTE(review): confirm).
            if col == "valid":
                continue
            for test_name, test_result in tests.items():
                if test_name == "valid":
                    continue
                test_valid = "✅" if test_result["valid"] else "❌"
                percentage_error = test_result.get(
                    "percentage_of_column_is_error", "n/a")
                tb = test_result.get("traceback", "n/a")
                parts.append(
                    f"{col} | {test_name} | {test_valid} | {percentage_error} | {tb}\n"
                )
        list_of_markdown_tables.append("".join(parts) + "\n\n")
    return Markdown("\n\n".join(list_of_markdown_tables))
def test_generate_iam_config(test_input, expected):
    """Compare an IAM config YAML read from a temp dir with a stored expectation.

    ``test_input`` is a file name under ``tests/data/inputs`` and ``expected``
    is a file name under ``tests/data/expected``.
    """
    # Load and validate the input config for this case.
    config = load_and_validate_config(
        os.path.join("tests/data/inputs", test_input))
    # Load the YAML the test should end up with.
    with open(f"tests/data/expected/{expected}") as f:
        expected_output = yaml.safe_load(f)

    with tempfile.TemporaryDirectory() as d:
        # NOTE(review): nothing visible here writes test_iam.yaml into the fresh
        # temp dir, and `config` is never used. Presumably a call that generates
        # the IAM config from `config` belongs at this point - confirm against
        # the upstream test before relying on this function.

        with open(f"{d}/test_iam.yaml") as f:
            test_output = yaml.safe_load(f)

    assert expected_output == test_output
Exemplo n.º 3
def summary_of_all_tables(config_path: str):
    """Summarise the validation logs of every table as markdown.

    Summary measures:
        - overall validity
        - total number files that have failed as a percentage and number
        - count of failures per table

    Args:
        config_path: passed to ``load_and_validate_config``.

    Returns:
        A ``Markdown`` object with an overall summary table followed by a
        per-table failure table.
    """
    config = load_and_validate_config(config_path)
    pull_logs_from = os.path.join(config["log-base-path"], "tables")
    logs_df = reader.read(pull_logs_from, file_format="jsonl")

    valid = logs_df["valid"]
    overall_valid = "✅" if valid.all() else "❌"
    total = len(valid)
    # Count failures once, the same way the per-table figures are counted.
    count_fails = valid.value_counts().to_dict().get(False, 0)
    # With no log rows there is nothing to divide by; report 0% instead of NaN.
    percentage_fails = (count_fails / total) * 100 if total else 0.0

    summary_markdown = (
        "overall valid | fail percentage | fail count\n"
        "--- | --- | ---\n"
        f"{overall_valid} | {percentage_fails}% | {count_fails}")

    table_rows = [
        "table | percentage of files failed | number of failed files\n"
        "--- | --- | ---\n"
    ]
    for table_name in logs_df["table-name"].unique():
        # Only the log rows for this table.
        table_valid = logs_df.loc[logs_df["table-name"] == table_name, "valid"]
        table_percentage_fails = (table_valid.value_counts(
            normalize=True).mul(100).to_dict().get(False, 0.0))
        table_count_fails = table_valid.value_counts().to_dict().get(False, 0)
        table_rows.append(
            f"{table_name} | {table_percentage_fails} | {table_count_fails}\n")
    table_fails_markdown = "".join(table_rows)

    return Markdown(f"### overall summary \n{summary_markdown}\n"
                    f"### per table summary \n{table_fails_markdown}\n")
Exemplo n.º 4
def get_failed_files(config_path: str, table_name: str = None) -> Markdown:
    """List the files whose log entry is marked invalid, as a markdown table.

    Args:
        config_path: passed to ``load_and_validate_config``.
        table_name: optional sub-folder of ``<log-base-path>/tables`` to read
            from; when omitted or empty, logs are read from the ``tables``
            folder itself.

    Returns:
        A ``Markdown`` object rendering the ``table-name`` and
        ``original-path`` columns of every row whose ``valid`` is ``False``.
    """
    table_name = table_name or ""
    config = load_and_validate_config(config_path)
    pull_logs_from = os.path.join(config["log-base-path"], "tables",
                                  table_name)
    logs_df = reader.read(pull_logs_from, file_format="jsonl")
    # Compare element by element: `logs_df["valid"] is False` is an identity
    # test on the whole column, which is always False and selects nothing useful.
    failed = logs_df["valid"].eq(False)
    trimmed = logs_df.loc[failed, ["table-name", "original-path"]]
    return Markdown(trimmed.to_markdown())
def test_validation_multiple_workers_no_init(s3, monkeypatch):
    """
    Simple example on how to run DL for multiple workers.
    But without using the init. You would want to do this
    if you want to specify which worker works on what specific dataset.
    In the example below we run 1 worker per table validation

    [worker]x2 -> [closedown]
    """

    monkeypatch.setattr(fs, "S3FileSystem", mock_get_file)

    # Imported inside the test, after the filesystem has been patched.
    import boto3
    from data_linter import validation
    from data_linter.logging_functions import get_temp_log_basepath

    # The two S3 helpers this test calls below.
    from dataengineeringutils3.s3 import (
        get_filepaths_from_s3_folder,
        s3_path_to_bucket_key,
    )

    s3_client = boto3.client("s3")

    land_folder = "tests/data/end_to_end1/land/"
    config = yaml.safe_load(simple_yaml_config)

    # Only required for mocked tests
    set_up_s3(s3, land_folder, config)

    worker_config_path = os.path.join(get_temp_log_basepath(config), "configs")
    log_bucket, worker_base_key = s3_path_to_bucket_key(worker_config_path)

    config = validation.load_and_validate_config(config)
    config = validation.match_files_in_land_to_config(config)

    # Create a config for worker 0 to only process table1
    # (aka drop other tables in config)
    # and write to worker 0 config to s3
    worker0_conf = deepcopy(config)
    del worker0_conf["tables"]["table2"]
    # NOTE(review): the step that writes worker0_conf to S3 is not present in
    # this copy; s3_client, log_bucket and worker_base_key are otherwise
    # unused. Restore it from the upstream test.

    # Create a config for worker 1 to only process table2
    # and write to worker 1 config to s3
    worker1_conf = deepcopy(config)
    del worker1_conf["tables"]["table1"]
    # NOTE(review): same as above - the S3 write for worker1_conf is missing.

    validation.para_run_validation(0, config)
    validation.para_run_validation(1, config)

    # NOTE(review): the docstring describes a closedown stage after the two
    # workers, but no such call is visible here - confirm against upstream.

    # Assert that files have moved from land -> pass and nothing failed
    land_files = get_filepaths_from_s3_folder(config["land-base-path"])
    pass_files = get_filepaths_from_s3_folder(config["pass-base-path"])
    fail_files = get_filepaths_from_s3_folder(config["fail-base-path"])
    assert (not land_files and not fail_files) and pass_files
Exemplo n.º 6
def test_load_and_validate_config_pass():
    """A known-good config must load to exactly the stored expected JSON."""
    loaded = load_and_validate_config(
        "tests/data/inputs/example_config_pass.yaml")
    with open("tests/data/expected/expected_pass.json", "r") as fixture:
        wanted = json.load(fixture)
    assert loaded == wanted
Exemplo n.º 7
def test_load_and_validate_config_fail(test_input, expected):
    """An invalid config must raise ``ValidationError`` matching ``expected``.

    ``test_input`` is a file name under ``tests/data/inputs``; ``expected`` is
    the pattern the error message has to match.
    """
    bad_config_path = os.path.join("tests/data/inputs", test_input)
    # Keep only the call under test inside the block: any statement placed
    # after it in here would never run, because the call is expected to raise.
    # NOTE(review): to also check which validator failed, capture the context
    # (`as excinfo`) and assert on `excinfo.value.validator` AFTER the block.
    with pytest.raises(ValidationError, match=expected):
        load_and_validate_config(bad_config_path)