def test_read(data_format, use_meta):
    test_data_path = f"tests/data/all_types.{data_format}"

    if use_meta:
        meta = {
            "columns": [
                {
                    "name": "my_float",
                    "type": "float64",
                    "type_category": "float"
                },
                {
                    "name": "my_bool",
                    "type": "bool_",
                    "type_category": "boolean"
                },
                {
                    "name": "my_nullable_bool",
                    "type": "bool_",
                    "type_category": "boolean",
                },
                {
                    "name": "my_date",
                    "type": "date32",
                    "type_category": "timestamp"
                },
                {
                    "name": "my_datetime",
                    "type": "timestamp(s)",
                    "type_category": "timestamp",
                },
                {
                    "name": "my_int",
                    "type": "int64",
                    "type_category": "integer"
                },
                {
                    "name": "my_string",
                    "type": "string",
                    "type_category": "string"
                },
            ]
        }
    else:
        meta = None

    df1 = reader.read(test_data_path, meta)

    if data_format == "csv":
        df2 = reader.csv.read(test_data_path, meta)
    elif data_format == "jsonl":
        df2 = reader.json.read(test_data_path, meta)
    else:
        raise ValueError(f"Test wasn't expecting: {data_format}")

    assert_frame_equal(df1, df2)
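
# A minimal sketch of the parametrization these tests presumably run under;
# the value lists are assumptions inferred from the branches in test_read
# above, not taken verbatim from the source suite.
import pytest

@pytest.mark.parametrize("use_meta", [True, False])
@pytest.mark.parametrize("data_format", ["csv", "jsonl"])
def test_read_sketch(data_format, use_meta):
    # delegate to the test above for each combination
    test_read(data_format, use_meta)
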
def test_inferred_cols_pandas_types(data_format):
    df = reader.read(f"tests/data/all_types.{data_format}")
    test = df.dtypes.to_dict()
    assert isinstance(test["i"], pd.core.arrays.integer.Int64Dtype)
    assert isinstance(test["my_float"], type(np.dtype("float64")))
    assert isinstance(test["my_bool"], pd.core.arrays.boolean.BooleanDtype)
    if data_format == "jsonl":
        pytest.skip("Pandas cannot infer bool with nulls from JSON datasets")
    else:
        assert isinstance(test["my_nullable_bool"],
                          pd.core.arrays.boolean.BooleanDtype)

    assert isinstance(test["my_string"], pd.core.arrays.string_.StringDtype)
def get_all_errors_for_file(config_path: str, file_path: str):
    # get the config
    config = load_and_validate_config(config_path)
    # get the path of the logs required to read
    pull_logs_from = os.path.join(config["log-base-path"], "tables")
    # read the logs
    logs_df = reader.read(pull_logs_from, file_format="jsonl")
    # get the errors for the file in question from all the logs
    file_logs = logs_df[logs_df["original-path"] == file_path].copy()
    # if the file logs contain more than one entry, they probably come from
    # more than one lint run, so tell the user
    if len(file_logs) > 1:
        print(
            "More than one log for file, output may contain duplicate entries\n\n"
            "Entries show most recent first")
    # extract the timestamps from the log files
    file_logs["ts"] = file_logs["archived-path"].apply(
        lambda x: os.path.splitext(os.path.basename(x))[0].rsplit("-", 1)[1])
    # sort in descending order
    file_logs = file_logs.sort_values(by="ts", ascending=False)
    # use this to collect the markdown tables
    list_of_markdown_tables = []
    # for each log entry, generate a markdown table in descending order of timestamp
    for i in range(len(file_logs)):
        # get the response dict for this entry (position-based, as the frame
        # was just re-sorted)
        current_response_dict = file_logs["response"].iloc[i]
        # make the markdown header template
        file_markdown = (
            f"**file:** {file_logs['original-path'].iloc[i]}\n"
            f"**timestamp of run:** {file_logs['ts'].iloc[i]}\n\n"
            "column | test name | test result | percentage error | traceback/error\n"
            "--- | --- | --- | --- | ---\n")
        # add each column and test to this file's markdown table
        for col, tests in current_response_dict.items():
            if col == "valid":
                continue
            # for each test in this column, make the markdown for it
            for test_name, test_result in tests.items():
                if test_name == "valid":
                    continue
                test_valid = "✅" if test_result["valid"] else "❌"
                percentage_error = test_result.get(
                    "percentage_of_column_is_error", "n/a")
                tb = test_result.get("traceback", "n/a")
                file_markdown += (
                    f"{col} | {test_name} | {test_valid} | {percentage_error} | {tb}\n"
                )
        list_of_markdown_tables.append(file_markdown + "\n\n")
    return Markdown("\n\n".join(list_of_markdown_tables))
def summary_of_all_tables(config_path: str):
    """
    Summary measures:
        - overall validity
        - total number files that have failed as a percentage and number
        - count of failures per table
    """
    # get the config
    config = load_and_validate_config(config_path)
    # make the logs path
    pull_logs_from = os.path.join(config["log-base-path"], "tables")
    # pull logs as df
    logs_df = reader.read(pull_logs_from, file_format="jsonl")
    # get overall valid
    overall_valid = "✅" if logs_df["valid"].all() else "❌"
    total = len(logs_df["valid"])
    count_successes = logs_df["valid"].sum()
    # get the number of failures
    count_fails = total - count_successes
    # get the percentage of files that failed
    percentage_fails = (count_fails / total) * 100
    # make the summary markdown
    summary_markdown = (
        "overall valid | fail percentage | fail count\n"
        "--- | --- | ---\n"
        f"{overall_valid} | {percentage_fails}% | {count_fails}")
    # get list of tables
    table_list = list(logs_df["table-name"].unique())
    # get the failure count per table
    table_fails_markdown = (
        "table | percentage of files failed | number of failed files\n"
        "--- | --- | ---\n")
    for table_name in table_list:
        # get just this table's log entries
        table_log_df = logs_df[logs_df["table-name"] == table_name]
        # get percentage of fails
        table_percentage_fails = (table_log_df["valid"].value_counts(
            normalize=True).mul(100).to_dict().get(False, 0.0))
        # get count of fails
        table_count_fails = table_log_df["valid"].value_counts().to_dict().get(
            False, 0)
        # add results to markdown
        table_fails_markdown += (
            f"{table_name} | {table_percentage_fails} | {table_count_fails}\n")

    return Markdown(f"### overall summary \n{summary_markdown}\n"
                    f"### per table summary \n{table_fails_markdown}\n")
def get_failed_files(config_path: str, table_name: str = None) -> Markdown:
    # set the table name
    table_name = "" if not table_name else table_name
    # get the config
    config = load_and_validate_config(config_path)
    # get the path of the logs required to read
    pull_logs_from = os.path.join(config["log-base-path"], "tables",
                                  table_name)
    # read the logs
    logs_df = reader.read(pull_logs_from, file_format="jsonl")
    # get all the failed paths
    trimmed = logs_df[~logs_df["valid"]][["table-name", "original-path"]]
    # return it as markdown
    return Markdown(trimmed.to_markdown())
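
# Hypothetical usage, assuming a Jupyter context where the returned IPython
# Markdown object renders inline; the config path and table name below are
# illustrative only:
get_failed_files("config.yaml")                         # failures across all tables
get_failed_files("config.yaml", table_name="my_table")  # failures for one table
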
def test_write_local_path_not_exist(data_format):
    # tests that if the path does not exist, the writer will not error
    with tempfile.TemporaryDirectory() as tmp_dir:
        df = reader.read("tests/data/all_types.csv")
        out_file = os.path.join(tmp_dir, f"does/not/exist/data.{data_format}")
        writer.write(df, out_file)
def test_write(data_format, use_meta):

    if use_meta:
        meta = {
            "columns": [
                {
                    "name": "my_float",
                    "type": "float64",
                    "type_category": "float"
                },
                {
                    "name": "my_bool",
                    "type": "bool_",
                    "type_category": "boolean"
                },
                {
                    "name": "my_nullable_bool",
                    "type": "bool_",
                    "type_category": "boolean",
                },
                {
                    "name": "my_date",
                    "type": "date32",
                    "type_category": "timestamp"
                },
                {
                    "name": "my_datetime",
                    "type": "timestamp(s)",
                    "type_category": "timestamp",
                },
                {
                    "name": "my_int",
                    "type": "int64",
                    "type_category": "integer"
                },
                {
                    "name": "my_string",
                    "type": "string",
                    "type_category": "string"
                },
            ]
        }
    else:
        meta = None

    in_data_path = "tests/data/all_types.csv"
    df = reader.read(in_data_path, meta)

    # Create temp files
    with tempfile.NamedTemporaryFile(suffix=f".{data_format}") as f:
        tmp_out1 = f.name
    with tempfile.NamedTemporaryFile(suffix=f".{data_format}") as f:
        tmp_out2 = f.name

    writer.write(df, tmp_out1, meta)
    if data_format == "csv":
        writer.csv.write(df, tmp_out2, meta)
    elif data_format == "jsonl":
        writer.json.write(df, tmp_out2, meta)
    elif data_format in ["snappy.parquet", "parquet"]:
        writer.parquet.write(df, tmp_out2, meta)
    else:
        raise ValueError(f"Test wasn't expecting: {data_format}")

    with open(tmp_out1, "rb") as f:
        b1 = f.read()
    with open(tmp_out2, "rb") as f:
        b2 = f.read()

    assert b1 == b2
def _parse_data_to_pandas(filepath: str, table_params: dict,
                          metadata: Metadata):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    # get the required sets of column names
    meta_col_names = [
        c["name"] for c in metadata.columns
        if c["name"] not in metadata.partitions
    ]

    pandas_kwargs = table_params.get("pandas-kwargs", {})

    # read data (and do headers stuff if csv)
    if filepath.lower().endswith("csv"):
        expect_header = table_params.get("expect-header", True)
        header = 0 if expect_header else None
        df = reader.read(filepath,
                         header=header,
                         low_memory=False,
                         **pandas_kwargs)
        if not expect_header:
            df.columns = meta_col_names
    else:
        df = reader.read(filepath, **pandas_kwargs)

    # eliminate case sensitivity, if requested
    if table_params.get("headers-ignore-case"):
        for c in metadata.columns:
            c["name"] = c["name"].lower()
        df.columns = [c.lower() for c in df.columns]
        meta_col_names = [c.lower() for c in meta_col_names]

    allow_missing_cols = table_params.get("allow-missing-cols", False)
    allow_unexpected_data = table_params.get("allow-unexpected-data", False)

    cols_in_meta_but_not_data = [
        c for c in meta_col_names if c not in df.columns
    ]
    cols_in_data_but_not_meta = [
        c for c in df.columns if c not in meta_col_names
    ]
    cols_in_data_and_meta = [c for c in df.columns if c in meta_col_names]

    # error if there are no common columns
    if not cols_in_data_and_meta:
        raise ColumnError(
            "There is no commonality between the data and metadata")

    # run both checks before raising so that all column errors are reported together
    raise_column_error = False
    err_msg = ""

    # remove columns from meta that aren't in the data if allowed
    msg_1 = f"columns present in metadata but not in data: {cols_in_meta_but_not_data}"
    if (not allow_missing_cols) and cols_in_meta_but_not_data:
        err_msg += msg_1
        raise_column_error = True
    elif allow_missing_cols and cols_in_meta_but_not_data:
        for col in cols_in_meta_but_not_data:
            metadata.remove_column(col)
        log.info("not testing " + msg_1)

    # error if there is unexpected data, unless allowed
    msg_2 = f"columns present in data but not in metadata: {cols_in_data_but_not_meta}"
    if (not allow_unexpected_data) and cols_in_data_but_not_meta:
        err_msg += f"\n{msg_2}"
        raise_column_error = True
    elif allow_unexpected_data and cols_in_data_but_not_meta:
        log.info("not testing " + msg_2)
        df = df[cols_in_data_and_meta]

    # raise the error with all details, if required
    if raise_column_error:
        raise ColumnError(err_msg)

    # sample the data, if required
    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if metadata.file_format not in ["parquet", "snappy.parquet"]:
        df = cast_pandas_table_to_schema(df, metadata)

    return df, metadata
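
# A hedged sketch of the table_params keys this function reads; the key names
# come directly from the .get(...) calls above, while the values shown are
# illustrative defaults rather than anything taken from a real config:
table_params = {
    "expect-header": True,           # csv only: the file has a header row
    "headers-ignore-case": False,    # lower-case both metadata and data columns
    "allow-missing-cols": False,     # if True, drop meta columns absent from the data
    "allow-unexpected-data": False,  # if True, drop data columns absent from the meta
    "row-limit": None,               # if set, sample this many rows before casting
    "pandas-kwargs": {},             # passed straight through to reader.read
}
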
def test_round_trip(trip1_file_format, trip2_file_format):
    meta = {
        "columns": [
            {
                "name": "my_float",
                "type": "float64",
                "type_category": "float"
            },
            {
                "name": "my_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_nullable_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_date",
                "type": "date32",
                "type_category": "timestamp"
            },
            {
                "name": "my_datetime",
                "type": "timestamp(s)",
                "type_category": "timestamp",
            },
            {
                "name": "my_int",
                "type": "int64",
                "type_category": "integer"
            },
            {
                "name": "my_string",
                "type": "string",
                "type_category": "string"
            },
        ]
    }
    original = reader.csv.read("tests/data/all_types.csv", meta)
    orig_copy = original.copy()

    # Trip 1
    with tempfile.NamedTemporaryFile() as f:
        tmp_out_file1 = f.name
    writer.write(orig_copy,
                 tmp_out_file1,
                 file_format=trip1_file_format,
                 metadata=meta)
    df_mid = reader.read(tmp_out_file1,
                         file_format=trip1_file_format,
                         metadata=meta)

    # Trip 2
    with tempfile.NamedTemporaryFile() as f:
        tmp_out_file2 = f.name
    writer.write(df_mid,
                 tmp_out_file2,
                 file_format=trip2_file_format,
                 metadata=meta)
    final = reader.read(tmp_out_file2,
                        file_format=trip2_file_format,
                        metadata=meta)

    assert_frame_equal(original, final)
def test_round_trip():
    meta = {
        "columns": [
            {
                "name": "my_float",
                "type": "float64",
                "type_category": "float"
            },
            {
                "name": "my_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_nullable_bool",
                "type": "bool_",
                "type_category": "boolean"
            },
            {
                "name": "my_date",
                "type": "date32",
                "type_category": "timestamp"
            },
            {
                "name": "my_datetime",
                "type": "timestamp(s)",
                "type_category": "timestamp",
            },
            {
                "name": "my_int",
                "type": "int64",
                "type_category": "integer"
            },
            {
                "name": "my_string",
                "type": "string",
                "type_category": "string"
            },
        ]
    }

    # Create parquet temp file
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        tmp_out_file = f.name
    original = reader.csv.read("tests/data/all_types.csv", meta)
    writer.parquet.write(original, tmp_out_file)

    data_paths = {
        "csv": "tests/data/all_types.csv",
        "json": "tests/data/all_types.jsonl",
        "parquet": tmp_out_file,
    }

    for type1 in ["csv", "json", "parquet"]:
        for type2 in ["csv", "json", "parquet"]:
            df1 = reader.read(
                input_path=data_paths[type1],
                metadata=meta,
            )
            df2 = reader.read(
                input_path=data_paths[type2],
                metadata=meta,
            )
            assert_frame_equal(df1, df2)