Exemplo n.º 1
0
def _check_nonnull_columns(
    df_profile: "pandas.DataFrame", *, require_nonnull_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Verifies that none of the listed columns contain null values, using profile statistics

    Args:
        df_profile: DataFrame of profile information; it is looked up by column name for
            its "EmptyValueCount" entry
        require_nonnull_columns: names of the columns that must not contain nulls. None is
            treated as an empty list

    Returns:
        ValidationCheck object holding whether the check passed and a dict that lists the
        offending columns under "failed_nonnull_columns"
    """
    columns = [] if require_nonnull_columns is None else require_nonnull_columns
    failures = defaultdict(list)

    for column in columns:
        empty_count = df_profile.loc[column, "EmptyValueCount"]
        if not empty_count > 0:
            continue
        # a positive empty-value count means the column fails the check
        LOGGER.warning(f"column {column} has {empty_count} null values")
        failures["failed_nonnull_columns"].append(column)

    # the check passes only when nothing was recorded
    return ValidationCheck(not failures, failures)
Exemplo n.º 2
0
def _check_present_columns(
    df: "pandas.DataFrame", *, require_present_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Verifies that every listed column exists in a DataFrame

    Args:
        df: DataFrame whose columns are inspected
        require_present_columns: names of the columns that must exist in df. None is
            treated as an empty list

    Returns:
        ValidationCheck object holding whether the check passed and a dict that lists the
        absent columns under "failed_present_columns"
    """
    columns = [] if require_present_columns is None else require_present_columns
    failures = defaultdict(list)

    for column in columns:
        if column in df.columns:
            continue
        # the column is absent: warn and record it
        LOGGER.warning(f"column {column} is not present")
        failures["failed_present_columns"].append(column)

    # the check passes only when nothing was recorded
    return ValidationCheck(not failures, failures)
Exemplo n.º 3
0
def _check_unique_columns(
    df_profile: "pandas.DataFrame", *, require_unique_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Verifies that every listed column holds only unique values, using profile statistics

    Args:
        df_profile: DataFrame of profile information; it is looked up by column name for
            its "DistinctValueCount" and "RecordCount" entries
        require_unique_columns: names of the columns that must hold only unique values,
            e.g. a primary key column. None is treated as an empty list

    Returns:
        ValidationCheck object holding whether the check passed and a dict that lists the
        offending columns under "failed_unique_columns"
    """
    columns = [] if require_unique_columns is None else require_unique_columns
    failures = defaultdict(list)

    for column in columns:
        distinct_count = df_profile.loc[column, "DistinctValueCount"]
        record_count = df_profile.loc[column, "RecordCount"]
        if distinct_count == record_count:
            continue
        # the distinct count differs from the record count, so the column fails the check
        LOGGER.warning(
            f"column {column} has only {distinct_count} unique values out of {record_count} records"
        )
        failures["failed_unique_columns"].append(column)

    # the check passes only when nothing was recorded
    return ValidationCheck(not failures, failures)
Exemplo n.º 4
0
def _check_custom(
    df: "pandas.DataFrame",
    columns_to_check: Optional[List[str]],
    check_function: Callable[[Any], bool],
) -> ValidationCheck:
    """
    Checks that the values in a specified list of columns satisfy a custom check function

    Args:
        df: Dataframe
        columns_to_check: columns on which check_function will be applied. None is treated
            as an empty list, i.e. there is nothing to check
        check_function: function applied to each individual value of columns_to_check; a
            column fails unless all of its results are truthy (as evaluated by pandas
            `Series.all`)

    Returns:
        ValidationCheck object, with bool for whether all checks passed and dict of failing columns
    """
    failed_checks_dict = defaultdict(list)

    # the signature allows None; handle it the same way the other checks do
    if columns_to_check is None:
        columns_to_check = []

    # functools.partial objects and callable instances have no __name__; fall back to repr
    check_name = getattr(check_function, "__name__", repr(check_function))

    df_subset = df[columns_to_check]
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name is
    # deprecated. Look the method up on the class so that a column called "map" cannot be
    # mistaken for it on older pandas versions.
    if hasattr(type(df_subset), "map"):
        df_results = df_subset.map(check_function)
    else:
        df_results = df_subset.applymap(check_function)

    for col in columns_to_check:
        if not df_results[col].all():
            LOGGER.warning(f"column {col} failed custom check {check_name}")
            failed_checks_dict[f"failed custom check {check_name}"].append(col)

    passed = len(failed_checks_dict) == 0
    return ValidationCheck(passed, failed_checks_dict)
Exemplo n.º 5
0
def validate(
    df: "pandas.DataFrame",
    *,
    raise_error: bool = True,
    require_present_columns: Optional[List[str]] = None,
    require_unique_columns: Optional[List[str]] = None,
    require_nonnull_columns: Optional[List[str]] = None,
) -> ValidationCheck:
    """
    Performs validation checks on a DataFrame.
    Returns a dict of columns that fail each check, and optionally returns an error.
    Intended to be used on a DataFrame prior to upserting records into a Tamr dataset.

    A column listed in `require_present_columns` that is absent from the DataFrame is reported
    by the presence check only; the unique and non-null checks are skipped for it, because
    there are no values to inspect.

    Args:
        df: DataFrame
        raise_error: if True, will raise a ValueError on failing checks.  if False,
            will print Warning and return a dict
        require_present_columns: list of columns that are checked to be present
        require_unique_columns: list of columns that are checked to have all unique values,
            e.g. a primary key column
        require_nonnull_columns: list of columns that are checked to have all non-null values

    Returns:
        ValidationCheck object, with bool for whether all checks passed and dict of failing columns

    Raises:
        ValueError: if `raise_error` is set True, and any checks fail
    """
    failed_checks_dict = defaultdict(list)

    if require_present_columns is None:
        require_present_columns = []
    if require_unique_columns is None:
        require_unique_columns = []
    if require_nonnull_columns is None:
        require_nonnull_columns = []

    # compute profile stats
    df_profile = profile(df)

    # check for present columns
    failed_checks_dict.update(
        _check_present_columns(df, require_present_columns=require_present_columns).details
    )

    # Required columns that are absent from df are already reported by the presence check.
    # The profile-based checks look each column up in df_profile by name, which would
    # presumably raise a KeyError for an absent column and hide the real validation result,
    # so those columns are left out of the remaining checks.
    missing_required = [col for col in require_present_columns if col not in df.columns]
    unique_columns = [col for col in require_unique_columns if col not in missing_required]
    nonnull_columns = [col for col in require_nonnull_columns if col not in missing_required]

    # check for unique columns
    failed_checks_dict.update(
        _check_unique_columns(df_profile, require_unique_columns=unique_columns).details
    )

    # check for nonnull columns
    failed_checks_dict.update(
        _check_nonnull_columns(df_profile, require_nonnull_columns=nonnull_columns).details
    )

    # convert to dict for printing/return
    failed_checks_dict = dict(failed_checks_dict)
    passed = len(failed_checks_dict) == 0
    if not passed and raise_error:
        raise ValueError(
            "DataFrame validation failed.  failed columns for each check: "
            f"{failed_checks_dict}"
        )
    return ValidationCheck(passed, failed_checks_dict)