def _check_nonnull_columns(
    df_profile: "pandas.DataFrame", *, require_nonnull_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Checks that a specified list of columns in a DataFrame have all non-null values

    Args:
        df_profile: DataFrame containing profile information, indexed by column name with
            an "EmptyValueCount" column
        require_nonnull_columns: list of columns that are checked to have all non-null values

    Returns:
        ValidationCheck object, with bool for whether check passed and dict of failing columns
    """
    failures = defaultdict(list)
    # An omitted list means there is nothing to check
    columns = require_nonnull_columns if require_nonnull_columns is not None else []

    # Flag every column whose profile reports one or more empty values
    for column in columns:
        null_count = df_profile.loc[column, "EmptyValueCount"]
        if null_count > 0:
            LOGGER.warning(f"column {column} has {null_count} null values")
            failures["failed_nonnull_columns"].append(column)

    return ValidationCheck(len(failures) == 0, failures)
def _check_present_columns(
    df: "pandas.DataFrame", *, require_present_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Checks that a specified list of columns in a DataFrame are all present

    Args:
        df: DataFrame
        require_present_columns: list of columns that are checked to be present

    Returns:
        ValidationCheck object, with bool for whether check passed and dict of failing columns
    """
    failures = defaultdict(list)
    # An omitted list means there is nothing to check
    required = require_present_columns if require_present_columns is not None else []

    # Collect columns the DataFrame is missing, preserving the caller's order
    missing = [column for column in required if column not in df.columns]
    for column in missing:
        LOGGER.warning(f"column {column} is not present")
        failures["failed_present_columns"].append(column)

    return ValidationCheck(len(failures) == 0, failures)
def _check_unique_columns(
    df_profile: "pandas.DataFrame", *, require_unique_columns: Optional[List[str]] = None
) -> ValidationCheck:
    """
    Checks that a specified list of columns in a DataFrame have all unique values

    Args:
        df_profile: DataFrame containing profile information, indexed by column name with
            "DistinctValueCount" and "RecordCount" columns
        require_unique_columns: list of columns that are checked to have all unique values,
            e.g. a primary key column

    Returns:
        ValidationCheck object, with bool for whether check passed and dict of failing columns
    """
    failed_checks_dict = defaultdict(list)
    if require_unique_columns is None:
        require_unique_columns = []

    # check for unique columns: a column is unique iff its distinct-value count
    # equals the total record count in the profile
    for col in require_unique_columns:
        num_unique = df_profile.loc[col, "DistinctValueCount"]
        num_records = df_profile.loc[col, "RecordCount"]
        # idiom fix: `num_unique != num_records` instead of `not num_unique == num_records`
        if num_unique != num_records:
            LOGGER.warning(
                f"column {col} has only {num_unique} unique values out of {num_records} records"
            )
            failed_checks_dict["failed_unique_columns"].append(col)

    passed = len(failed_checks_dict) == 0
    return ValidationCheck(passed, failed_checks_dict)
def _check_custom(
    df: "pandas.DataFrame",
    columns_to_check: Optional[List[str]],
    check_function: Callable[[Any], bool],
) -> ValidationCheck:
    """
    Checks that check_function holds for every value in the specified columns

    Args:
        df: Dataframe
        columns_to_check: columns on which check_function will be applied; None means
            no columns are checked
        check_function: function applied element-wise on columns_to_check; should return
            a truthy value for valid entries

    Returns:
        ValidationCheck object, with bool for whether all checks passed and dict of failing columns
    """
    failed_checks_dict = defaultdict(list)
    # Bug fix: columns_to_check is typed Optional but None previously crashed with
    # `df[None]`. Treat None like the other checkers do: nothing to check.
    if columns_to_check is None:
        columns_to_check = []

    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favor of
    # DataFrame.map; kept here for compatibility with older pandas versions.
    results = df[columns_to_check].applymap(check_function)
    for col in columns_to_check:
        if not results[col].all():
            LOGGER.warning(
                f"column {col} failed custom check {check_function.__name__}")
            failed_checks_dict[
                f"failed custom check {check_function.__name__}"].append(col)

    passed = len(failed_checks_dict) == 0
    return ValidationCheck(passed, failed_checks_dict)
def validate(
    df: "pandas.DataFrame",
    *,
    raise_error: bool = True,
    require_present_columns: Optional[List[str]] = None,
    require_unique_columns: Optional[List[str]] = None,
    require_nonnull_columns: Optional[List[str]] = None,
) -> ValidationCheck:
    """
    Performs validation checks on a DataFrame. Returns a dict of columns that fail each
    check, and optionally raises an error. Intended to be used on a DataFrame prior to
    upserting records into a Tamr dataset.

    Args:
        df: DataFrame
        raise_error: if True, will raise a ValueError on failing checks. if False, will
            print Warning and return a dict
        require_present_columns: list of columns that are checked to be present
        require_unique_columns: list of columns that are checked to have all unique values,
            e.g. a primary key column
        require_nonnull_columns: list of columns that are checked to have all non-null values

    Returns:
        ValidationCheck object, with bool for whether all checks passed and dict of
        failing columns

    Raises:
        ValueError: if `raise_error` is set True, and any checks fail
    """
    failed_checks_dict = defaultdict(list)
    if require_present_columns is None:
        require_present_columns = []
    if require_unique_columns is None:
        require_unique_columns = []
    if require_nonnull_columns is None:
        require_nonnull_columns = []

    # compute profile stats once; the uniqueness and non-null checks both read from it
    df_profile = profile(df)

    # check for present columns
    failed_checks_dict.update(
        _check_present_columns(df, require_present_columns=require_present_columns).details
    )

    # check for unique columns
    failed_checks_dict.update(
        _check_unique_columns(df_profile, require_unique_columns=require_unique_columns).details
    )

    # check for nonnull columns
    failed_checks_dict.update(
        _check_nonnull_columns(df_profile, require_nonnull_columns=require_nonnull_columns).details
    )

    # convert to dict for printing/return
    failed_checks_dict = dict(failed_checks_dict)
    passed = len(failed_checks_dict) == 0

    if not passed and raise_error:
        # fix: the message was split by a stray line break inside the f-string
        # (a syntax error); also dropped the redundant parentheses around the raise
        raise ValueError(
            f"DataFrame validation failed. failed columns for each check: "
            f"{failed_checks_dict}"
        )

    return ValidationCheck(passed, failed_checks_dict)