def check_uniqueness(
        data: Union[pd.DataFrame, pd.Series],
) -> pd.DataFrame:
    """Checks if unique data contains columns with duplicates.

    Args:
        data: Data to be checked for duplicates.

    Returns:
        DataFrame with bool(s) indicating if data contains duplicates, the
        count of duplicates present, and the proportion of duplicates.

    Raises:
        ValueError: If unique data contains nulls.
    """
    UniqueFeatures._validate_unique_dtype(data)
    is_df = _utils.check_if_df(data)
    err_message = 'Columns with unique data should not contain nulls.'
    n_rows = data.shape[0]
    if is_df:
        if data.isna().any(axis=None):
            raise ValueError(err_message)
        # Duplicates per column = total rows minus distinct values.
        count_dupes = n_rows - data.nunique(axis=0)
        is_dupes = count_dupes.astype(bool)
        prop_dupes = count_dupes / n_rows
    else:
        if data.isna().any():
            raise ValueError(err_message)
        count_dupes = n_rows - data.nunique()
        is_dupes = bool(count_dupes)
        prop_dupes = count_dupes / n_rows
    return _utils.result_to_df(data=is_dupes,
                               title='dupes_present',
                               dupe_count=count_dupes,
                               prop_dupe=prop_dupes)
def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
) -> pd.DataFrame:
    """Checks if binary data contains almost all the same value.

    Args:
        data: Binary data to be checked if almost all values are the same.
        thresh: Threshold for what proportion of data must be the same to
            fail check.

    Returns:
        DataFrame with bool(s) indicating if data contains all the same
        value, the value of threshold used to determine if mostly same, and
        the average value(s).

    Raises:
        ValueError: If `thresh` less than or equal to 0.0 or greater than or
            equal to 1.0.
    """
    _utils.validate_thresh(thresh)
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # For 0/1 data the mean is the proportion of ones; "mostly same" means
    # that proportion is near 1 (mostly ones) or near 0 (mostly zeros).
    lower_bound = 1 - thresh
    if is_df:
        mean = data.mean(axis=0)
        result = mean.ge(thresh) | mean.le(lower_bound)
    else:
        mean = data.mean()
        result = mean >= thresh or mean <= lower_bound
    return _utils.result_to_df(data=result,
                               title='mostly_same',
                               thresh=thresh,
                               mean=mean)
def check_fuzzy_nulls(
        data: Union[pd.DataFrame, pd.Series],
        add_fuzzy_nulls: Optional[List] = None,
) -> pd.DataFrame:
    """Checks if DataFrame contains values commonly used to denote nulls
    (fuzzy nulls).

    Args:
        data: Data to be checked for fuzzy nulls.
        add_fuzzy_nulls: Additional items to check as fuzzy nulls.

    Returns:
        DataFrame with bool(s) indicating if data contains any fuzzy nulls,
        count of the fuzzy nulls present, and the proportion of fuzzy nulls.
    """
    is_df = _utils.check_if_df(data)
    fuzzy_nulls = ['null', 'Null', 'NULL', '', ' ']
    if add_fuzzy_nulls is not None:
        fuzzy_nulls.extend(add_fuzzy_nulls)
    # Compute the membership mask once; the original recomputed `isin`
    # for both the any() and sum() aggregations.
    fuzzy_mask = data.isin(fuzzy_nulls)
    is_fuzzy_nulls = fuzzy_mask.any(axis=0)
    count_fuzzy_nulls = fuzzy_mask.sum(axis=0)
    if is_df:
        prop_fuzzy_nulls = count_fuzzy_nulls.divide(data.shape[0])
    else:
        prop_fuzzy_nulls = count_fuzzy_nulls / data.shape[0]
    result = _utils.result_to_df(
        data=is_fuzzy_nulls,
        title='fuzzy_nulls_present',
        fuzzy_null_count=count_fuzzy_nulls,
        prop_fuzzy_null=prop_fuzzy_nulls,
    )
    return result
def check_outside_range(
        data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if binary data contains columns where min is less than 0 or
    max is greater than 1.

    Args:
        data: Binary data to be checked if any values are less than 0 or
            greater than 1.

    Returns:
        DataFrame with bool(s) indicating if data contains any values
        outside of the expected range.
    """
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # Values are in range iff the column's extremes stay within [0, 1].
    if is_df:
        below = data.min(axis=0) < 0
        above = data.max(axis=0) > 1
        result = below | above
    else:
        result = data.min() < 0 or data.max() > 1
    return _utils.result_to_df(data=result, title='outside_range')
def check_all_same(
        data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if binary data contains all the same value.

    Args:
        data: Binary data to be checked if all values are the same.

    Returns:
        DataFrame with bool(s) indicating if data contains all the same
        value.
    """
    # Fixed malformed return annotation: `Union[pd.DataFrame]` is a
    # single-member Union, equivalent to (and now written as) pd.DataFrame.
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # All values in a column are equal iff its min equals its max.
    if is_df:
        result = data.min(axis=0).eq(data.max(axis=0))
    else:
        result = data.min() == data.max()
    return _utils.result_to_df(result, title='all_same')
def check_n_categories(
        data: Union[pd.DataFrame, pd.Series],
        dropna: bool = False,
) -> pd.DataFrame:
    """Counts the number of categories.

    Args:
        data: Data to count categories for.
        dropna: If True: ignores nulls, if False: counts nulls as a
            category.

    Returns:
        DataFrame with count(s) of categories.
    """
    CategoricalFeatures._validate_categorical_dtype(data)
    # DataFrame input counts per column; Series input yields a scalar.
    if _utils.check_if_df(data):
        n_unique = data.nunique(axis=0, dropna=dropna)
    else:
        n_unique = data.nunique(dropna=dropna)
    return _utils.result_to_df(n_unique, title='n_categories')
def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
        dropna: bool = False,
) -> pd.DataFrame:
    """Checks if categorical data contains almost all the same category.

    Args:
        data: Categorical data to be checked if almost all the same
            category.
        thresh: Threshold for what proportion of data must be the same
            category to fail check.
        dropna: If True: ignores nulls, if False: counts nulls as a
            category.

    Returns:
        DataFrame with bool(s) indicating if data contains almost all the
        same category, the value of threshold used to determine if mostly
        same, the most common category, the count of the most common
        category, and the proportion of the most common category.

    Raises:
        ValueError: If `thresh` less than or equal to 0.0 or greater than or
            equal to 1.0.
    """
    _utils.validate_thresh(thresh)
    CategoricalFeatures._validate_categorical_dtype(data)
    is_df = _utils.check_if_df(data)
    if is_df:
        most_common = data.mode(axis=0, dropna=dropna).loc[0, :]
        count_common = data.eq(most_common).sum(axis=0)
        prop_common = count_common.divide(data.shape[0])
        mostly_same = prop_common.ge(thresh)
    else:
        # Bug fix: the Series branch previously called data.mode() without
        # forwarding `dropna`, so nulls were always ignored for Series input
        # even when dropna=False (the DataFrame branch did forward it).
        most_common = data.mode(dropna=dropna)[0]
        # NOTE(review): eq() never matches NaN against NaN, so if the mode
        # itself is a null the count can undercount — confirm desired
        # handling upstream.
        count_common = data.eq(most_common).sum()
        prop_common = count_common / data.shape[0]
        mostly_same = prop_common >= thresh
    result = _utils.result_to_df(
        mostly_same,
        title='mostly_same',
        thresh=thresh,
        most_common=most_common,
        count=count_common,
        prop=prop_common,
    )
    return result
def check_nulls(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if data contains nulls.

    Args:
        data: Data to be checked for nulls.

    Returns:
        DataFrame with bool(s) indicating if data contains any nulls, count
        of the nulls present, and the proportion of nulls.
    """
    is_df = _utils.check_if_df(data)
    # Compute the null mask once; the original recomputed `isna` for both
    # the any() and sum() aggregations.
    null_mask = data.isna()
    is_nulls = null_mask.any(axis=0)
    count_nulls = null_mask.sum(axis=0)
    if is_df:
        prop_nulls = count_nulls.divide(data.shape[0])
    else:
        prop_nulls = count_nulls / data.shape[0]
    result = _utils.result_to_df(data=is_nulls,
                                 title='nulls_present',
                                 null_count=count_nulls,
                                 prop_null=prop_nulls)
    return result