def check_uniqueness(
        data: Union[pd.DataFrame, pd.Series],
) -> pd.DataFrame:
    """Checks if unique data contains columns with duplicates.

    Args:
        data: Data to be checked for duplicates.

    Returns:
        DataFrame with bool(s) indicating if data contains duplicates, the
        count of duplicates present, and the proportion of duplicates.

    Raises:
        ValueError: If unique data contains nulls.
    """
    UniqueFeatures._validate_unique_dtype(data)
    is_df = _utils.check_if_df(data)
    err_message = 'Columns with unique data should not contain nulls.'
    n_rows = data.shape[0]
    if is_df:
        if data.isna().any(axis=None):
            raise ValueError(err_message)
        # Duplicates per column = total rows minus distinct values.
        count_dupes = n_rows - data.nunique(axis=0)
        is_dupes = count_dupes.astype(bool)
        prop_dupes = count_dupes / n_rows
    else:
        if data.isna().any():
            raise ValueError(err_message)
        count_dupes = n_rows - data.nunique()
        is_dupes = bool(count_dupes)
        prop_dupes = count_dupes / n_rows
    return _utils.result_to_df(data=is_dupes,
                               title='dupes_present',
                               dupe_count=count_dupes,
                               prop_dupe=prop_dupes)
def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
) -> pd.DataFrame:
    """Checks if binary data contains almost all the same value.

    Args:
        data: Binary data to be checked if almost all values are the same.
        thresh: Threshold for what proportion of data must be the same to
            fail check.

    Returns:
        DataFrame with bool(s) indicating if data contains all the same
        value, the value of threshold used to determine if mostly same, and
        the average value(s).

    Raises:
        ValueError: If `thresh` less than or equal to 0.0 or greater than or
            equal to 1.0.
    """
    _utils.validate_thresh(thresh)
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # For 0/1 data the mean is the proportion of ones; "mostly same" means
    # that proportion is near 1 (mostly ones) or near 0 (mostly zeros).
    lower_bound = 1 - thresh
    if is_df:
        mean = data.mean(axis=0)
        result = mean.ge(thresh) | mean.le(lower_bound)
    else:
        mean = data.mean()
        result = mean >= thresh or mean <= lower_bound
    return _utils.result_to_df(data=result,
                               title='mostly_same',
                               thresh=thresh,
                               mean=mean)
def check_fuzzy_nulls(
        data: Union[pd.DataFrame, pd.Series],
        add_fuzzy_nulls: Optional[List] = None,
) -> pd.DataFrame:
    """Checks if DataFrame contains values commonly used to denote nulls
    (fuzzy nulls).

    Args:
        data: Data to be checked for fuzzy nulls.
        add_fuzzy_nulls: Additional items to check as fuzzy nulls.

    Returns:
        DataFrame with bool(s) indicating if data contains any fuzzy nulls,
        count of the fuzzy nulls present, and the proportion of fuzzy nulls.
    """
    is_df = _utils.check_if_df(data)
    fuzzy_nulls = ['null', 'Null', 'NULL', '', ' ']
    if add_fuzzy_nulls is not None:
        fuzzy_nulls.extend(add_fuzzy_nulls)
    # Compute the membership mask once; the original recomputed `isin`
    # for both the any() and sum() aggregations.
    fuzzy_mask = data.isin(fuzzy_nulls)
    is_fuzzy_nulls = fuzzy_mask.any(axis=0)
    count_fuzzy_nulls = fuzzy_mask.sum(axis=0)
    if is_df:
        prop_fuzzy_nulls = count_fuzzy_nulls.divide(data.shape[0])
    else:
        prop_fuzzy_nulls = count_fuzzy_nulls / data.shape[0]
    result = _utils.result_to_df(
        data=is_fuzzy_nulls,
        title='fuzzy_nulls_present',
        fuzzy_null_count=count_fuzzy_nulls,
        prop_fuzzy_null=prop_fuzzy_nulls,
    )
    return result
def check_outside_range(
        data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if binary data contains columns where min is less than 0 or
    max is greater than 1.

    Args:
        data: Binary data to be checked if any values are less than 0 or
            greater than 1.

    Returns:
        DataFrame with bool(s) indicating if data contains any values
        outside of the expected range.
    """
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # Values are in range iff the column's extremes stay within [0, 1].
    if is_df:
        below = data.min(axis=0) < 0
        above = data.max(axis=0) > 1
        result = below | above
    else:
        result = data.min() < 0 or data.max() > 1
    return _utils.result_to_df(data=result, title='outside_range')
def check_all_same(
        data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if binary data contains all the same value.

    Args:
        data: Binary data to be checked if all values are the same.

    Returns:
        DataFrame with bool(s) indicating if data contains all the same
        value.
    """
    # Fixed malformed return annotation: `Union[pd.DataFrame]` is a
    # single-member Union, equivalent to (and now written as) pd.DataFrame.
    is_df = _utils.check_if_df(data)
    BinaryFeatures._validate_binary_dtype(data)
    # All values in a column are equal iff its min equals its max.
    if is_df:
        result = data.min(axis=0).eq(data.max(axis=0))
    else:
        result = data.min() == data.max()
    return _utils.result_to_df(result, title='all_same')
def check_n_categories(
        data: Union[pd.DataFrame, pd.Series],
        dropna: bool = False,
) -> pd.DataFrame:
    """Counts the number of categories.

    Args:
        data: Data to count categories for.
        dropna: If True: ignores nulls, if False: counts nulls as a
            category.

    Returns:
        DataFrame with count(s) of categories.
    """
    CategoricalFeatures._validate_categorical_dtype(data)
    # DataFrame input counts per column; Series input yields a scalar.
    if _utils.check_if_df(data):
        n_unique = data.nunique(axis=0, dropna=dropna)
    else:
        n_unique = data.nunique(dropna=dropna)
    return _utils.result_to_df(n_unique, title='n_categories')
def check_mostly_same(
        data: Union[pd.DataFrame, pd.Series],
        thresh: float = 0.95,
        dropna: bool = False,
) -> pd.DataFrame:
    """Checks if categorical data contains almost all the same category.

    Args:
        data: Categorical data to be checked if almost all the same
            category.
        thresh: Threshold for what proportion of data must be the same
            category to fail check.
        dropna: If True: ignores nulls, if False: counts nulls as a
            category.

    Returns:
        DataFrame with bool(s) indicating if data contains almost all the
        same category, the value of threshold used to determine if mostly
        same, the most common category, the count of the most common
        category, and the proportion of the most common category.

    Raises:
        ValueError: If `thresh` less than or equal to 0.0 or greater than or
            equal to 1.0.
    """
    _utils.validate_thresh(thresh)
    CategoricalFeatures._validate_categorical_dtype(data)
    is_df = _utils.check_if_df(data)
    if is_df:
        most_common = data.mode(axis=0, dropna=dropna).loc[0, :]
        count_common = data.eq(most_common).sum(axis=0)
        prop_common = count_common.divide(data.shape[0])
        mostly_same = prop_common.ge(thresh)
    else:
        # Bug fix: the Series branch previously called data.mode() without
        # forwarding `dropna`, so nulls were always ignored for Series input
        # even when dropna=False (the DataFrame branch did forward it).
        most_common = data.mode(dropna=dropna)[0]
        # NOTE(review): eq() never matches NaN against NaN, so if the mode
        # itself is a null the count can undercount — confirm desired
        # handling upstream.
        count_common = data.eq(most_common).sum()
        prop_common = count_common / data.shape[0]
        mostly_same = prop_common >= thresh
    result = _utils.result_to_df(
        mostly_same,
        title='mostly_same',
        thresh=thresh,
        most_common=most_common,
        count=count_common,
        prop=prop_common,
    )
    return result
def check_nulls(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Checks if data contains nulls.

    Args:
        data: Data to be checked for nulls.

    Returns:
        DataFrame with bool(s) indicating if data contains any nulls, count
        of the nulls present, and the proportion of nulls.
    """
    is_df = _utils.check_if_df(data)
    # Compute the null mask once; the original recomputed `isna` for both
    # the any() and sum() aggregations.
    null_mask = data.isna()
    is_nulls = null_mask.any(axis=0)
    count_nulls = null_mask.sum(axis=0)
    if is_df:
        prop_nulls = count_nulls.divide(data.shape[0])
    else:
        prop_nulls = count_nulls / data.shape[0]
    result = _utils.result_to_df(data=is_nulls,
                                 title='nulls_present',
                                 null_count=count_nulls,
                                 prop_null=prop_nulls)
    return result