예제 #1
0
def refine_percentiles(percentiles: Sequence[float] | None) -> Sequence[float]:
    """Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.
    """
    if percentiles is None:
        # error: Incompatible return value type (got "ndarray", expected
        # "Sequence[float]")
        return np.array([0.25, 0.5, 0.75])  # type: ignore[return-value]

    # explicit conversion of `percentiles` to list
    percentiles = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(percentiles)

    # median should always be included
    if 0.5 not in percentiles:
        percentiles.append(0.5)

    # error: Incompatible types in assignment (expression has type "ndarray", variable
    # has type "Optional[Sequence[float]]")
    percentiles = np.asarray(percentiles)  # type: ignore[assignment]

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")

    return unique_pcts
예제 #2
0
파일: describe.py 프로젝트: rhuille/pandas
def refine_percentiles(
        percentiles: Optional[Sequence[float]]) -> Sequence[float]:
    """Normalize a percentile specification to sorted, unique values.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects the
        default quartiles ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    numpy.ndarray
        Ascending percentiles that always contain the median (0.5).

    Raises
    ------
    ValueError
        If a percentile appears more than once in the input.
    """
    if percentiles is None:
        quartiles = [0.25, 0.5, 0.75]
        return np.array(quartiles)

    # Copy into a list: the input may be any iterable and must not be
    # modified in place.
    wanted = list(percentiles)

    # Range validation is delegated to the shared helper.
    validate_percentile(wanted)

    # Always report the median.
    if 0.5 not in wanted:
        wanted.append(0.5)

    wanted_arr = np.asarray(wanted)
    distinct = np.unique(wanted_arr)  # sorted as a side effect

    # Fewer distinct values than supplied ones means a repeated entry.
    if len(distinct) < len(wanted_arr):
        raise ValueError("percentiles cannot contain duplicates")

    return distinct
예제 #3
0
def describe(data, percentiles=None, include=None, exclude=None):
    """Compute an extended descriptive summary of a Series or DataFrame.

    In addition to mean/std/min/percentiles/max, every column reports its
    number of records, unique values, nulls, zeros (numeric data) or empty
    strings (categorical data), each together with its rate relative to the
    number of records.

    Parameters
    ----------
    data : Series or DataFrame
        Object to summarize.
    percentiles : list-like of numbers, optional
        Percentiles to include in the output. Defaults to
        ``[0.25, 0.5, 0.75]``; the median (0.5) is always added.
    include : 'all', list-like of dtypes or None (default), optional
        Passed to ``select_dtypes``. ``'all'`` keeps every column. Ignored
        for 1-D input.
    exclude : list-like of dtypes or None (default), optional
        Passed to ``select_dtypes``. Ignored for 1-D input.

    Returns
    -------
    Series or DataFrame
        A Series of statistics for 1-D input; otherwise a DataFrame with one
        row per described column and one column per statistic.

    Raises
    ------
    ValueError
        If ``data`` is a DataFrame without columns, if ``percentiles``
        contains duplicates, or if ``exclude`` is given together with
        ``include='all'``.
    """
    if data.ndim == 2 and data.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is None:
        percentiles = np.array([0.25, 0.5, 0.75])
    else:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def rate(count, total):
        # An empty input has no meaningful rate; return NaN instead of
        # letting plain-int division raise ZeroDivisionError.
        return count / total if total else np.nan

    def describe_numeric_1d(series):
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "zeros",
            "zeros_rate", "mean", "std", "min"
        ] + formatted_percentiles + ["max"])
        record = series.shape[0]
        objcounts = series.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = series.isnull().sum()
        zeros = record - np.count_nonzero(series)
        d = ([
            record, count_unique, rate(count_unique, record), count_null,
            rate(count_null, record), zeros, rate(zeros, record),
            series.mean(),
            series.std(),
            series.min()
        ] + series.quantile(percentiles).tolist() + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = [
            "record", "unique", "unique_rate", "null", "null_rate", "empty",
            "empty_rate", "top", "top_freq", "top_rate", "bottom",
            "bottom_freq", "bottom_rate"
        ]
        record = data.shape[0]
        objcounts = data.value_counts()
        nobjcounts = objcounts[objcounts != 0]
        count_unique = len(nobjcounts)
        count_null = data.isnull().sum()
        # vectorised count of empty strings
        empty = (data == "").sum()
        result = [
            record, count_unique, rate(count_unique, record), count_null,
            rate(count_null, record), empty, rate(empty, record)
        ]
        dtype = None
        if count_unique > 0:
            # value_counts() sorts by descending frequency by default, so
            # the first entry is the most frequent value and the last one
            # the least frequent.
            top, top_freq = nobjcounts.index[0], nobjcounts.iloc[0]
            bottom, bottom_freq = nobjcounts.index[-1], nobjcounts.iloc[-1]
            result += [
                top, top_freq, rate(top_freq, record), bottom, bottom_freq,
                rate(bottom_freq, record)
            ]
        else:
            # No non-null values at all: fill the top/bottom statistics
            # with NaN to maintain output shape consistency.
            result += [np.nan] * 6
            dtype = "object"

        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "mean",
            "min"
        ] + formatted_percentiles + ["max"])
        record = data.shape[0]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = data.isnull().sum()
        d = ([
            record, count_unique, rate(count_unique, record), count_null,
            rate(count_null, record),
            data.mean(),
            data.min()
        ] + data.quantile(percentiles).tolist() + [data.max()])
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        # Booleans are checked before the numeric test so they are
        # summarized as categories.
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data):
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if data.ndim == 1:
        return describe_1d(data)

    if (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics; otherwise fall
        # back to describing every column of the original frame
        numeric_only = data.select_dtypes(include=[np.number])
        if len(numeric_only.columns) > 0:
            data = numeric_only
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
    else:
        data = data.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows: labels of the shortest summaries
    # first, each label kept once in first-seen order
    names: List[Optional[Hashable]] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc],
                  axis=1,
                  sort=False)
    d.columns = data.columns.copy()
    # one row per described column, one column per statistic
    return d.transpose()
예제 #4
0
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Summarize a Series or DataFrame with descriptive statistics.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.

    Raises
    ------
    ValueError
        If ``obj`` is a DataFrame without columns, if ``percentiles``
        contains duplicates, or if ``exclude`` is given together with
        ``include='all'``.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is None:
        quantiles = np.array([0.25, 0.5, 0.75])
    else:
        # copy into a list so the caller's sequence is left untouched
        requested = list(percentiles)

        # range check (values are expected to lie in [0, 1])
        validate_percentile(requested)

        # the median is always reported
        if 0.5 not in requested:
            requested.append(0.5)
        quantiles = np.asarray(requested)

    # np.unique sorts; a shorter result reveals repeated entries
    deduped = np.unique(quantiles)
    if len(deduped) < len(quantiles):
        raise ValueError("percentiles cannot contain duplicates")
    quantiles = deduped

    quantile_labels = format_percentiles(quantiles)

    # NOTE: the helpers below stay nested and are invoked through
    # ``summarize_column`` because the deprecation warning uses hard-coded
    # stacklevels that depend on this exact call depth.

    def summarize_numeric(column) -> "Series":
        from pandas import Series

        leading = [column.count(), column.mean(), column.std(), column.min()]
        middle = column.quantile(quantiles).tolist()
        values = leading + middle + [column.max()]
        labels = ["count", "mean", "std", "min"] + quantile_labels + ["max"]
        return Series(values, index=labels, name=column.name)

    def summarize_categorical(column) -> "Series":
        from pandas import Series

        value_counts = column.value_counts()
        n_unique = len(value_counts[value_counts != 0])
        labels = ["count", "unique"]
        values = [column.count(), n_unique]

        if n_unique == 0:
            # No observed values: fill 'top' and 'freq' with NaN so the
            # output keeps a consistent shape.
            labels += ["top", "freq"]
            values += [np.nan, np.nan]
            return Series(values, index=labels, name=column.name, dtype="object")

        most_common = value_counts.index[0]
        most_common_freq = value_counts.iloc[0]

        if not is_datetime64_any_dtype(column.dtype):
            labels += ["top", "freq"]
            values += [most_common, most_common_freq]
            return Series(values, index=labels, name=column.name, dtype=None)

        # Datetime data summarized as categorical (deprecated path).
        warnings.warn(
            "Treating datetime data as categorical rather than numeric in "
            "`.describe` is deprecated and will be removed in a future "
            "version of pandas. Specify `datetime_is_numeric=True` to "
            "silence this warning and adopt the future behavior now.",
            FutureWarning,
            stacklevel=5 if obj.ndim == 1 else 6,
        )
        tz = column.dt.tz
        as_i8 = column.dropna().values.view("i8")
        most_common = Timestamp(most_common)
        if most_common.tzinfo is None or tz is None:
            most_common = most_common.tz_localize(tz)
        else:
            # Don't tz_localize(None) if key is already tz-aware
            most_common = most_common.tz_convert(tz)
        first_seen = Timestamp(as_i8.min(), tz=tz)
        last_seen = Timestamp(as_i8.max(), tz=tz)
        labels += ["top", "freq", "first", "last"]
        values += [most_common, most_common_freq, first_seen, last_seen]
        return Series(values, index=labels, name=column.name, dtype=None)

    def summarize_timestamp(column) -> "Series":
        # GH-30164
        from pandas import Series

        leading = [column.count(), column.mean(), column.min()]
        middle = column.quantile(quantiles).tolist()
        values = leading + middle + [column.max()]
        labels = ["count", "mean", "min"] + quantile_labels + ["max"]
        return Series(values, index=labels, name=column.name)

    def summarize_column(column) -> "Series":
        # Booleans are tested before the numeric check so they are
        # summarized as categories.
        if is_bool_dtype(column.dtype):
            return summarize_categorical(column)
        if is_numeric_dtype(column):
            return summarize_numeric(column)
        if is_datetime64_any_dtype(column.dtype) and datetime_is_numeric:
            return summarize_timestamp(column)
        if is_timedelta64_dtype(column.dtype):
            return summarize_numeric(column)
        return summarize_categorical(column)

    if obj.ndim == 1:
        # Incompatible return value type
        #  (got "Series", expected "FrameOrSeries")  [return-value]
        return summarize_column(obj)  # type:ignore[return-value]

    if (include is None) and (exclude is None):
        # Prefer numeric columns; fall back to every column when the frame
        # has none.
        numeric_kinds = [np.number]
        if datetime_is_numeric:
            numeric_kinds.append("datetime")
        data = obj.select_dtypes(include=numeric_kinds)
        if len(data.columns) == 0:
            data = obj
    elif include == "all":
        if exclude is not None:
            raise ValueError("exclude must be None when include is 'all'")
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    # Kept as a list comprehension: see the stacklevel note above.
    summaries = [summarize_column(col) for _label, col in data.items()]

    # Row order: labels of the shortest summaries first, every label kept
    # once in first-seen order.
    names: List[Hashable] = []
    for summary_index in sorted((s.index for s in summaries), key=len):
        for label in summary_index:
            if label not in names:
                names.append(label)

    aligned = [s.reindex(names, copy=False) for s in summaries]
    d = concat(aligned, axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
예제 #5
0
파일: describe.py 프로젝트: Beatmonk/pandas
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Produce the descriptive summary of a Series or DataFrame.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.

    Raises
    ------
    ValueError
        If ``obj`` is a DataFrame without columns, if ``percentiles``
        contains duplicates, or if ``exclude`` is given together with
        ``include='all'``.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is None:
        quantiles = np.array([0.25, 0.5, 0.75])
    else:
        # private copy of the caller's percentiles
        wanted = list(percentiles)

        # range check (values are expected to lie in [0, 1])
        validate_percentile(wanted)

        # the median is always part of the output
        if 0.5 not in wanted:
            wanted.append(0.5)
        quantiles = np.asarray(wanted)

    # np.unique sorts; losing entries means the input had duplicates
    distinct = np.unique(quantiles)
    if len(distinct) < len(quantiles):
        raise ValueError("percentiles cannot contain duplicates")
    quantiles = distinct

    if obj.ndim == 1:
        # Incompatible return value type
        #  (got "Series", expected "FrameOrSeries")  [return-value]
        return describe_1d(  # type:ignore[return-value]
            obj, quantiles, datetime_is_numeric, is_series=True
        )

    if (include is None) and (exclude is None):
        # Prefer numeric columns; describe everything when there are none.
        numeric_kinds = [np.number]
        if datetime_is_numeric:
            numeric_kinds.append("datetime")
        selected = obj.select_dtypes(include=numeric_kinds)
        if len(selected.columns) == 0:
            selected = obj
    elif include == "all":
        if exclude is not None:
            raise ValueError("exclude must be None when include is 'all'")
        selected = obj
    else:
        selected = obj.select_dtypes(include=include, exclude=exclude)

    summaries = [
        describe_1d(column, quantiles, datetime_is_numeric, is_series=False)
        for _label, column in selected.items()
    ]

    # Row order: labels of the shortest summaries first, every label kept
    # once in first-seen order.
    names: List[Hashable] = []
    for summary_index in sorted((s.index for s in summaries), key=len):
        for label in summary_index:
            if label not in names:
                names.append(label)

    aligned = [s.reindex(names, copy=False) for s in summaries]
    d = concat(aligned, axis=1, sort=False)
    d.columns = selected.columns.copy()
    return d