Example #1
def describe_numeric_1d(series: Series,
                        percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # error: Argument 1 to "format_percentiles" has incompatible type "Sequence[float]";
    # expected "Union[ndarray, List[Union[int, float]], List[float], List[Union[str,
    # float]]]"
    formatted_percentiles = format_percentiles(
        percentiles)  # type: ignore[arg-type]

    stat_index = ["count", "mean", "std", "min"
                  ] + formatted_percentiles + ["max"]
    d = ([series.count(),
          series.mean(),
          series.std(),
          series.min()] + series.quantile(percentiles).tolist() +
         [series.max()])
    return Series(d, index=stat_index, name=series.name)
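
For reference, format_percentiles is the pandas-internal helper that turns fractional percentiles into the row labels used above (e.g. [0.25, 0.5, 0.75] becomes ['25%', '50%', '75%']). A minimal sketch, assuming a pandas version that still exposes the helper at pandas.io.formats.format; the toy Series is made up:

import pandas as pd
from pandas.io.formats.format import format_percentiles  # private pandas helper

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0], name="x")
pcts = [0.25, 0.5, 0.75]

# The helper produces the labels that describe_numeric_1d splices into its index.
print(format_percentiles(pcts))                      # ['25%', '50%', '75%']

# The public API builds the same layout.
print(s.describe(percentiles=pcts).index.tolist())
# ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']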
Example #2
def posterior_summary(self, q=[0.05, 0.95], plot=False):
    # Draw posterior samples lazily on first use.
    if self.posterior_df is None:
        self._posterior_sample_df()
    summary = self.posterior_df.describe(percentiles=q).T
    if not plot:
        return summary
    else:
        # Plot each variable's posterior mean, with a horizontal line
        # spanning the interval labelled by format_percentiles(q).
        sns.scatterplot(x=summary['mean'], y=summary.index)
        for i, var in enumerate(summary.index):
            sns.lineplot(x=summary.loc[var, format_percentiles(q)],
                         y=[var, var], color='k')
        plt.xlabel('')
        plt.show()
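
The selection summary.loc[var, format_percentiles(q)] above works because DataFrame.describe(percentiles=q) labels its percentile rows (columns after .T) with exactly the strings format_percentiles produces. A standalone sketch of that matching, with a made-up frame standing in for the posterior_df attribute (the seaborn/matplotlib plotting is left out):

import numpy as np
import pandas as pd
from pandas.io.formats.format import format_percentiles

rng = np.random.default_rng(0)
posterior_df = pd.DataFrame({"alpha": rng.normal(size=1000),
                             "beta": rng.normal(1.0, 0.5, size=1000)})

q = [0.05, 0.95]
summary = posterior_df.describe(percentiles=q).T

# format_percentiles(q) -> ['5%', '95%'], matching the summary columns,
# so each variable's interval endpoints can be pulled out directly.
print(summary.loc["alpha", format_percentiles(q)])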
Example #3
File: describe.py Project: rhuille/pandas
def describe_timestamp_1d(data: "Series",
                          percentiles: Sequence[float]) -> "Series":
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
    d = ([data.count(), data.mean(), data.min()] +
         data.quantile(percentiles).tolist() + [data.max()])
    return Series(d, index=stat_index, name=data.name)
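
In the pandas versions that carry this helper (roughly 1.1 through 1.5), the public entry point is the datetime_is_numeric flag on describe(); a brief sketch, assuming one of those versions:

import pandas as pd

ts = pd.Series(pd.date_range("2021-01-01", periods=5, freq="D"), name="when")

# Routes the series through the datetime path: count, mean, min,
# the formatted percentiles, and max, with timestamps for everything but count.
print(ts.describe(datetime_is_numeric=True))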
Example #4
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    formatted_percentiles = format_percentiles(percentiles)

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + series.quantile(percentiles).tolist()
        + [series.max()]
    )
    return Series(d, index=stat_index, name=series.name)
Example #5
def describe(data, percentiles=None, include=None, exclude=None):
    if data.ndim == 2 and data.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series):
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "zeros",
            "zeros_rate", "mean", "std", "min"
        ] + formatted_percentiles + ["max"])
        record = series.shape[0]
        objcounts = series.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = series.isnull().sum()
        zeros = record - np.count_nonzero(series)
        d = ([
            record, count_unique, count_unique / record, count_null,
            count_null / record, zeros, zeros / record,
            series.mean(),
            series.std(),
            series.min()
        ] + series.quantile(percentiles).tolist() + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = [
            "record", "unique", "unique_rate", "null", "null_rate", "empty",
            "empty_rate"
        ]
        record = data.shape[0]
        objcounts = data.value_counts()
        nobjcounts = objcounts[objcounts != 0]
        count_unique = len(nobjcounts)
        count_null = data.isnull().sum()
        empty = sum(data == '')
        result = [
            record, count_unique, count_unique / record, count_null,
            count_null / record, empty, empty / record
        ]
        dtype = None
        if result[1] > 0:
            top, top_freq = nobjcounts.index[0], nobjcounts.iloc[0]
            bottom, bottom_freq = nobjcounts.index[
                count_unique - 1], nobjcounts.iloc[count_unique - 1]
            names += [
                "top", "top_freq", "top_rate", "bottom", "bottom_freq",
                "bottom_rate"
            ]
            result += [
                top, top_freq, top_freq / record, bottom, bottom_freq,
                bottom_freq / record
            ]

        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        else:
            names += [
                "top", "top_freq", "top_rate", "bottom", "bottom_freq",
                "bottom_rate"
            ]
            result += [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
            dtype = "object"

        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "mean",
            "min"
        ] + formatted_percentiles + ["max"])
        record = data.shape[0]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = data.isnull().sum()
        d = ([
            record, count_unique, count_unique / record, count_null,
            count_null / record,
            data.mean(),
            data.min()
        ] + data.quantile(percentiles).tolist() + [data.max()])
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data):
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if data.ndim == 1:
        return describe_1d(data)
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics;
        # otherwise fall back to describing every column
        numeric_data = data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) != 0:
            data = numeric_data
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = data
    else:
        data = data.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows
    names: List[Optional[Hashable]] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc],
                  axis=1,
                  sort=False)
    d.columns = data.columns.copy()
    result = d.transpose()
    return result
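
This variant never imports its helpers in the excerpt. A sketch of the module-level imports it appears to rely on, followed by a hypothetical call; the toy frame and the percentile choice are assumptions:

from typing import Hashable, List, Optional

import numpy as np
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype,
                              is_numeric_dtype, is_timedelta64_dtype)
from pandas.io.formats.format import format_percentiles
from pandas.util._validators import validate_percentile

# With the describe() above defined, each numeric column gets the extended
# summary (record, unique_rate, null_rate, zeros_rate, ..., percentiles).
df = pd.DataFrame({"a": [1.0, 0.0, 2.0, np.nan], "b": [3, 3, 0, 1]})
print(describe(df, percentiles=[0.1, 0.9]))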
Example #6
def describe(data, percentiles=None, include='number'):
    percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    from pandas.io.formats.format import format_percentiles
    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series):
        stat_index = (['count', 'mean', 'std', 'min'] + formatted_percentiles +
                      ['max'])
        d = ([series.count(),
              series.mean(),
              series.std(),
              series.min()] + series.quantile(percentiles).tolist() +
             [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = ['count', 'unique']
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            names += ['top', 'freq']
            result += [top, freq]

        return pd.Series(result, index=names, name=data.name)

    rep_df = representation_checkout(data)
    numeric_variable = rep_df[rep_df['representation'] ==
                              'NUMERIC']['variable']
    category_variable = rep_df[rep_df['representation'] ==
                               'CATEGORY']['variable']
    numeric_data = data.loc[:, list(numeric_variable)]
    category_data = data.loc[:, list(category_variable)]

    # when some numerics are found, keep only numerics
    data0 = numeric_data if include == 'number' else category_data
    if len(data0.columns) == 0:
        data0 = data

    data = data0
    ldesc = [
        describe_numeric_1d(s)
        if include == 'number' else describe_categorical_1d(s)
        for _, s in data.items()
    ]
    # set a convenient order for rows
    names = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc],
                  axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
Example #7
def _for_series(s, points):
    # trick: s.quantile filters out np.nan automatically
    return pd.Series([s.quantile(bp) for bp in points],
                     index=format_percentiles(points))
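
A small usage sketch of an equivalent helper (the name percentile_summary and the sample data are made up); Series.quantile skips NaN, which is the trick the comment refers to:

import numpy as np
import pandas as pd
from pandas.io.formats.format import format_percentiles


def percentile_summary(s, points):
    # Series.quantile ignores NaN, so no explicit dropna is needed.
    return pd.Series([s.quantile(bp) for bp in points],
                     index=format_percentiles(points))


s = pd.Series([1.0, np.nan, 2.0, 3.0, 4.0])
print(percentile_summary(s, [0.1, 0.5, 0.9]))
# 10%    1.3
# 50%    2.5
# 90%    3.7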
Example #8
def describe_1d(s, pcts):
    stat_index = (['mean', 'std', 'skew', 'kurt', 'min'] +
                  format_percentiles(pcts) + ['max', 'n'])
    d = ([s.mean(), s.std(), s.skew(), s.kurt(), s.min()] +
         [s.quantile(x) for x in pcts] + [s.max(), s.count()])
    return pd.Series(d, index=stat_index, name=s.name)
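
A brief usage sketch for this variant (imports and the sample Series are assumptions); it adds skewness and kurtosis and labels the count as 'n' alongside the usual statistics:

import numpy as np
import pandas as pd
from pandas.io.formats.format import format_percentiles

s = pd.Series(np.arange(10, dtype=float), name="x")
print(describe_1d(s, [0.05, 0.95]))
# mean, std, skew, kurt, min, 5%, 95%, max, n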
Example #9
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj : DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series) -> "Series":
        from pandas import Series

        stat_index = ["count", "mean", "std", "min"
                      ] + formatted_percentiles + ["max"]
        d = ([series.count(),
              series.mean(),
              series.std(),
              series.min()] + series.quantile(percentiles).tolist() +
             [series.max()])
        return Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_any_dtype(data.dtype):
                if obj.ndim == 1:
                    stacklevel = 5
                else:
                    stacklevel = 6
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                asint = data.dropna().values.view("i8")
                top = Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    Timestamp(asint.min(), tz=tz),
                    Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]

        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        else:
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        from pandas import Series

        return Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        # GH-30164
        from pandas import Series

        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = ([data.count(), data.mean(), data.min()] +
             data.quantile(percentiles).tolist() + [data.max()])
        return Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if obj.ndim == 1:
        # Incompatible return value type
        #  (got "Series", expected "FrameOrSeries")  [return-value]
        return describe_1d(obj)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc],
               axis=1,
               sort=False)
    d.columns = data.columns.copy()
    return d
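
describe_ndframe is the internal implementation behind DataFrame.describe and Series.describe in the pandas versions that have the datetime_is_numeric flag (roughly 1.1 through 1.5). A sketch that exercises the same code paths through the public API, assuming such a version; the frame is made up:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "num": [1.0, 2.0, np.nan, 4.0],
    "cat": ["a", "b", "b", "a"],
    "when": pd.date_range("2021-01-01", periods=4, freq="D"),
})

# Numeric columns only, with custom percentiles (the median is always added).
print(df.describe(percentiles=[0.05, 0.95]))

# All dtypes, with datetime columns summarised numerically rather than as categories.
print(df.describe(include="all", datetime_is_numeric=True))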