def refine_percentiles(percentiles: Sequence[float] | None) -> Sequence[float]:
    """Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.  ``None`` selects the
        default quartiles ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    np.ndarray
        Sorted, duplicate-free percentiles with the median always included.

    Raises
    ------
    ValueError
        If ``percentiles`` contains duplicates (checked after the median is
        implicitly appended).
    """
    if percentiles is None:
        # error: Incompatible return value type (got "ndarray", expected
        # "Sequence[float]")
        return np.array([0.25, 0.5, 0.75])  # type: ignore[return-value]

    # explicit conversion of `percentiles` to a list so 0.5 can be appended
    pct_list = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(pct_list)

    # median should always be included
    if 0.5 not in pct_list:
        pct_list.append(0.5)

    # Bind the ndarray to a fresh local instead of re-binding `percentiles`;
    # this removes the need for the previous `type: ignore[assignment]` and
    # the always-true `assert percentiles is not None`.
    pcts = np.asarray(pct_list)

    # np.unique also sorts, so the result comes back ascending; a shrunken
    # result means the input contained duplicates
    unique_pcts = np.unique(pcts)
    if len(unique_pcts) < len(pcts):
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts
def refine_percentiles(
        percentiles: Optional[Sequence[float]]) -> Sequence[float]:
    """Ensure that percentiles are unique and sorted.

    The median (0.5) is appended when absent, and the result comes back
    sorted ascending via ``np.unique``.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output.  ``None`` selects the
        default quartiles ``[0.25, 0.5, 0.75]``.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # work on a mutable copy of the caller's sequence
    pct_values = list(percentiles)

    # reject anything outside [0, 1]
    validate_percentile(pct_values)

    # the median must always be part of the output
    if 0.5 not in pct_values:
        pct_values.append(0.5)

    percentiles = np.asarray(pct_values)

    # np.unique sorts as a side effect; a shorter result means duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if unique_pcts.size < percentiles.size:
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts
def describe(data, percentiles=None, include=None, exclude=None):
    """Produce extended descriptive statistics for a Series or DataFrame.

    Parameters
    ----------
    data : Series or DataFrame
        The object to describe.
    percentiles : list-like of numbers, optional
        Percentiles to include; all must lie in [0, 1].  Defaults to
        ``[0.25, 0.5, 0.75]``.  The median is always added.
    include, exclude : dtype selectors, optional
        For DataFrame input, passed to ``select_dtypes``; ``include='all'``
        keeps every column.  Ignored for Series input.

    Returns
    -------
    Series for 1-D input, otherwise a DataFrame with one row per described
    column and one column per statistic.

    Raises
    ------
    ValueError
        For a DataFrame without columns, duplicate percentiles, or
        ``exclude`` combined with ``include='all'``.
    """
    if data.ndim == 2 and data.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list so we can append
        percentiles = list(percentiles)
        # get them all to be in [0, 1]
        validate_percentile(percentiles)
        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates (np.unique also sorts ascending)
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    # human-readable labels such as "25%", used as row names below
    formatted_percentiles = format_percentiles(percentiles)

    def _rate(count, total):
        # Ratio helper guarded against ZeroDivisionError on empty columns.
        return count / total if total else np.nan

    def describe_numeric_1d(series):
        # Record/null/zero counts plus the classic numeric summary.
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "zeros",
            "zeros_rate", "mean", "std", "min"
        ] + formatted_percentiles + ["max"])
        record = series.shape[0]
        objcounts = series.value_counts()
        # only values with a nonzero count are "observed" uniques
        count_unique = len(objcounts[objcounts != 0])
        count_null = series.isnull().sum()
        zeros = record - np.count_nonzero(series)
        d = ([
            record, count_unique, _rate(count_unique, record), count_null,
            _rate(count_null, record), zeros, _rate(zeros, record),
            series.mean(), series.std(), series.min()
        ] + series.quantile(percentiles).tolist() + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        # Counts, null/empty rates and most/least frequent values for
        # object-like data.
        names = [
            "record", "unique", "unique_rate", "null", "null_rate", "empty",
            "empty_rate"
        ]
        record = data.shape[0]
        objcounts = data.value_counts()
        nobjcounts = objcounts[objcounts != 0]
        count_unique = len(nobjcounts)
        count_null = data.isnull().sum()
        empty = sum(data == '')
        result = [
            record, count_unique, _rate(count_unique, record), count_null,
            _rate(count_null, record), empty, _rate(empty, record)
        ]
        dtype = None
        if result[1] > 0:
            # value_counts sorts descending: index[0] is the most frequent
            # value, index[count_unique - 1] the least frequent nonzero one
            top, top_freq = nobjcounts.index[0], nobjcounts.iloc[0]
            bottom = nobjcounts.index[count_unique - 1]
            bottom_freq = nobjcounts.iloc[count_unique - 1]
            names += [
                "top", "top_freq", "top_rate", "bottom", "bottom_freq",
                "bottom_rate"
            ]
            result += [
                top, top_freq, _rate(top_freq, record), bottom, bottom_freq,
                _rate(bottom_freq, record)
            ]
        # If the input is empty, fill with NaN to maintain output shape
        # consistency
        else:
            names += [
                "top", "top_freq", "top_rate", "bottom", "bottom_freq",
                "bottom_rate"
            ]
            result += [np.nan] * 6
            dtype = "object"
        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164: numeric-style description of datetime data.
        stat_index = ([
            "record", "unique", "unique_rate", "null", "null_rate", "mean",
            "min"
        ] + formatted_percentiles + ["max"])
        record = data.shape[0]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = data.isnull().sum()
        d = ([
            record, count_unique, _rate(count_unique, record), count_null,
            _rate(count_null, record), data.mean(), data.min()
        ] + data.quantile(percentiles).tolist() + [data.max()])
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        # dtype-based dispatch; bool is summarized as categorical and
        # timedelta as numeric
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data):
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if data.ndim == 1:
        return describe_1d(data)
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics; otherwise fall
        # back to describing every column.  BUG FIX: the previous code
        # overwrote `data` with the (possibly empty) numeric selection and
        # then ran the no-op `data = data`, losing the original frame and
        # making `pd.concat` below fail on frames without numeric columns.
        numeric_data = data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) != 0:
            data = numeric_data
    elif include == "all":
        if exclude is not None:
            raise ValueError("exclude must be None when include is 'all'")
        # 'all': keep every column as-is
    else:
        data = data.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]

    # set a convenient order for rows: shorter stat indexes first, preserving
    # first-seen order of the stat names
    names: List[Optional[Hashable]] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc],
                  axis=1,
                  sort=False)
    d.columns = data.columns.copy()
    result = d.transpose()
    return result
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for
        ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for
        ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0
        and 1. The default is ``[.25, .5, .75]``, which returns the 25th,
        50th, and 75th percentiles.

    Returns
    -------
    Dataframe or series description.

    Raises
    ------
    ValueError
        If ``obj`` is a DataFrame without columns, if ``percentiles``
        contains duplicates, or if ``exclude`` is combined with
        ``include='all'``.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list (0.5 may be appended)
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)

        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates (np.unique also sorts ascending)
    unique_pcts = np.unique(percentiles)
    # both branches above assigned an ndarray, so this assert can never
    # fire; it only narrows the Optional type for mypy
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    # human-readable labels such as "25%", used as row names below
    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series) -> "Series":
        # count/mean/std/min/percentiles/max for numeric and timedelta data
        from pandas import Series

        stat_index = ["count", "mean", "std", "min"
                      ] + formatted_percentiles + ["max"]
        d = ([series.count(), series.mean(), series.std(), series.min()]
             + series.quantile(percentiles).tolist() + [series.max()])
        return Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        # count/unique/top/freq for object, categorical and bool data; also
        # the deprecated path for datetime data when datetime_is_numeric is
        # False (reads `obj` from the enclosing scope for the stacklevel)
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            # value_counts sorts descending: index[0] is the modal value
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_any_dtype(data.dtype):
                # the warning should appear to come from the user's
                # `.describe()` call; the call depth differs for Series
                # vs DataFrame input, hence the two stacklevels
                if obj.ndim == 1:
                    stacklevel = 5
                else:
                    stacklevel = 6
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                # nanosecond ints of the non-null values, for first/last below
                asint = data.dropna().values.view("i8")
                top = Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    Timestamp(asint.min(), tz=tz),
                    Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]
        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        else:
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        from pandas import Series

        return Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        # GH-30164: numeric-style description of datetime data
        from pandas import Series

        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = ([data.count(), data.mean(), data.min()]
             + data.quantile(percentiles).tolist() + [data.max()])
        return Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        # dtype-based dispatch; bool is summarized as categorical, timedelta
        # as numeric, datetime as numeric only when datetime_is_numeric
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if obj.ndim == 1:
        # Incompatible return value type
        # (got "Series", expected "FrameOrSeries") [return-value]
        return describe_1d(obj)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            # nothing matched the default selection: describe all columns
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]

    # set a convenient order for rows: shorter stat indexes first, preserving
    # first-seen order of the stat names
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc],
               axis=1,
               sort=False)
    d.columns = data.columns.copy()
    return d
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for
        ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for
        ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0
        and 1. The default is ``[.25, .5, .75]``, which returns the 25th,
        50th, and 75th percentiles.

    Returns
    -------
    Dataframe or series description.

    Raises
    ------
    ValueError
        If ``obj`` is a DataFrame without columns, if ``percentiles``
        contains duplicates, or if ``exclude`` is combined with
        ``include='all'``.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    # CONSISTENCY: reuse the module-level `refine_percentiles` helper instead
    # of duplicating its validate / append-median / dedupe logic inline (the
    # duplicated version also carried an always-true assert).  Semantics are
    # identical: defaults for None, median appended, sorted, duplicates raise.
    percentiles = refine_percentiles(percentiles)

    if obj.ndim == 1:
        # Incompatible return value type
        # (got "Series", expected "FrameOrSeries") [return-value]
        return describe_1d(
            obj,
            percentiles,
            datetime_is_numeric,
            is_series=True,
        )  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            # nothing matched the default selection: describe all columns
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [
        describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
        for _, s in data.items()
    ]

    # set a convenient order for rows: shorter stat indexes first, preserving
    # first-seen order of the stat names
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc],
               axis=1,
               sort=False)
    d.columns = data.columns.copy()
    return d