def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # mypy: format_percentiles is annotated for ndarray/list inputs only,
    # not a generic Sequence[float].
    pct_labels = format_percentiles(percentiles)  # type: ignore[arg-type]
    index = ["count", "mean", "std", "min", *pct_labels, "max"]
    summary = [series.count(), series.mean(), series.std(), series.min()]
    quantiles = series.quantile(percentiles).tolist()
    return Series(summary + quantiles + [series.max()], index=index, name=series.name)
def posterior_summary(self, q=(0.05, 0.95), plot=False):
    """Summarize the posterior sample per variable.

    Parameters
    ----------
    q : sequence of float, default (0.05, 0.95)
        Quantiles to include in the summary (a tuple default avoids the
        mutable-default-argument pitfall of the previous ``q=[0.05, 0.95]``).
    plot : bool, default False
        If True, draw a point (mean) plus interval line per variable
        instead of returning the summary frame.

    Returns
    -------
    DataFrame or None
        ``describe()`` output transposed (one row per variable) when
        ``plot`` is False; ``None`` when the plot is shown instead.
    """
    if self.posterior_df is None:
        # Lazily draw the posterior sample on first use.
        self._posterior_sample_df()
    q = list(q)
    summary = self.posterior_df.describe(percentiles=q).T
    if not plot:
        return summary
    # Mean as a point, quantile interval as a horizontal line per variable.
    sns.scatterplot(x=summary['mean'], y=summary.index)
    for var in summary.index:
        sns.lineplot(
            x=summary.loc[var, format_percentiles(q)],
            y=[var, var],
            color='k',
        )
    plt.xlabel('')
    plt.show()
def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> "Series":
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    # GH-30164
    from pandas import Series

    pct_labels = format_percentiles(percentiles)
    index = ["count", "mean", "min", *pct_labels, "max"]
    head = [data.count(), data.mean(), data.min()]
    quantiles = data.quantile(percentiles).tolist()
    return Series(head + quantiles + [data.max()], index=index, name=data.name)
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # Build labels and values in lockstep: fixed stats, then quantiles, then max.
    labels = ["count", "mean", "std", "min"]
    values = [series.count(), series.mean(), series.std(), series.min()]
    labels.extend(format_percentiles(percentiles))
    values.extend(series.quantile(percentiles).tolist())
    labels.append("max")
    values.append(series.max())
    return Series(values, index=labels, name=series.name)
def describe(data, percentiles=None, include=None, exclude=None):
    """Generate extended descriptive statistics for a Series or DataFrame.

    Adds record/unique/null/zero counts and their rates on top of the usual
    ``describe`` statistics.

    Parameters
    ----------
    data : Series or DataFrame
        Object to describe.
    percentiles : list-like of numbers, optional
        Percentiles in [0, 1]; the median is always included.
        Defaults to ``[0.25, 0.5, 0.75]``.
    include, exclude : dtype selections, optional
        Passed to ``select_dtypes`` for DataFrames (``include='all'`` keeps
        every column and requires ``exclude`` to be None).

    Returns
    -------
    Series (1-D input) or DataFrame with one row per described column.

    Raises
    ------
    ValueError
        For a column-less DataFrame, duplicate percentiles, or
        ``include='all'`` combined with a non-None ``exclude``.
    """
    if data.ndim == 2 and data.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)
        # get them all to be in [0, 1]
        validate_percentile(percentiles)
        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series):
        # Counts/rates plus moments, quantiles and extremes for one numeric column.
        stat_index = (
            ["record", "unique", "unique_rate", "null", "null_rate",
             "zeros", "zeros_rate", "mean", "std", "min"]
            + formatted_percentiles
            + ["max"]
        )
        record = series.shape[0]
        objcounts = series.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = series.isnull().sum()
        zeros = record - np.count_nonzero(series)
        d = (
            [record, count_unique, count_unique / record,
             count_null, count_null / record,
             zeros, zeros / record,
             series.mean(), series.std(), series.min()]
            + series.quantile(percentiles).tolist()
            + [series.max()]
        )
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        # Counts/rates plus most/least frequent values for one categorical column.
        names = ["record", "unique", "unique_rate", "null", "null_rate",
                 "empty", "empty_rate"]
        record = data.shape[0]
        objcounts = data.value_counts()
        nobjcounts = objcounts[objcounts != 0]
        count_unique = len(nobjcounts)
        count_null = data.isnull().sum()
        empty = sum(data == '')
        result = [record, count_unique, count_unique / record,
                  count_null, count_null / record, empty, empty / record]
        dtype = None
        names += ["top", "top_freq", "top_rate",
                  "bottom", "bottom_freq", "bottom_rate"]
        if count_unique > 0:
            # value_counts is sorted descending: first is the mode, last the rarest.
            top, top_freq = nobjcounts.index[0], nobjcounts.iloc[0]
            bottom = nobjcounts.index[count_unique - 1]
            bottom_freq = nobjcounts.iloc[count_unique - 1]
            result += [top, top_freq, top_freq / record,
                       bottom, bottom_freq, bottom_freq / record]
        else:
            # If the column is empty, set top/bottom stats to NaN
            # to maintain output shape consistency.
            result += [np.nan] * 6
            dtype = "object"
        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164
        stat_index = (
            ["record", "unique", "unique_rate", "null", "null_rate", "mean", "min"]
            + formatted_percentiles
            + ["max"]
        )
        record = data.shape[0]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        count_null = data.isnull().sum()
        d = (
            [record, count_unique, count_unique / record,
             count_null, count_null / record, data.mean(), data.min()]
            + data.quantile(percentiles).tolist()
            + [data.max()]
        )
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        # Dispatch on dtype; bool is treated as categorical, timedelta as numeric.
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data):
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if data.ndim == 1:
        return describe_1d(data)
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics.
        # BUG FIX: the previous code did `data = data` after select_dtypes had
        # already overwritten `data`, so an all-categorical frame produced an
        # empty selection and crashed in pd.concat. Fall back to the full
        # frame when no numeric columns exist.
        numeric = data.select_dtypes(include=[np.number])
        if len(numeric.columns) != 0:
            data = numeric
    elif include == "all":
        if exclude is not None:
            raise ValueError("exclude must be None when include is 'all'")
    else:
        data = data.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]

    # set a convenient order for rows
    names: List[Optional[Hashable]] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d.transpose()
def describe(data, percentiles=None, include='number'):
    """Describe either the numeric or the categorical columns of ``data``.

    Column classification comes from ``representation_checkout`` (NUMERIC vs
    CATEGORY representation per variable).

    Parameters
    ----------
    data : DataFrame
        Frame whose columns are described.
    percentiles : list-like of numbers, optional
        Percentiles in [0, 1] for the numeric summary. Defaults to
        ``[0.25, 0.5, 0.75]``. (Previously this argument was silently
        discarded; it is now honored.)
    include : {'number', other}, default 'number'
        'number' describes the NUMERIC columns; anything else describes
        the CATEGORY columns.

    Returns
    -------
    DataFrame
        One column per described variable, one row per statistic.

    Raises
    ------
    ValueError
        If ``percentiles`` contains duplicates.
    """
    from pandas.io.formats.format import format_percentiles

    # BUG FIX: the previous code unconditionally overwrote the caller's
    # `percentiles` with the default, making the parameter dead.
    if percentiles is None:
        percentiles = np.array([0.25, 0.5, 0.75])
    else:
        percentiles = np.asarray(list(percentiles), dtype=float)

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series):
        stat_index = (['count', 'mean', 'std', 'min']
                      + formatted_percentiles + ['max'])
        d = ([series.count(), series.mean(), series.std(), series.min()]
             + series.quantile(percentiles).tolist()
             + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = ['count', 'unique']
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            names += ['top', 'freq']
            result += [top, freq]
        return pd.Series(result, index=names, name=data.name)

    rep_df = representation_checkout(data)
    numeric_variable = rep_df[rep_df['representation'] == 'NUMERIC']['variable']
    category_variable = rep_df[rep_df['representation'] == 'CATEGORY']['variable']
    selected = (data.loc[:, list(numeric_variable)] if include == 'number'
                else data.loc[:, list(category_variable)])
    if len(data.columns) == 0:
        selected = data
    data = selected

    describe_fn = (describe_numeric_1d if include == 'number'
                   else describe_categorical_1d)
    # `.items()`: `iteritems` was removed in pandas 2.0.
    ldesc = [describe_fn(s) for _, s in data.items()]

    # set a convenient order for rows
    names = []
    for idxnames in sorted((x.index for x in ldesc), key=len):
        for name in idxnames:
            if name not in names:
                names.append(name)

    # `join_axes` was removed from pd.concat in pandas 1.0 (and the old
    # `pd.Index([names])` built an index of one tuple); align each piece to
    # the shared row order explicitly instead.
    d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1)
    d.columns = data.columns.copy()
    return d
def _for_series(s,points): #trick: s.quantile will filter out the np.nan automatically return pd.Series([s.quantile(bp) for bp in points],index=format_percentiles(points))
def describe_1d(s, pcts):
    """Summary statistics (moments, extremes, quantiles, count) for one series."""
    labels = (['mean', 'std', 'skew', 'kurt', 'min']
              + format_percentiles(pcts)
              + ['max', 'n'])
    moments = [s.mean(), s.std(), s.skew(), s.kurt(), s.min()]
    quantiles = [s.quantile(p) for p in pcts]
    tail = [s.max(), s.count()]
    return pd.Series(moments + quantiles + tail, index=labels, name=s.name)
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    # The nested 1-D describers below close over `percentiles`,
    # `formatted_percentiles`, `obj` and `datetime_is_numeric`.
    def describe_numeric_1d(series) -> "Series":
        # count/mean/std/min, the requested quantiles, then max.
        from pandas import Series

        stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
        d = (
            [series.count(), series.mean(), series.std(), series.min()]
            + series.quantile(percentiles).tolist()
            + [series.max()]
        )
        return Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        # count/unique plus top/freq (and first/last for datetime input,
        # which is deprecated on this path).
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_any_dtype(data.dtype):
                # stacklevel differs so the warning points at the user's
                # .describe() call for both Series and DataFrame entry points.
                if obj.ndim == 1:
                    stacklevel = 5
                else:
                    stacklevel = 6
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                # i8 view of the non-null values to get min/max as epoch ints.
                asint = data.dropna().values.view("i8")
                top = Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    Timestamp(asint.min(), tz=tz),
                    Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]
        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        else:
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        from pandas import Series

        return Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        # GH-30164
        from pandas import Series

        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = (
            [data.count(), data.mean(), data.min()]
            + data.quantile(percentiles).tolist()
            + [data.max()]
        )
        return Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        # Dispatch on dtype: bool -> categorical, numeric/timedelta -> numeric,
        # datetime -> timestamp path only when datetime_is_numeric is set.
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if obj.ndim == 1:
        # Incompatible return value type
        # (got "Series", expected "FrameOrSeries") [return-value]
        return describe_1d(obj)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            # no numeric columns: fall back to describing every column
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d