def describe_numeric_1d(series, **kwargs): """Compute summary statistics of a numerical (`TYPE_NUM`) variable (a Series). Also create histograms (mini an full) of its distribution. Parameters ---------- series : Series The variable to describe. Returns ------- Series The description of the variable as a Series with index being stats keys. """ # Format a number as a percentage. For example 0.25 will be turned to 25%. _percentile_format = "{:.0%}" stats = dict() stats['type'] = base.TYPE_NUM stats['mean'] = series.mean() stats['std'] = series.std() stats['variance'] = series.var() stats['min'] = series.min() stats['max'] = series.max() stats['range'] = stats['max'] - stats['min'] # To avoid to compute it several times _series_no_na = series.dropna() for percentile in np.array([0.05, 0.25, 0.5, 0.75, 0.95]): # The dropna() is a workaround for https://github.com/pydata/pandas/issues/13098 stats[_percentile_format.format(percentile)] = _series_no_na.quantile( percentile) stats['iqr'] = stats['75%'] - stats['25%'] stats['kurtosis'] = series.kurt() stats['skewness'] = series.skew() stats['sum'] = series.sum() stats['mad'] = series.mad() stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN stats['n_zeros'] = (len(series) - np.count_nonzero(series)) stats['p_zeros'] = stats['n_zeros'] * 1.0 / len(series) # Histograms stats['histogram'] = histogram(series, **kwargs) stats['mini_histogram'] = mini_histogram(series, **kwargs) return pd.Series(stats, name=series.name)
def describe_numeric_1d(series, **kwargs): """Compute summary statistics of a numerical (`TYPE_NUM`) variable (a Series). Also create histograms (mini an full) of its distribution. Parameters ---------- series : Series The variable to describe. Returns ------- Series The description of the variable as a Series with index being stats keys. """ # Format a number as a percentage. For example 0.25 will be turned to 25%. _percentile_format = "{:.0%}" stats = dict() stats['type'] = base.TYPE_NUM stats['mean'] = series.mean() stats['std'] = series.std() stats['variance'] = series.var() stats['min'] = series.min() stats['max'] = series.max() stats['range'] = stats['max'] - stats['min'] # To avoid to compute it several times _series_no_na = series.dropna() for percentile in np.array([0.05, 0.25, 0.5, 0.75, 0.95]): # The dropna() is a workaround for https://github.com/pydata/pandas/issues/13098 stats[_percentile_format.format(percentile)] = _series_no_na.quantile(percentile) stats['iqr'] = stats['75%'] - stats['25%'] stats['kurtosis'] = series.kurt() stats['skewness'] = series.skew() stats['sum'] = series.sum() stats['mad'] = series.mad() stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN stats['n_zeros'] = (len(series) - np.count_nonzero(series)) stats['p_zeros'] = stats['n_zeros'] / len(series) # Histograms stats['histogram'] = histogram(series, **kwargs) stats['mini_histogram'] = mini_histogram(series, **kwargs) return pd.Series(stats, name=series.name)
def describe_date_1d(series): """Compute summary statistics of a date (`TYPE_DATE`) variable (a Series). Also create histograms (mini an full) of its distribution. Parameters ---------- series : Series The variable to describe. Returns ------- Series The description of the variable as a Series with index being stats keys. """ stats = dict() stats['type'] = base.TYPE_DATE stats['min'] = series.min() stats['max'] = series.max() stats['range'] = stats['max'] - stats['min'] # Histograms stats['histogram'] = histogram(series) stats['mini_histogram'] = mini_histogram(series) return pd.Series(stats, name=series.name)