def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9,
             correlation_overrides=None, check_recoded=False,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset
    stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter
    notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default. Any truthy value enables the check.
    correlation_threshold: float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it's an expensive computation it can be activated for small
        datasets. `check_correlation` must be true to enable this check.
        It's `False` by default.
    pool_size : int
        Number of worker processes in the pool. With a value of 1 the
        variables are described in the current process, without a pool.
        The default is equal to the number of CPU.
    **kwargs
        Extra keyword arguments forwarded (together with `bins`) to
        `multiprocess_func`.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table
            * correlations: the pearson and spearman correlation matrices

    Raises
    ------
    TypeError
        If `df` is not a pandas `DataFrame`.
    ValueError
        If `df` is empty.

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})

    # Describe all variables in a univariate way
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    if pool_size == 1:
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        try:
            ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        finally:
            # Always release the worker processes, even when a worker raised.
            pool.close()
            pool.join()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        # If x~y and y~z but not x~z, it would be better to delete only y
        # Better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        for x, corr_x in dfcorrPear.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_xy in corr_x.iteritems():
                # Only the part of the row before the diagonal is inspected.
                if x == y:
                    break

                if corr_xy > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr_xy],
                                         index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems()
                                     if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix = pd.crosstab(data1, data2)
                # Every row falling on the diagonal of the cross-tabulation
                # is what flags name1 as a recoding of name2.
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    # Row labels are collected in first-seen order, shortest description first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])

    # Duplicates are only counted over the columns whose type is supported.
    stats_by_variable = variable_stats.transpose()
    supported_columns = stats_by_variable[stats_by_variable.type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Make sure every type counter exists before merging the observed counts.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
def describe(df, **kwargs):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram (keyword argument, 10 when omitted)
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if df is not a pandas DataFrame
    :raises ValueError: if df is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    bins = kwargs.get('bins', 10)

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        """Format a quantile fraction as a percentage label, e.g. 0.25 -> '25%'."""
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def describe_numeric_1d(series, base_stats):
        """Return the numeric statistics and histograms of a series (type "NUM")."""
        stats = {'mean': series.mean(), 'std': series.std(), 'variance': series.var(),
                 'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.quantile(x)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        # A zero mean would make the coefficient of variation undefined.
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)

        # Large histogram
        imgdata = BytesIO()
        # TODO when running on server, send this off to a different thread
        plot = series.plot(kind='hist', figsize=(6, 4), facecolor='#337ab7', bins=bins)
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        # TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        """Return a small histogram of the series as a base64 PNG data URI."""
        # Small histogram
        imgdata = BytesIO()
        plot = series.plot(kind='hist', figsize=(2, 0.75), facecolor='#337ab7', bins=bins)
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        # Keep only the first and last x tick, with a small font.
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        result_string = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        plt.close(plot.figure)
        return result_string

    def describe_date_1d(series, base_stats):
        """Return min, max and range of a datetime series (type "DATE")."""
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"
        # TODO: Matplotlib can't do histograms of dates.
        # stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        """Return the categorical description of a series (type "CAT").

        'top' and 'freq' are only reported for object / categorical dtypes;
        'type' is always reported so the variable is counted in the summary.
        """
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq']
            result += [top, freq]

        names += ['type']
        result += ['CAT']

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        """Return the description of a series with at most one distinct value."""
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        """Return the description of a series whose values are all distinct."""
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        """Return the common statistics of a series plus its type-specific ones."""
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {'count': count,
                        'distinct_count': distinct_count,
                        'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The order of the tests matters: constant wins over numeric/date,
        # which win over unique, which wins over categorical.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))

        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    ldesc = {col: describe_1d(s) for col, s in df.iteritems()}

    # Check correlations between variables
    # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    # If x~y and y~z but not x~z, it would be better to delete only y
    # Better way would be to find out which variable causes the highest
    # increase in multicollinearity.
    corr_matrix = df.corr()
    for x, corr_x in corr_matrix.iterrows():
        for y, corr_xy in corr_x.iteritems():
            # Only the part of the row before the diagonal is inspected.
            if x == y:
                break

            if corr_xy > 0.9:
                ldesc[x] = pd.Series(['CORR', y, corr_xy],
                                     index=['type', 'correlation_var', 'correlation'], name=x)

    # Convert ldesc to a DataFrame
    # Row labels are collected in first-seen order, shortest description first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Make sure every type counter exists before merging the observed counts.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
def describe(df,
             bins=10,
             correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(),
             **kwargs):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of worker processes in the pool
    :param kwargs: Extra keyword arguments forwarded to multiprocess_func
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if df is not a pandas DataFrame
    :raises ValueError: if df is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    # The per-variable description is delegated to the module-level
    # `multiprocess_func`. The nested describe_*_1d / histogram helpers that
    # used to be defined here were never called and have been removed.

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    # NOTE(review): `bins` is not forwarded to multiprocess_func here (only
    # **kwargs is) - confirm whether that is intended.
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    pool = multiprocessing.Pool(pool_size)
    try:
        ldesc = {
            col: s
            for col, s in pool.map(local_multiprocess_func, df.iteritems())
        }
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()

    # Check correlations between variables
    # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    # If x~y and y~z but not x~z, it would be better to delete only y
    # Better way would be to find out which variable causes the highest
    # increase in multicollinearity.
    corr_matrix = df.corr()
    for x, corr_x in corr_matrix.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr_xy in corr_x.iteritems():
            # Only the part of the row before the diagonal is inspected.
            if x == y:
                break

            if corr_xy > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr_xy],
                    index=['type', 'correlation_var', 'correlation'],
                    name=x)

    # Convert ldesc to a DataFrame
    # Row labels are collected in first-seen order, shortest description first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    # Make sure every type counter exists before merging the observed counts.
    table_stats.update(
        {k: 0
         for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
def describe(df, **kwargs):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram (keyword argument, 10 when omitted)
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if df is not a pandas DataFrame
    :raises ValueError: if df is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    bins = kwargs.get('bins', 10)

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        """Format a quantile fraction as a percentage label, e.g. 0.25 -> '25%'."""
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    def figure_to_data_uri(plot):
        """Save the plot's figure as PNG and return it as a base64 data URI.

        The figure is closed even when saving fails, so a failing column does
        not leave an open matplotlib figure behind.
        """
        # TODO Think about writing this to disk instead of caching them in strings
        imgdata = BytesIO()
        try:
            plot.figure.savefig(imgdata)
        finally:
            plt.close(plot.figure)
        return 'data:image/png;base64,' + quote(
            base64.b64encode(imgdata.getvalue()))

    def describe_numeric_1d(series, base_stats):
        """Return the numeric statistics and histograms of a series (type "NUM")."""
        stats = {
            'mean': series.mean(),
            'std': series.std(),
            'variance': series.var(),
            'min': series.min(),
            'max': series.max()
        }
        stats['range'] = stats['max'] - stats['min']

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(x)] = series.quantile(x)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        # A zero mean would make the coefficient of variation undefined.
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.NaN
        stats['type'] = "NUM"
        stats['n_zeros'] = (len(series) - np.count_nonzero(series))
        stats['p_zeros'] = stats['n_zeros'] / len(series)

        # Large histogram
        # TODO when running on server, send this off to a different thread
        plot = series.plot(
            kind='hist', figsize=(6, 4), facecolor='#337ab7', bins=bins)
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.95,
                                    top=0.9,
                                    bottom=0.1,
                                    wspace=0,
                                    hspace=0)
        stats['histogram'] = figure_to_data_uri(plot)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        """Return a small histogram of the series as a base64 PNG data URI."""
        # Small histogram
        plot = series.plot(kind='hist',
                           figsize=(2, 0.75),
                           facecolor='#337ab7',
                           bins=bins)
        plot.axes.get_yaxis().set_visible(False)
        plot.set_axis_bgcolor("w")
        # Keep only the first and last x tick, with a small font.
        xticks = plot.xaxis.get_major_ticks()
        for tick in xticks[1:-1]:
            tick.set_visible(False)
            tick.label.set_visible(False)
        for tick in (xticks[0], xticks[-1]):
            tick.label.set_fontsize(8)
        plot.figure.subplots_adjust(left=0.15,
                                    right=0.85,
                                    top=1,
                                    bottom=0.35,
                                    wspace=0,
                                    hspace=0)
        return figure_to_data_uri(plot)

    def describe_date_1d(series, base_stats):
        """Return min, max and range of a datetime series (type "DATE")."""
        stats = {'min': series.min(), 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"
        # TODO: Matplotlib can't do histograms of dates.
        # stats['mini_histogram'] = mini_histogram(series)
        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        """Return the categorical description of a series (type "CAT").

        'top' and 'freq' are only reported for object / categorical dtypes;
        'type' is always reported so the variable is counted in the summary.
        """
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq']
            result += [top, freq]

        names += ['type']
        result += ['CAT']

        return pd.Series(result, index=names, name=data.name)

    def describe_constant_1d(data):
        """Return the description of a series with at most one distinct value."""
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        """Return the description of a series whose values are all distinct."""
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        """Return the common statistics of a series plus its type-specific ones."""
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            mode = data[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The order of the tests matters: constant wins over numeric/date,
        # which win over unique, which wins over categorical.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))

        return result

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    ldesc = {col: describe_1d(s) for col, s in df.iteritems()}

    # Check correlations between variables
    # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    # If x~y and y~z but not x~z, it would be better to delete only y
    # Better way would be to find out which variable causes the highest
    # increase in multicollinearity.
    corr_matrix = df.corr()
    for x, corr_x in corr_matrix.iterrows():
        for y, corr_xy in corr_x.iteritems():
            # Only the part of the row before the diagonal is inspected.
            if x == y:
                break

            if corr_xy > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr_xy],
                    index=['type', 'correlation_var', 'correlation'],
                    name=x)

    # Convert ldesc to a DataFrame
    # Row labels are collected in first-seen order, shortest description first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    # Make sure every type counter exists before merging the observed counts.
    table_stats.update(
        {k: 0
         for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
def describe(df,
             bins=10,
             correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(),
             **kwargs):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed
    :param bins: Number of bins in histogram
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of worker processes in the pool
    :param kwargs: Extra keyword arguments forwarded to multiprocess_func
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if df is not a pandas DataFrame
    :raises ValueError: if df is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(
        resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # Describe all variables in a univariate way
    # NOTE(review): `bins` is not forwarded to multiprocess_func here (only
    # **kwargs is) - confirm whether that is intended.
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    pool = multiprocessing.Pool(pool_size)
    try:
        ldesc = {
            col: s
            for col, s in pool.map(local_multiprocess_func, df.iteritems())
        }
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()

    # Check correlations between variables
    # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    # If x~y and y~z but not x~z, it would be better to delete only y
    # Better way would be to find out which variable causes the highest
    # increase in multicollinearity.
    corr_matrix = df.corr()
    for x, corr_x in corr_matrix.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr_xy in corr_x.iteritems():
            # Only the part of the row before the diagonal is inspected.
            if x == y:
                break

            if corr_xy > 0.9:
                ldesc[x] = pd.Series(
                    ['CORR', y, corr_xy],
                    index=['type', 'correlation_var', 'correlation'])

    # Flag categorical variables that are a recoding of another one.
    categorical_variables = [(name, data) for (name, data) in df.iteritems()
                             if get_vartype(data) == 'CAT']
    for (name1, data1), (name2, data2) in itertools.combinations(
            categorical_variables, 2):
        if correlation_overrides and name1 in correlation_overrides:
            continue

        confusion_matrix = pd.crosstab(data1, data2)
        # Every row falling on the diagonal of the cross-tabulation is what
        # flags name1 as a recoding of name2.
        if confusion_matrix.values.diagonal().sum() == len(df):
            ldesc[name1] = pd.Series(['RECODED', name2],
                                     index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    # Row labels are collected in first-seen order, shortest description first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (
        table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize /
                                                        table_stats['n'])

    # Make sure every type counter exists before merging the observed counts.
    table_stats.update({
        k: 0
        for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED")
    })
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats[
        'CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts()
                 for k in df.columns}
    }
def describe(df):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if df is not a pandas DataFrame
    :raises ValueError: if df is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    # Start from matplotlib's defaults, then apply the packaged style sheet.
    matplotlib.style.use("default")
    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    def pretty_name(x):
        """Turn a quantile fraction into its percentage label."""
        x *= 100
        template = '%.0f%%' if x == int(x) else '%.1f%%'
        return template % x

    def describe_numeric_1d(series, base_stats):
        """Numeric statistics plus both histograms for one series."""
        stats = {'mean': series.mean(),
                 'std': series.std(),
                 'variance': series.var(),
                 'min': series.min(),
                 'max': series.max()}
        stats['range'] = stats['max'] - stats['min']

        for quantile in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats[pretty_name(quantile)] = series.quantile(quantile)
        stats['iqr'] = stats['75%'] - stats['25%']
        stats['kurtosis'] = series.kurt()
        stats['skewness'] = series.skew()
        stats['sum'] = series.sum()
        stats['mad'] = series.mad()
        if stats['mean']:
            stats['cv'] = stats['std'] / stats['mean']
        else:
            stats['cv'] = np.NaN
        stats['type'] = "NUM"
        n_values = len(series)
        stats['p_zeros'] = (n_values - np.count_nonzero(series)) / len(series)

        # Full-size histogram, kept in memory as a base64 data URI.
        buffer = BytesIO()
        axes = series.plot(kind='hist', figsize=(6, 4), facecolor='#337ab7')
        axes.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        axes.figure.savefig(buffer)
        buffer.seek(0)
        encoded = quote(base64.b64encode(buffer.getvalue()))
        stats['histogram'] = 'data:image/png;base64,' + encoded
        plt.close(axes.figure)

        stats['mini_histogram'] = mini_histogram(series)

        return pd.Series(stats, name=series.name)

    def mini_histogram(series):
        """Thumbnail histogram of one series as a base64 data URI."""
        buffer = BytesIO()
        axes = series.plot(kind='hist', figsize=(2, 0.75), facecolor='#337ab7')
        axes.axes.get_yaxis().set_visible(False)
        axes.set_axis_bgcolor("w")

        # Hide every inner tick; shrink the labels of the two outer ones.
        ticks = axes.xaxis.get_major_ticks()
        for inner in ticks[1:-1]:
            inner.set_visible(False)
            inner.label.set_visible(False)
        for outer in (ticks[0], ticks[-1]):
            outer.label.set_fontsize(8)

        axes.figure.subplots_adjust(left=0.15, right=0.85, top=1, bottom=0.35, wspace=0, hspace=0)
        axes.figure.savefig(buffer)
        buffer.seek(0)
        encoded = quote(base64.b64encode(buffer.getvalue()))
        plt.close(axes.figure)
        return 'data:image/png;base64,' + encoded

    def describe_date_1d(series, base_stats):
        """Minimum, maximum and range for a datetime series."""
        earliest = series.min()
        latest = series.max()
        stats = {'min': earliest, 'max': latest}
        stats['range'] = stats['max'] - stats['min']
        stats['type'] = "DATE"
        return pd.Series(stats, name=series.name)

    def describe_categorical_1d(data):
        """Most frequent value and its count for object/categorical data."""
        # Only run if at least 1 non-missing value
        counts = data.value_counts()
        top, freq = counts.index[0], counts.iloc[0]

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            labels = ['top', 'freq', 'type']
            values = [top, freq, 'CAT']
        else:
            labels = []
            values = []

        return pd.Series(values, index=labels, name=data.name)

    def describe_constant_1d(data):
        """Type marker for a series with at most one distinct value."""
        return pd.Series(['CONST'], index=['type'], name=data.name)

    def describe_unique_1d(data):
        """Type marker for a series whose values are all distinct."""
        return pd.Series(['UNIQUE'], index=['type'], name=data.name)

    def describe_1d(data):
        """Shared statistics of one series followed by its type-specific part."""
        names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize']
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        mode = data.mode().iloc[0] if count > distinct_count > 1 else data[0]

        results_data = [
            count,
            distinct_count,
            1 - count / leng,
            leng - count,
            distinct_count == leng,
            mode,
            distinct_count / count,
            data.memory_usage(),
        ]
        result = pd.Series(results_data, index=names, name=data.name)

        # First matching rule wins: constant, numeric, date, unique, categorical.
        if distinct_count <= 1:
            specific = describe_constant_1d(data)
        elif com.is_numeric_dtype(data):
            specific = describe_numeric_1d(data, result)
        elif com.is_datetime64_dtype(data):
            specific = describe_date_1d(data, result)
        elif distinct_count == leng:
            specific = describe_unique_1d(data)
        else:
            specific = describe_categorical_1d(data)

        return result.append(specific)

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    ldesc = []
    for _, column in df.iteritems():
        ldesc.append(describe_1d(column))

    # set a convenient order for rows
    names = []
    for labels in sorted([described.index for described in ldesc], key=len):
        for label in labels:
            if label not in names:
                names.append(label)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    n_rows = len(df)
    n_columns = len(df.columns)
    table_stats = {'n': n_rows, 'nvar': n_columns}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Zero-initialise every type counter, then overlay the observed counts.
    for type_name in ("NUM", "DATE", "CONST", "CAT", "UNIQUE"):
        table_stats[type_name] = 0
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))

    frequencies = {k: df[k].value_counts() for k in df.columns}
    return {'table': table_stats, 'variables': variable_stats.T, 'freq': frequencies}
def describe(df, bins=10, check_correlation=True, correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed. If its index is not the default 0..n-1 range,
        the index is reset so that it is described like any other column.
    bins : int
        Number of bins in histogram. Forwarded to the per-variable
        description function as the `bins` keyword argument.
    check_correlation : boolean
        Whether or not to check correlation. Only the literal `True`
        enables the check.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated
    pool_size : int
        Number of workers in the process pool. With a value of 1 the
        variables are described in the current process, without a pool.
    **kwargs
        Extra keyword arguments forwarded to `multiprocess_func`.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Raises
    ------
    TypeError
        If `df` is not a pandas `DataFrame`.
    ValueError
        If `df` is empty.

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only: keep whatever style is active.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # `bins` is a named parameter, so it can never arrive through **kwargs;
    # inject it explicitly so the requested bin count is not silently dropped.
    kwargs.update({'bins': bins})

    # Describe all variables in a univariate way
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    if pool_size == 1:
        # A single worker gains nothing from a pool; run in-process.
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        try:
            ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        finally:
            # Release the workers even when a task raised.
            pool.close()
            pool.join()

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        # If x~y and y~z but not x~z, it would be better to delete only y
        # Better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        corr_matrix = df.corr()
        for x, corr_x in corr_matrix.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_xy in corr_x.iteritems():
                # Only look at the part of the row before the diagonal, so
                # each pair is examined once and the later variable is flagged.
                if x == y:
                    break

                if corr_xy > 0.9:
                    ldesc[x] = pd.Series(['CORR', y, corr_xy],
                                         index=['type', 'correlation_var', 'correlation'])

        categorical_variables = [(name, data) for (name, data) in df.iteritems()
                                 if base.get_vartype(data) == 'CAT']
        for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
            if correlation_overrides and name1 in correlation_overrides:
                continue

            # If every row falls on the diagonal of the cross-tabulation,
            # name1 is flagged as a recoding of name2.
            confusion_matrix = pd.crosstab(data1, data2)
            if confusion_matrix.values.diagonal().sum() == len(df):
                ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    # Visit the shortest indexes first so their labels lead the row order.
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Start every type counter at zero so absent types are still reported.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
def describe(df, bins=10, correlation_overrides=None, pool_size=multiprocessing.cpu_count(), **kwargs):
    """
    Generates a object containing summary statistics for a given DataFrame

    :param df: DataFrame to be analyzed. If its index is not the default
        0..n-1 range, the index is reset so that it is described like any
        other column.
    :param bins: Number of bins in histogram; forwarded to the per-variable
        description function as the ``bins`` keyword argument
    :param correlation_overrides: Variable names not to be rejected because they are correlated
    :param pool_size: Number of workers in the process pool. With a value of 1
        the variables are described in the current process, without a pool.
    :param kwargs: Extra keyword arguments forwarded to ``multiprocess_func``
    :return: Dictionary containing
        table: general statistics on the DataFrame
        variables: summary statistics for each variable
        freq: frequency table
    :raises TypeError: if ``df`` is not a pandas DataFrame
    :raises ValueError: if ``df`` is empty
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only: keep whatever style is active.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # `bins` is a named parameter, so it can never arrive through **kwargs;
    # inject it explicitly so the requested bin count is not silently dropped.
    kwargs.update({'bins': bins})

    # Describe all variables in a univariate way
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    if pool_size == 1:
        # A single worker gains nothing from a pool; run in-process.
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        try:
            ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        finally:
            # Always release the worker processes, even when a task raised.
            pool.close()
            pool.join()

    # Check correlations between variables
    # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
    # If x~y and y~z but not x~z, it would be better to delete only y
    # Better way would be to find out which variable causes the highest
    # increase in multicollinearity.
    corr_matrix = df.corr()
    for x, corr_x in corr_matrix.iterrows():
        if correlation_overrides and x in correlation_overrides:
            continue

        for y, corr_xy in corr_x.iteritems():
            # Only look at the part of the row before the diagonal, so each
            # pair is examined once and the later variable is flagged.
            if x == y:
                break

            if corr_xy > 0.9:
                ldesc[x] = pd.Series(['CORR', y, corr_xy],
                                     index=['type', 'correlation_var', 'correlation'],
                                     name=x)

    # Convert ldesc to a DataFrame
    names = []
    # Visit the shortest indexes first so their labels lead the row order.
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Start every type counter at zero so absent types are still reported.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}