def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'):
    """Draw a histogram of the data and return the axes it was drawn on.

    Parameters
    ----------
    series : Series
        The data to plot
    bins : int
        The number of histogram bins, default 10
    figsize : tuple
        The size of the figure (width, height) in inches, default (6,4)
    facecolor : str
        The color code.

    Returns
    -------
    matplotlib.AxesSubplot
        The plot.
    """
    # Everything except dates goes through the pandas plotting helper.
    if base.get_vartype(series) != base.TYPE_DATE:
        # TODO when running on server, send this off to a different thread
        return series.plot(
            kind='hist',
            figsize=figsize,
            facecolor=facecolor,
            bins=bins,
        )

    # TODO: the date branch and the pandas branch should be merged
    axes = plt.figure(figsize=figsize).add_subplot(111)
    axes.set_ylabel('Frequency')
    try:
        axes.hist(series.values, facecolor=facecolor, bins=bins)
    except TypeError:
        # matplotlib 1.4 can't plot dates, so an empty plot is shown instead
        pass
    return axes
def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'):
    """Draw a histogram of the data and return the axes it was drawn on.

    Parameters
    ----------
    series : Series
        The data to plot
    bins : int
        The number of histogram bins, default 10
    figsize : tuple
        The size of the figure (width, height) in inches, default (6,4)
    facecolor : str
        The color code.

    Returns
    -------
    matplotlib.AxesSubplot
        The plot.
    """
    is_date = base.get_vartype(series) == base.TYPE_DATE
    if not is_date:
        # Non-date data is delegated to the pandas plotting helper.
        # TODO when running on server, send this off to a different thread
        hist_options = dict(figsize=figsize, facecolor=facecolor, bins=bins)
        return series.plot(kind='hist', **hist_options)

    # TODO: the date branch and the pandas branch should be merged
    figure = plt.figure(figsize=figsize)
    axes = figure.add_subplot(111)
    axes.set_ylabel('Frequency')
    try:
        axes.hist(series.values, facecolor=facecolor, bins=bins)
    except TypeError:
        # matplotlib 1.4 can't plot dates, so an empty plot is shown instead
        pass
    return axes
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Besides the most frequent value, the description reports string-length
    statistics and a summary of the character masks produced by `getMask`.

    Parameters
    ----------
    series : Series
        The variable to describe. It must contain at least one non-missing
        value: the most frequent value is read unconditionally.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    # Only run if at least 1 non-missing value
    value_counts, _ = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

    # NOTE(review): the statistics below are computed regardless of the
    # variable type check above -- confirm this matches the intended nesting.

    # Report max length and max trimmed length; missing values count as ''.
    series_str = series.fillna('')
    names.append('max_trimmed_length')
    result.append(series_str.apply(lambda x: len(x.strip())).max())
    names.append('max_length')
    result.append(series_str.apply(len).max())

    # Add a mask type, computed on the non-missing values only.
    mask = series.apply(getMask)[series.notnull()]
    # %-formatting (instead of `+`) so an unnamed or non-string-named series
    # does not raise a TypeError here.
    mask.name = '%s_mask' % (mask.name,)
    mask_value_counts, _ = base.get_groupby_statistic(mask)
    names += ['top_mask', 'freq_mask']
    result += [
        mask_value_counts.index[0],
        # Share of the most frequent mask, as a whole-number percentage string.
        '%0.0f' % (100.0 * mask_value_counts.iloc[0] / len(mask)),
    ]

    # Find all special characters: whatever is left of the masks once the
    # characters matched by [lLDs] are removed (presumably the placeholders
    # emitted by getMask -- verify against its definition).
    stripped = mask.apply(lambda x: re.sub(r'[lLDs]', '', x))
    names.append('sc_set')
    result.append(set(''.join(stripped.values)))

    return pd.Series(result, index=names, name=series.name)
def describe_1d(data, **kwargs):
    """Compute summary statistics of a variable (a Series).

    The description is different according to the type of the variable.
    However a set of common stats is also computed.

    Parameters
    ----------
    data : Series
        The variable to describe. Infinite values are replaced by NaN
        in place, so the caller's Series is modified.
    **kwargs
        Extra options, forwarded only to `describe_numeric_1d`.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    # Replace infinite values with NaNs to avoid issues with
    # histograms later. `-np.inf` is used instead of the `np.NINF`/`np.PINF`
    # aliases, which were removed in NumPy 2.0.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    # The pieces of the description, concatenated once at the end
    # (`Series.append` was removed in pandas 2.0).
    parts = [pd.Series({}, name=data.name)]

    vartype = base.get_vartype(data)
    if vartype == base.S_TYPE_UNSUPPORTED:
        parts.append(describe_unsupported(data))
    else:
        # Common stats first, then the type-specific ones.
        parts.append(describe_supported(data))

        if vartype == base.S_TYPE_CONST:
            parts.append(describe_constant_1d(data))
        elif vartype == base.TYPE_BOOL:
            parts.append(describe_boolean_1d(data))
        elif vartype == base.TYPE_NUM:
            parts.append(describe_numeric_1d(data, **kwargs))
        elif vartype == base.TYPE_DATE:
            parts.append(describe_date_1d(data))
        elif vartype == base.S_TYPE_UNIQUE:
            parts.append(describe_unique_1d(data))
        else:
            # TYPE_CAT
            parts.append(describe_categorical_1d(data))

    return pd.concat(parts)
def describe_categorical_1d(series):
    """Summarise a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The stats, indexed by stat key: `top`, `freq` and `type` when the
        variable is detected as `TYPE_CAT`, otherwise an empty Series.
    """
    # Only run if at least 1 non-missing value: the first entry of the
    # frequency table is read before the type check.
    counts, _ = base.get_groupby_statistic(series)
    most_common = counts.index[0]
    occurrences = counts.iloc[0]

    labels, values = [], []
    if base.get_vartype(series) == base.TYPE_CAT:
        labels.extend(['top', 'freq', 'type'])
        values.extend([most_common, occurrences, base.TYPE_CAT])

    return pd.Series(values, index=labels, name=series.name)
def describe_categorical_1d(series):
    """Summarise a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The stats, indexed by stat key: `top`, `freq` and `type` when the
        variable is detected as `TYPE_CAT`, otherwise an empty Series.
    """
    # Only run if at least 1 non-missing value: the first entry of the
    # frequency table is read before the type check.
    frequencies = series.value_counts()
    most_common = frequencies.index[0]
    occurrences = frequencies.iloc[0]

    labels, values = [], []
    if base.get_vartype(series) == base.TYPE_CAT:
        labels.extend(['top', 'freq', 'type'])
        values.extend([most_common, occurrences, base.TYPE_CAT])

    return pd.Series(values, index=labels, name=series.name)
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9,
             correlation_overrides=None, check_recoded=False,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default.
    correlation_threshold: float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it's an expensive computation it can be activated for small datasets.
        `check_correlation` must be true for this check to run.
        It's `False` by default.
    pool_size : int
        Number of worker processes in the pool. With a value of 1 the
        variables are described in the current process, without a pool.
        The default is equal to the number of CPU.
    **kwargs
        Extra options forwarded (together with `bins`) to `multiprocess_func`.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table
            * correlations: the Pearson and Spearman correlation matrices

    Raises
    ------
    TypeError
        If `df` is not a pandas `DataFrame`.
    ValueError
        If `df` is empty.

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    local_multiprocess_func = partial(multiprocess_func, **kwargs)

    # NOTE(review): `iteritems` (removed in pandas 2.0) and `join_axes`
    # (removed in pandas 1.0) are kept as-is below; this function targets the
    # older pandas API -- confirm the supported version before modernising.

    # Describe all variables in a univariate way
    if pool_size == 1:
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        try:
            ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        finally:
            # Release the workers even when a worker raised.
            pool.close()
            pool.join()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        # If x~y and y~z but not x~z, it would be better to delete only y
        # Better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        corr_matrix = dfcorrPear.copy()
        for x, corr_x in corr_matrix.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_value in corr_x.iteritems():
                # Only the part of the row before the diagonal is inspected.
                if x == y:
                    break

                if corr_value > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr_value],
                                         index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems()
                                     if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                # A cross-tabulation whose diagonal accounts for every row
                # flags name1 as a recoding of name2.
                confusion_matrix = pd.crosstab(data1, data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    # Collect every stat key once, visiting the shortest descriptions first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])

    # Duplicates are counted over the columns whose type is supported.
    stats_by_variable = variable_stats.transpose()
    supported_columns = stats_by_variable[stats_by_variable.type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Start every type counter at 0 so that absent types are still reported.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
def describe(df, bins=10, check_correlation=True, correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram
    check_correlation : boolean
        Whether or not to check correlation.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated
    pool_size: int
        Number of worker processes in the pool
    **kwargs
        Extra options forwarded to `multiprocess_func`.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Raises
    ------
    TypeError
        If `df` is not a pandas `DataFrame`.
    ValueError
        If `df` is empty.

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        # Best effort only; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    # NOTE(review): `bins` is accepted but not forwarded to the workers here;
    # only **kwargs is. Confirm whether the describers expect `bins`.
    # NOTE(review): `iteritems` (removed in pandas 2.0) and `join_axes`
    # (removed in pandas 1.0) are kept as-is below; this function targets the
    # older pandas API -- confirm the supported version before modernising.

    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    try:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
    finally:
        # Release the workers even when a worker raised.
        pool.close()
        pool.join()

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        # If x~y and y~z but not x~z, it would be better to delete only y
        # Better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        corr_matrix = df.corr()
        for x, corr_x in corr_matrix.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_value in corr_x.iteritems():
                # Only the part of the row before the diagonal is inspected.
                if x == y:
                    break

                if corr_value > 0.9:
                    ldesc[x] = pd.Series(['CORR', y, corr_value],
                                         index=['type', 'correlation_var', 'correlation'])

        categorical_variables = [(name, data) for (name, data) in df.iteritems()
                                 if base.get_vartype(data) == 'CAT']
        for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
            if correlation_overrides and name1 in correlation_overrides:
                continue

            # A cross-tabulation whose diagonal accounts for every row
            # flags name1 as a recoding of name2.
            confusion_matrix = pd.crosstab(data1, data2)
            if confusion_matrix.values.diagonal().sum() == len(df):
                ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    # Collect every stat key once, visiting the shortest descriptions first.
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    # Start every type counter at 0 so that absent types are still reported.
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {'table': table_stats, 'variables': variable_stats.T, 'freq': {k: df[k].value_counts() for k in df.columns}}
def describe_1d(data, **kwargs):
    """Compute summary statistics of a variable (a Series).

    The description is different according to the type of the variable.
    However a set of common stats is also computed.

    Parameters
    ----------
    data : Series
        The variable to describe. Infinite values are replaced by NaN
        in place, so the caller's Series is modified.
    **kwargs
        Extra options, forwarded to the boolean, numeric, date and unique
        describers.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    leng = len(data)  # number of observations in the Series
    count = data.count()  # number of non-NaN observations in the Series

    # Replace infinite values with NaNs to avoid issues with
    # histograms later. `-np.inf` is used instead of the `np.NINF`/`np.PINF`
    # aliases, which were removed in NumPy 2.0.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    # `count` was taken before the replacement, so the difference is the
    # number of infinite observations in the Series.
    n_infinite = count - data.count()

    distinct_count = data.nunique(dropna=False)  # number of unique elements in the Series

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Fall back to the first value; positional access so that a Series
        # whose index has no label 0 does not raise a KeyError.
        mode = data.iloc[0]

    results_data = {'count': count,
                    'distinct_count': distinct_count,
                    'p_missing': 1 - count / leng,
                    'n_missing': leng - count,
                    'p_infinite': n_infinite / leng,
                    'n_infinite': n_infinite,
                    'is_unique': distinct_count == leng,
                    'mode': mode,
                    'p_unique': distinct_count / leng}
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        # Best effort; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        results_data['memorysize'] = 0

    # The pieces of the description, concatenated once at the end
    # (`Series.append` was removed in pandas 2.0).
    parts = [pd.Series(results_data, name=data.name)]

    vartype = base.get_vartype(data)
    if vartype == base.S_TYPE_CONST:
        parts.append(describe_constant_1d(data))
    elif vartype == base.TYPE_BOOL:
        parts.append(describe_boolean_1d(data, **kwargs))
    elif vartype == base.TYPE_NUM:
        parts.append(describe_numeric_1d(data, **kwargs))
    elif vartype == base.TYPE_DATE:
        parts.append(describe_date_1d(data, **kwargs))
    elif vartype == base.S_TYPE_UNIQUE:
        parts.append(describe_unique_1d(data, **kwargs))
    else:
        # TYPE_CAT
        parts.append(describe_categorical_1d(data))

    return pd.concat(parts)