Example #1
def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'):
    """Plot an histogram from the data and return the AxesSubplot object.

    Parameters
    ----------
    series : Series
        The data to plot
    figsize : tuple
        The size of the figure (width, height) in inches, default (6,4)
    facecolor : str
        The color code.

    Returns
    -------
    matplotlib.AxesSubplot
        The plot.
    """
    if base.get_vartype(series) == base.TYPE_DATE:
        # TODO: These calls should be merged
        fig = plt.figure(figsize=figsize)
        plot = fig.add_subplot(111)
        plot.set_ylabel('Frequency')
        try:
            plot.hist(series.values, facecolor=facecolor, bins=bins)
        except TypeError:  # matplotlib 1.4 cannot plot dates, so an empty plot is shown instead
            pass
    else:
        plot = series.plot(
            kind='hist', figsize=figsize, facecolor=facecolor, bins=bins
        )  # TODO when running on server, send this off to a different thread
    return plot
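
For reference, a minimal usage sketch. It assumes pandas and matplotlib are
importable and that base.get_vartype classifies the Series as numeric (not
TYPE_DATE), so the series.plot branch runs:

import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3, 4], name='example')
ax = _plot_histogram(s, bins=5)      # returns the AxesSubplot
ax.figure.savefig('histogram.png')   # save (or show) the figure as needed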
Example #2
def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'):
    """Plot an histogram from the data and return the AxesSubplot object.

    Parameters
    ----------
    series : Series
        The data to plot
    figsize : tuple
        The size of the figure (width, height) in inches, default (6,4)
    facecolor : str
        The color code.

    Returns
    -------
    matplotlib.AxesSubplot
        The plot.
    """
    if base.get_vartype(series) == base.TYPE_DATE:
        # TODO: These calls should be merged
        fig = plt.figure(figsize=figsize)
        plot = fig.add_subplot(111)
        plot.set_ylabel('Frequency')
        try:
            plot.hist(series.values, facecolor=facecolor, bins=bins)
        except TypeError:  # matplotlib 1.4 cannot plot dates, so an empty plot is shown instead
            pass
    else:
        plot = series.plot(kind='hist', figsize=figsize,
                           facecolor=facecolor,
                           bins=bins)  # TODO when running on server, send this off to a different thread
    return plot
Example #3
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series indexed by the statistic names.
    """
    # Assumes at least one non-missing value (the value counts would be empty otherwise)
    value_counts, distinct_count = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

        # Report max length and max trimmed length
        series_str = series.fillna('')
        max_trimmed_length = series_str.apply(lambda x: len(x.strip())).max()
        names.append('max_trimmed_length')
        result.append(max_trimmed_length)

        max_length = series_str.apply(lambda x: len(x)).max()
        names.append('max_length')
        result.append(max_length)

        # Add a mask type
        mask = series.apply(getMask)[series.apply(pd.notnull)]
        mask.name = mask.name + '_mask'
        mask_value_counts, mask_distinct_count = base.get_groupby_statistic(mask)
        names += ['top_mask', 'freq_mask']
        result += [
            mask_value_counts.index[0],
            '%0.0f' % (100.0 * mask_value_counts.iloc[0] / len(mask))
        ]

        # Find all special characters
        sc = mask.apply(lambda x: re.sub(r'[lLDs]', '', x))
        sc_list = ''.join(sc.values)
        sc_set = set(sc_list)
        names.append('sc_set')
        result.append(sc_set)

    return pd.Series(result, index=names, name=series.name)
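
getMask is not defined in this snippet. Below is a hypothetical sketch of what
it plausibly does, inferred from the re.sub(r'[lLDs]', '', x) call above, which
strips the mask symbols so that only special characters remain:

# Hypothetical helper, not part of the original source: build a character-class
# mask where lowercase letters become 'l', uppercase letters 'L', digits 'D',
# whitespace 's', and any other character is kept as-is (a "special character").
def getMask(value):
    out = []
    for ch in str(value):
        if ch.islower():
            out.append('l')
        elif ch.isupper():
            out.append('L')
        elif ch.isdigit():
            out.append('D')
        elif ch.isspace():
            out.append('s')
        else:
            out.append(ch)
    return ''.join(out)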
Example #4
def describe_1d(data, **kwargs):
    """Compute summary statistics of a variable (a Series).

    The description differs according to the type of the variable, but a set
    of common stats is always computed.

    Parameters
    ----------
    data : Series
        The variable to describe.
    **kwargs
        Keyword arguments forwarded to the type-specific describe functions.

    Returns
    -------
    Series
        The description of the variable as a Series indexed by the statistic names.
    """

    # Replace infinite values with NaNs to avoid issues with
    # histograms later.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    result = pd.Series({}, name=data.name)

    vartype = base.get_vartype(data)

    if vartype == base.S_TYPE_UNSUPPORTED:
        result = result.append(describe_unsupported(data))
    else:
        result = result.append(describe_supported(data))

        if vartype == base.S_TYPE_CONST:
            result = result.append(describe_constant_1d(data))
        elif vartype == base.TYPE_BOOL:
            result = result.append(describe_boolean_1d(data))
        elif vartype == base.TYPE_NUM:
            result = result.append(describe_numeric_1d(data, **kwargs))
        elif vartype == base.TYPE_DATE:
            result = result.append(describe_date_1d(data))
        elif vartype == base.S_TYPE_UNIQUE:
            result = result.append(describe_unique_1d(data))
        else:
            # TYPE_CAT
            result = result.append(describe_categorical_1d(data))

    return result
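
A minimal usage sketch, assuming the base module and the type-specific
describe_* helpers are importable alongside numpy and pandas:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.inf, None], name='x')
print(describe_1d(s, bins=10))  # infinite values are converted to NaN first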
Example #5
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series indexed by the statistic names.
    """
    # Assumes at least one non-missing value (the value counts would be empty otherwise)
    value_counts, distinct_count = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

    return pd.Series(result, index=names, name=series.name)
Example #6
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series indexed by the statistic names.
    """
    # Assumes at least one non-missing value (value_counts would be empty otherwise)
    objcounts = series.value_counts()
    top, freq = objcounts.index[0], objcounts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

    return pd.Series(result, index=names, name=series.name)
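
A minimal usage sketch for this variant, which calls series.value_counts()
directly instead of the cached base.get_groupby_statistic. It assumes
base.get_vartype returns base.TYPE_CAT for an object-dtype Series:

import pandas as pd

s = pd.Series(['a', 'b', 'a', 'a'], name='letters')
print(describe_categorical_1d(s))  # expected: top == 'a', freq == 3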
Example #7
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9,
             correlation_overrides=None, check_recoded=False,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default.
    correlation_threshold : float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        Defaults to `None` (no overrides).
    check_recoded : boolean
        Whether or not to check for recoded correlation (memory-heavy feature).
        Since it is an expensive computation, it should only be activated for small datasets.
        `check_correlation` must be `True` for this check to run.
        It's `False` by default.
    pool_size : int
        Number of workers in the process pool.
        The default is the number of CPUs.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes
    -----
        * The section dedicated to checking correlations should be externalized.
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    # Describe all variables in a univariate way
    if pool_size == 1:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        pool.close()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9.
        # If x~y and y~z but not x~z, it would be better to delete only y.
        # Better: find the variable that causes the highest increase in multicollinearity.
        corr = dfcorrPear.copy()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            # corr_xy renamed from corr so it no longer shadows the correlation DataFrame
            for y, corr_xy in corr_x.iteritems():
                if x == y:
                    break

                if corr_xy > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr_xy], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == 'CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix = pd.crosstab(data1, data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    # The filter keeps the columns whose type is *not* unsupported, so name them accordingly
    supported_columns = variable_stats.transpose()[variable_stats.transpose().type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0]
                     if variable_stats[k].type != base.S_TYPE_UNSUPPORTED
                     else None)
                 for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
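
A minimal usage sketch, assuming the module's dependencies (base, formatters,
matplotlib, numpy, pandas) are importable. Passing pool_size=1 keeps the
computation in-process, which avoids spawning workers while experimenting:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': ['x', 'y', 'x', 'x', 'y']})
report = describe(df, bins=5, pool_size=1)
print(report['table']['n'], report['table']['nvar'])  # 5 2
print(report['correlations']['pearson'])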
Example #8
def describe(df, bins=10, check_correlation=True, correlation_overrides=None,
             pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram
    check_correlation : boolean
        Whether or not to check correlation.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated
    pool_size : int
        Number of workers in thread pool

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes
    -----
        * The section dedicated to checking correlations should be externalized.
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except Exception:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})  # forward bins to the per-variable describe functions
    # Describe all variables in a univariate way
    pool = multiprocessing.Pool(pool_size)
    local_multiprocess_func = partial(multiprocess_func, **kwargs)
    ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
    pool.close()

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9.
        # If x~y and y~z but not x~z, it would be better to delete only y.
        # Better: find the variable that causes the highest increase in multicollinearity.
        corr = df.corr()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            # corr_xy renamed from corr so it no longer shadows the correlation DataFrame
            for y, corr_xy in corr_x.iteritems():
                if x == y:
                    break

                if corr_xy > 0.9:
                    ldesc[x] = pd.Series(['CORR', y, corr_xy], index=['type', 'correlation_var', 'correlation'])

        categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == 'CAT']
        for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
            if correlation_overrides and name1 in correlation_overrides:
                continue

            confusion_matrix = pd.crosstab(data1, data2)
            if confusion_matrix.values.diagonal().sum() == len(df):
                ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {'n': len(df), 'nvar': len(df.columns)}
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    table_stats['n_duplicates'] = sum(df.duplicated())

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: df[k].value_counts() for k in df.columns}
    }
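
A sketch of the "recoded" test above in isolation: two categorical columns are
flagged when the crosstab diagonal accounts for every row, i.e. each category
of one column maps onto exactly one category of the other. Note the diagonal
test only fires when the matching categories happen to align in sorted order,
as they do here:

import pandas as pd

data1 = pd.Series(['a', 'b', 'a'], name='c1')
data2 = pd.Series(['x', 'y', 'x'], name='c2')
cm = pd.crosstab(data1, data2)
print(cm.values.diagonal().sum() == len(data1))  # True: c2 recodes c1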
Example #9
def describe_1d(data, **kwargs):
    """Compute summary statistics of a variable (a Series).

    The description differs according to the type of the variable, but a set
    of common stats is always computed.

    Parameters
    ----------
    data : Series
        The variable to describe.
    **kwargs
        Keyword arguments forwarded to the type-specific describe functions.

    Returns
    -------
    Series
        The description of the variable as a Series indexed by the statistic names.
    """
    leng = len(data)  # number of observations in the Series
    count = data.count()  # number of non-NaN observations in the Series

    # Replace infinite values with NaNs to avoid issues with
    # histograms later.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    n_infinite = count - data.count()  # number of infinite observations in the Series

    distinct_count = data.nunique(dropna=False)  # number of unique elements in the Series
    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        mode = data.iloc[0]  # positional access, since label 0 may not exist in the index

    results_data = {'count': count,
                    'distinct_count': distinct_count,
                    'p_missing': 1 - count / leng,
                    'n_missing': leng - count,
                    'p_infinite': n_infinite / leng,
                    'n_infinite': n_infinite,
                    'is_unique': distinct_count == leng,
                    'mode': mode,
                    'p_unique': distinct_count / leng}
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    vartype = base.get_vartype(data)
    if vartype == base.S_TYPE_CONST:
        result = result.append(describe_constant_1d(data))
    elif vartype == base.TYPE_BOOL:
        result = result.append(describe_boolean_1d(data, **kwargs))
    elif vartype == base.TYPE_NUM:
        result = result.append(describe_numeric_1d(data, **kwargs))
    elif vartype == base.TYPE_DATE:
        result = result.append(describe_date_1d(data, **kwargs))
    elif vartype == base.S_TYPE_UNIQUE:
        result = result.append(describe_unique_1d(data, **kwargs))
    else:
        # TYPE_CAT
        result = result.append(describe_categorical_1d(data))
    return result
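
A minimal usage sketch, again assuming base and the type-specific describe_*
helpers are importable (bins is forwarded to describe_numeric_1d, as in the
surrounding examples):

import pandas as pd

s = pd.Series([1, 1, 2, None], name='x')
desc = describe_1d(s, bins=10)
print(desc['count'], desc['n_missing'], desc['mode'])  # 3 1 1.0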