Example #1
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    # Assumes at least one non-missing value, so value_counts is non-empty
    value_counts, distinct_count = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

        # Report max length and max trimmed length
        series_str = series.fillna('').astype(str)
        max_trimmed_length = series_str.apply(lambda x: len(x.strip())).max()
        names.append('max_trimmed_length')
        result.append(max_trimmed_length)

        max_length = series_str.apply(len).max()
        names.append('max_length')
        result.append(max_length)

        # Add a mask type
        # Build a character-class mask for each non-missing value
        mask = series[series.notnull()].apply(getMask)
        mask.name = '{}_mask'.format(series.name)
        mask_value_counts, mask_distinct_count = base.get_groupby_statistic(
            mask)
        names += ['top_mask', 'freq_mask']
        # Note: freq_mask is stored as a percentage string, unlike the raw count in freq
        result += [
            mask_value_counts.index[0],
            '%0.0f' % (100.0 * mask_value_counts.iloc[0] / len(mask))
        ]

        # Find all special characters: strip the letter/digit/space mask
        # classes (l, L, D, s) so only special characters remain
        sc = mask.apply(lambda x: re.sub(r'[lLDs]', '', x))
        sc_set = set(''.join(sc.values))
        names.append('sc_set')
        result.append(sc_set)

    return pd.Series(result, index=names, name=series.name)
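
Example #1 also depends on a getMask helper that is not shown. Judging from the re.sub(r'[lLDs]', '', x) call above, it encodes every character of a value as a class code; here is a minimal sketch under that assumption (the exact class rules are hypothetical):

# Hypothetical sketch of the getMask helper assumed by Example #1. It maps
# each character to a class code so that stripping [lLDs] afterwards leaves
# only the special characters.
def getMask(value):
    classes = []
    for ch in str(value):
        if ch.islower():
            classes.append('l')   # lowercase letter
        elif ch.isupper():
            classes.append('L')   # uppercase letter
        elif ch.isdigit():
            classes.append('D')   # digit
        elif ch.isspace():
            classes.append('s')   # whitespace
        else:
            classes.append(ch)    # keep special characters as-is
    return ''.join(classes)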
Example #2
def describe_supported(series, **kwargs):
    """Compute summary statistics of a supported variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    leng = len(series)  # number of observations in the Series
    lengf = 1.0 * leng
    count = series.count()  # number of non-NaN observations in the Series
    # Number of infinite observations: count again after mapping +/-inf to NaN
    n_infinite = count - series.replace([np.inf, -np.inf], np.nan).count()

    value_counts, distinct_count = base.get_groupby_statistic(series)
    if count > distinct_count > 1:
        mode = series.mode().iloc[0]
    else:
        # Fall back to the first value when there is no meaningful mode
        mode = series.iloc[0]

    results_data = {
        'count': count,
        'distinct_count': distinct_count,
        'p_missing': 1 - count / lengf,
        'n_missing': leng - count,
        'p_infinite': n_infinite / lengf,
        'n_infinite': n_infinite,
        'is_unique': distinct_count == leng,
        'mode': mode,
        'p_unique': distinct_count / lengf
    }
    try:
        # Series.memory_usage is only available from pandas 0.17 onwards
        results_data['memorysize'] = series.memory_usage()
    except AttributeError:
        results_data['memorysize'] = 0

    return pd.Series(results_data, name=series.name)
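
These examples repeatedly call base.get_groupby_statistic, which is not shown. From how its return values are used (value_counts.index[0] as the most frequent value, value_counts.iloc[0] as its count), a minimal sketch might look like this; note the real helper also caches its results per series (see base.clear_cache in Example #5):

# Hypothetical sketch of base.get_groupby_statistic as used in these
# examples; caching is omitted.
def get_groupby_statistic(series):
    value_counts = series.value_counts(dropna=True)  # sorted by descending frequency
    distinct_count = value_counts.count()
    return value_counts, distinct_count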
Example #3
def describe_boolean_1d(series):
    """Compute summary statistics of a boolean (`TYPE_BOOL`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    value_counts, distinct_count = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    # The mean of a boolean series is the share of True values
    mean = series.mean()
    names = ['top', 'freq', 'type', 'mean']
    result = [top, freq, base.TYPE_BOOL, mean]

    return pd.Series(result, index=names, name=series.name)
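
A quick usage sketch for describe_boolean_1d, assuming the surrounding module context (pd and base) is available:

# Hypothetical usage of describe_boolean_1d
import pandas as pd

flags = pd.Series([True, True, False, True], name='is_active')
print(describe_boolean_1d(flags))
# Expected stats: top=True, freq=3, type=BOOL, mean=0.75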
Example #4
def describe_categorical_1d(series):
    """Compute summary statistics of a categorical (`TYPE_CAT`) variable (a Series).

    Parameters
    ----------
    series : Series
        The variable to describe.

    Returns
    -------
    Series
        The description of the variable as a Series with index being stats keys.
    """
    # Assumes at least one non-missing value, so value_counts is non-empty
    value_counts, distinct_count = base.get_groupby_statistic(series)
    top, freq = value_counts.index[0], value_counts.iloc[0]
    names = []
    result = []

    if base.get_vartype(series) == base.TYPE_CAT:
        names += ['top', 'freq', 'type']
        result += [top, freq, base.TYPE_CAT]

    return pd.Series(result, index=names, name=series.name)
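
Examples #1 and #4 branch on base.get_vartype, and Example #5 tallies the same type codes ('NUM', 'CAT', 'BOOL', ...). A minimal sketch of that dispatch, assuming the constants match the strings used in Example #5 (the real module also detects dates, constants, unique keys and unsupported columns):

# Hypothetical sketch of base's type constants and get_vartype dispatch.
import numpy as np

TYPE_BOOL = 'BOOL'
TYPE_NUM = 'NUM'
TYPE_CAT = 'CAT'
S_TYPE_UNSUPPORTED = 'UNSUPPORTED'

def get_vartype(series):
    if series.dtype == bool:
        return TYPE_BOOL
    if np.issubdtype(series.dtype, np.number):
        return TYPE_NUM
    return TYPE_CAT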
Example #5
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9, correlation_overrides=None, check_recoded=False, pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default.
    correlation_threshold : float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it is an expensive computation, it should only be activated for small datasets.
        `check_correlation` must be `True` for this check to run.
        It's `False` by default.
    pool_size : int
        Number of workers in the process pool.
        The default is the number of CPUs.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table
            * correlations: correlation matrices (Pearson and Spearman)

    Notes
    -----
        * The correlation-checking section should be externalized
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # Reset matplotlib style before use.
        # Fails in matplotlib 1.4.x, so plots might look bad there.
        matplotlib.style.use("default")
    except Exception:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    # Describe all variables in a univariate way
    if pool_size == 1:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        pool.close()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        # TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9.
        # If x~y and y~z but not x~z, it would be better to delete only y.
        # A better way would be to find out which variable causes the highest
        # increase in multicollinearity.
        corr_matrix = dfcorrPear.copy()
        for x, corr_x in corr_matrix.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr_value in corr_x.iteritems():
                if x == y:
                    break

                if corr_value > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr_value], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data) == base.TYPE_CAT]
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix = pd.crosstab(data1, data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=[pd.Index(names)], axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    # Columns with a supported type; duplicates are counted over these only
    supported_columns = variable_stats.transpose()[variable_stats.transpose().type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=supported_columns)) if len(supported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
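
A usage sketch for describe, assuming it runs inside the pandas_profiling module context:

# Hypothetical usage of describe()
import pandas as pd

df = pd.DataFrame({
    'age': [23, 31, 31, 45],
    'city': ['NY', 'SF', 'SF', 'NY'],
})
report = describe(df, bins=5, pool_size=1)  # pool_size=1 keeps it single-process
print(report['table']['n'])               # number of rows: 4
print(report['variables'].loc['age'])     # summary statistics for one variable
print(report['correlations']['pearson'])  # Pearson correlation matrix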