Example #1
def calc_hist(srs: dd.Series, bins: int,
              orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    data = srs.dropna().values
    if len(data) == 0:  # all values in column are missing
        return pd.DataFrame({"left": [], "right": [], "freq": []}), miss_pct
    minv, maxv = data.min(), data.max()
    hist_arr, bins_arr = np.histogram(data, range=[minv, maxv], bins=bins)
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
        "pct": hist_arr / orig_df_len * 100,
    })
    return hist_df, miss_pct
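
A minimal usage sketch, not part of the original source: it passes an in-memory pandas Series (which the operations in the body also accept) and defines a hypothetical stand-in for the module helper _format_bin_intervals so the snippet is self-contained.

import numpy as np
import pandas as pd


def _format_bin_intervals(bins_arr):
    # hypothetical stand-in for the module helper: label each bin as "[left, right)"
    return [f"[{left:.2f}, {right:.2f})" for left, right in zip(bins_arr[:-1], bins_arr[1:])]


pdf = pd.DataFrame({"price": [1.0, 2.5, np.nan, 4.0, 5.5, 7.0]})
hist_df, miss_pct = calc_hist(pdf["price"], bins=3, orig_df_len=len(pdf))
print(hist_df[["intervals", "freq", "pct"]])
print(f"{miss_pct}% missing")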
Example #2
def get_type(data: dd.Series) -> DataType:
    """ Returns the type of the input data.
        Identified types are according to the DataType Enumeration.

    Parameters
    ----------
    data : dd.Series
        The data for which the type needs to be identified.

    Returns
    -------
    DataType
        The type of the data, according to the DataType enumeration.
    """
    col_type = DataType.TYPE_UNSUP
    try:
        if pd.api.types.is_bool_dtype(data):
            col_type = DataType.TYPE_CAT
        elif (pd.api.types.is_numeric_dtype(data)
              and dask.compute(data.dropna().unique().size)[0] == 2):
            col_type = DataType.TYPE_CAT
        elif pd.api.types.is_numeric_dtype(data):
            col_type = DataType.TYPE_NUM
        else:
            col_type = DataType.TYPE_CAT
    except NotImplementedError as error:  # TO-DO
        LOGGER.info("Type cannot be determined due to : %s", error)

    return col_type
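
A hypothetical usage sketch: DataType and LOGGER belong to the surrounding module, so minimal stand-ins are defined here, and the columns are made up for illustration.

import logging
from enum import Enum, auto

import dask
import pandas as pd
import dask.dataframe as dd


class DataType(Enum):
    # hypothetical stand-in for the module's DataType enumeration
    TYPE_NUM = auto()
    TYPE_CAT = auto()
    TYPE_UNSUP = auto()


LOGGER = logging.getLogger(__name__)

ddf = dd.from_pandas(
    pd.DataFrame({"flag": [True, False, True], "price": [1.0, 2.0, 3.5]}),
    npartitions=1,
)
print(get_type(ddf["flag"]))   # DataType.TYPE_CAT (boolean column)
print(get_type(ddf["price"]))  # DataType.TYPE_NUM (numeric with more than two distinct values)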
Example #3
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)
    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(
                dask.compute(data["min"])[0],
                dask.compute(data["max"])[0]):
            data["dens"] = da.histogram(srs,
                                        cfg.kde.bins, (srs.min(), srs.max()),
                                        density=True)
            # gaussian kernel density estimate
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                                   meta=srs))
        else:
            data["kde"] = None
    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))
    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
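
A hypothetical call sketch: the real Config comes from the surrounding library, so a duck-typed stand-in built from SimpleNamespace is used here, enabling only the stats computations; helpers such as skew and kurtosis are assumed to resolve as in the source module. The returned values are mostly lazy, so they are materialized with dask.compute.

from types import SimpleNamespace as NS

import dask
import pandas as pd
import dask.dataframe as dd

# duck-typed stand-in for the library's Config, covering only the flags read above
cfg = NS(
    stats=NS(enable=True),
    hist=NS(enable=False, bins=50),
    qqnorm=NS(enable=False),
    insight=NS(enable=False),
    kde=NS(enable=False, bins=50),
    box=NS(enable=False),
    value_table=NS(enable=False, ngroups=10),
)

ddf = dd.from_pandas(pd.DataFrame({"x": [1.0, 2.0, 2.0, 3.5, -1.0, 0.0]}), npartitions=2)
(stats,) = dask.compute(cont_comps(ddf["x"], cfg))  # materialize the lazy results
print(stats["mean"], stats["std"], stats["nzero"])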
Example #4
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = dict()

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(
        sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending
                       else grps.nsmallest(cfg.bar.bars))

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (grps.nlargest(cfg.pie.slices)
                           if cfg.pie.sort_descending else grps.nsmallest(
                               cfg.pie.slices))

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(
                str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {
            "Minimum": srs.str.len().min(),
            "Maximum": srs.str.len().max()
        }
    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins,
                                        (lens.min(), lens.max()))
    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
                getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
                for att in ("top_words", "stopword", "stem", "lemmatize")):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
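
A hypothetical call sketch along the same lines: a SimpleNamespace stand-in for the library's Config enables only the bar and pie computations, and the head argument is the first rows of the column as a pandas Series.

from types import SimpleNamespace as NS

import dask
import pandas as pd
import dask.dataframe as dd

# duck-typed stand-in for the library's Config, covering only the flags read above
cfg = NS(
    stats=NS(enable=False),
    bar=NS(enable=True, bars=10, sort_descending=True),
    pie=NS(enable=True, slices=10, sort_descending=True),
    value_table=NS(enable=False, ngroups=10),
    insight=NS(enable=False),
    wordlen=NS(enable=False, bins=10),
    wordcloud=NS(enable=False),
    wordfreq=NS(enable=False),
)

ddf = dd.from_pandas(pd.DataFrame({"city": ["Rome", "Lima", "Rome", "Oslo"]}), npartitions=1)
(res,) = dask.compute(nom_comps(ddf["city"], ddf["city"].head(), cfg))
print(res["nuniq"], res["bar"])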
Example #5
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """

    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable
    # calculate the total number of rows then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or cfg.qqplot_enable and cfg.insights_enable:
    data["hist"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable
    ## data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable
    ## data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or cfg.hist_enable and cfg.insights_enable:
    data["skew"] = skew(srs)

    # if cfg.stats_enable
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insight_enable
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs,
                                bins=bins,
                                range=[data["min"], data["max"]],
                                density=True)
    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                           meta=srs))

    ## if cfg.box_enable
    data.update(calc_box(srs, data["qntls"]))

    return data
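
A hedged usage sketch: it assumes the module-level helpers referenced above (da, normaltest, skew, kurtosis, chisquare, gaussian_kde, calc_box) are importable as in the source module, and simply materializes the lazy results with dask.compute.

import dask
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": [0.5, 1.2, 3.3, 3.3, 7.8, 9.1]}), npartitions=2)
intermediate = cont_comps(ddf["x"], bins=4)  # a dict of mostly lazy dask objects
(computed,) = dask.compute(intermediate)     # materialize everything in one pass
print(computed["mean"], computed["std"], computed["hist"][0])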
Example #6
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        the first few rows of the column, used to check whether the values are strings
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """  # pylint: disable=too-many-arguments

    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # drop null values
    srs = srs.dropna()

    ## if cfg.bar_enable or cfg.pie_enable
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(
        ngroups)
    ##     if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ##     else
    ##     data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ##     if cfg.insights.evenness_enable
    data["chisq"] = chisquare(grps.values)

    ## if cfg.stats_enable
    df = grps.reset_index()
    ## if cfg.stats_enable or cfg.word_freq_enable
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(
            str)  # srs must be a string to compute the value lengths
        df[df.columns[0]] = df[df.columns[0]].astype(str)
    data.update(calc_cat_stats(srs, df, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable
    data.update(calc_word_freq(df, top_words, stopword, lemmatize, stem))

    return data
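
A hedged usage sketch with made-up data: calc_cat_stats, calc_word_freq and chisquare are assumed to be importable from the surrounding module; stopword, lemmatize and stem are disabled so no stop-word list or NLTK models are needed.

import dask
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"fruit": ["apple", "pear", "apple", "kiwi", "pear", "apple"]}),
    npartitions=1,
)
srs = ddf["fruit"]
intermediate = nom_comps(
    srs,
    srs.head(),       # first rows, used to check whether the values are strings
    ngroups=10,
    largest=True,
    bins=10,
    top_words=30,
    stopword=False,   # keep stop words so no stop-word list is needed
    lemmatize=False,
    stem=False,
)
(computed,) = dask.compute(intermediate)
print(computed["nuniq"], computed["bar"])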
Example #7
def calc_word_freq(
    srs: dd.Series,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, and then
    compute the frequency distribution of words and the total
    number of words.

    Parameters
    ----------
    srs
        One categorical column
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    # pylint: disable=unnecessary-lambda
    if stopword:
        # use a regex to replace stop words with empty string
        srs = srs.str.replace(
            r"\b(?:{})\b".format("|".join(english_stopwords)), "")
    # replace all non-alphanumeric characters with an empty string, and convert to lowercase
    srs = srs.str.replace(r"[^\w+ ]", "").str.lower()

    # split each string on whitespace into words then apply "explode()" to "stack" all
    # the words into a series
    # NOTE this is slow. One possibly better solution: after .split(), count the words
    # immediately rather than create a new series with .explode() and apply
    # .value_counts()
    srs = srs.str.split().explode()

    # lemmatize and stem
    if lemmatize or stem:
        srs = srs.dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        srs = srs.apply(lambda x: lem.lemmatize(x), meta=(srs.name, "object"))
    if stem:
        porter = PorterStemmer()
        srs = srs.apply(lambda x: porter.stem(x), meta=(srs.name, "object"))

    # counts of words, excludes null values
    word_cnts = srs.value_counts(sort=False)
    # total number of words
    nwords = word_cnts.sum()
    # total uniq words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {
        "word_cnts": fnl_word_cnts,
        "nwords": nwords,
        "nuniq_words": nuniq_words
    }
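
A hedged usage sketch: stopword, lemmatize and stem are disabled so the call does not depend on english_stopwords or the NLTK lemmatizer/stemmer referenced above; the data is made up.

import dask
import pandas as pd
import dask.dataframe as dd

comments = dd.from_pandas(
    pd.Series(["Great product, great price", "price too high", "great value"], name="comment"),
    npartitions=1,
)
# stopword/lemmatize/stem are disabled so the call does not depend on
# english_stopwords or the NLTK lemmatizer/stemmer used above
(res,) = dask.compute(
    calc_word_freq(comments, top_words=5, stopword=False, lemmatize=False, stem=False)
)
print(res["word_cnts"])  # e.g. "great" counted across all rows
print(res["nwords"], res["nuniq_words"])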