Exemplo n.º 1
0
def calc_hist(srs: dd.Series, bins: int,
              orig_df_len: int) -> Tuple[pd.DataFrame, float]:
    """
    Calculate a histogram over a given series.

    Parameters
    ----------
    srs : dd.Series
        one numerical column over which to compute the histogram
    bins : int
        number of bins to use in the histogram
    orig_df_len : int
        length of the original dataframe

    Returns
    -------
    Tuple[pd.DataFrame, float]:
        The histogram in a dataframe and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    data = srs.dropna().values
    if len(data) == 0:  # all values in column are missing
        return pd.DataFrame({"left": [], "right": [], "freq": []}), miss_pct
    minv, maxv = data.min(), data.max()
    hist_arr, bins_arr = np.histogram(data, range=[minv, maxv], bins=bins)
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
        "pct": hist_arr / orig_df_len * 100,
    })
    return hist_df, miss_pct
Exemplo n.º 2
0
def calc_bar_pie(srs: dd.Series, ngroups: int,
                 largest: bool) -> Tuple[pd.DataFrame, int, float]:
    """
    Calculates the group counts given a series.

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If true, show the groups with the largest count,
        else show the groups with the smallest count

    Returns
    -------
    Tuple[pd.DataFrame, float]
        A dataframe of the group counts, the total count of groups,
        and the percent of missing values
    """
    miss_pct = round(srs.isna().sum() / len(srs) * 100, 1)
    try:
        grp_srs = srs.groupby(srs).size()
    except TypeError:
        srs = srs.astype(str)
        grp_srs = srs.groupby(srs).size()
    # select largest or smallest groups
    smp_srs = grp_srs.nlargest(n=ngroups) if largest else grp_srs.nsmallest(
        n=ngroups)
    df = smp_srs.to_frame().rename(columns={srs.name: "cnt"}).reset_index()
    # add a row containing the sum of the other groups
    other_cnt = len(srs) - df["cnt"].sum()
    df = df.append(pd.DataFrame({srs.name: ["Others"], "cnt": [other_cnt]}))
    # add a column containing the percent of count in each group
    df["pct"] = df["cnt"] / len(srs) * 100
    df.columns = ["col", "cnt", "pct"]
    df["col"] = df["col"].astype(
        str)  # needed when numeric is cast as categorical
    return df, len(grp_srs), miss_pct