예제 #1
0
def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical.

    Parameters
    ----------
    srs
        The column to summarise.
    bins
        Number of histogram bins; only used on the continuous path.
    dtype
        Optional dtype hint that is forwarded to ``detect_dtype``.

    Returns
    -------
    Tuple[da.Array, ...]
        ``(counts, centers, edges)`` when the column is detected as
        ``Continuous`` (centers are the midpoints of adjacent edges), or
        ``(counts, centers)`` when it is detected as ``Nominal`` (centers are
        the index of ``srs.value_counts()``).

    Raises
    ------
    ValueError
        If the detected dtype is neither ``Continuous`` nor ``Nominal``.
    """
    # Detect the dtype once and reuse the result for both checks instead of
    # re-running the detection for every branch.
    detected = detect_dtype(srs, dtype)

    if is_dtype(detected, Continuous()):
        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        centers = (edges[:-1] + edges[1:]) / 2

        return counts, centers, edges

    if is_dtype(detected, Nominal()):
        # Dask array's unique is way slower than value_counts on a Series.
        # See https://github.com/dask/dask/issues/2851
        value_counts = srs.value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()

        return (counts, centers)

    raise ValueError(f"Unsupported dtype {srs.dtype}")
예제 #2
0
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Build the histogram and, when insights are enabled, the related
    statistics for a continuous column in plot(df)
    """
    result: Dict[str, Any] = {}

    # row count of the incoming series, taken before infinities are removed
    # (presumably nulls were already dropped by the caller -- TODO confirm)
    if cfg.insight.enable:
        result["npres"] = srs.shape[0]

    # keep only the values that are neither +inf nor -inf
    infinite_mask = srs.isin({np.inf, -np.inf})
    srs = srs[~infinite_mask]

    # histogram over the full range of the remaining values
    hist = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))
    result["hist"] = hist

    if not cfg.insight.enable:
        return result

    # the insight tests below all run on the histogram counts
    counts = hist[0]
    result["chisq"] = chisquare(counts)
    result["norm"] = normaltest(counts)
    result["skew"] = skewtest(counts)
    result["nneg"] = (srs < 0).sum()  # how many values are negative
    result["nuniq"] = srs.nunique_approx()  # approximate distinct count
    result["nzero"] = (srs == 0).sum()  # how many values are zero
    result["nreals"] = srs.shape[0]  # row count after dropping infinities
    return result
예제 #3
0
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Calculate stats from a datetime column

    The total size is read from ``srs.shape[0]`` rather than ``len(srs)``:
    on a dask Series ``len`` forces an immediate computation, whereas the
    shape entry stays lazy like the other reductions used here.

    Parameters
    ----------
    srs
        a datetime column
    Returns
    -------
    Dict[str, str]
        Dictionary that contains Overview
    """
    size = srs.shape[0]  # include nan
    count = srs.count()  # exclude nan
    uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }

    return overview_dict
예제 #4
0
파일: common.py 프로젝트: sfu-db/dataprep
def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Compute histogram-like counts for a numerical or categorical column.

    Continuous columns yield ``(counts, centers, edges)`` using
    ``cfg.hist.bins`` bins; Nominal, GeoGraphy, SmallCardNum and DateTime
    columns yield ``(counts, centers)`` taken from ``value_counts``.
    Any other dtype raises ``ValueError``.
    """
    if isinstance(srs_dtype, Continuous):
        lower, upper = srs.min(), srs.max()
        bin_counts, bin_edges = da.histogram(srs, cfg.hist.bins, (lower, upper))
        # midpoint of every pair of adjacent edges
        midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2
        return bin_counts, midpoints, bin_edges

    if not isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        raise ValueError(f"Unsupported dtype {srs.dtype}")

    # value_counts on the Series is used because dask array's unique is much
    # slower, see https://github.com/dask/dask/issues/2851
    frequencies = srs.value_counts()
    return (frequencies.to_dask_array(), frequencies.index.to_dask_array())
예제 #5
0
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Gather the histogram and insight statistics for a numerical column
    in plot(df)

    Parameters
    ----------
    srs
        the column over which the histogram and insights are computed
    bins
        how many bins the histogram gets
    """
    # row count of the incoming series, before infinities are filtered out
    row_count = srs.shape[0]

    # flag +inf / -inf so they can be counted and then removed
    infinite_mask = srs.isin({np.inf, -np.inf})

    result: Dict[str, Any] = {
        "npres": row_count,
        "ninf": infinite_mask.sum(),
    }

    finite = srs[~infinite_mask]

    # histogram spanning the full range of the finite values
    hist = da.histogram(finite, bins=bins, range=[finite.min(), finite.max()])
    counts = hist[0]

    # the statistical tests run on the histogram counts; the remaining
    # entries are simple counts over the finite values
    result.update(
        {
            "hist": hist,
            "chisq": chisquare(counts),
            "norm": normaltest(counts),
            "nneg": (finite < 0).sum(),
            "skew": skewtest(counts),
            "nuniq": finite.nunique(),
            "nzero": (finite == 0).sum(),
        }
    )

    return result
예제 #6
0
def histogram(
    srs: dd.Series,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Union[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array, da.Array]]:
    """
    Calculate "histogram" for both numerical and categorical

    Parameters
    ----------
    srs
        The column to summarise.
    bins
        Number of bins; required when the column is continuous.
    return_edges
        On the continuous path, whether the bin edges are returned as a
        third element.
    range
        Optional ``(minimum, maximum)`` for the histogram; when omitted the
        series' own minimum and maximum are computed.
    dtype
        Optional dtype hint that is forwarded to ``detect_dtype``.

    Returns
    -------
    ``(counts, centers)`` or ``(counts, centers, edges)`` for continuous
    columns, ``(counts, centers)`` for nominal columns.

    Raises
    ------
    AssertionError
        If the column is continuous and ``bins`` is None.
    UnreachableError
        If the detected dtype is neither ``Continuous`` nor ``Nominal``.
    """
    # Detect the dtype once and reuse it for both branch checks.
    detected = detect_dtype(srs, dtype)

    if is_dtype(detected, Continuous()):
        # Check the argument before dask.compute runs, so a missing bin count
        # fails immediately instead of after the min/max computation.
        assert (
            bins is not None
        ), "num_bins cannot be None if calculating numerical histograms"

        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = srs.min(axis=0), srs.max(axis=0)
        # da.histogram is given concrete bounds here, not lazy ones
        minimum, maximum = dask.compute(minimum, maximum)

        counts, edges = da.histogram(srs.to_dask_array(),
                                     bins,
                                     range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges

    if is_dtype(detected, Nominal()):
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()

        # Dask array doesn't understand the pandas dtypes such as categorical type.
        # We convert these types into str before calling into `to_dask_array`.
        if is_pandas_categorical(value_counts.index.dtype):
            centers = value_counts.index.astype("str").to_dask_array()
        else:
            centers = value_counts.index.to_dask_array()
        return (counts, centers)

    raise UnreachableError()
예제 #7
0
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Calculate stats from a datetime column

    Parameters
    ----------
    srs
        a datetime column

    Returns
    -------
    Dict[str, str]
        Overview entries: distinct count, approximate unique ratio, missing
        count and ratio, memory size, minimum and maximum.
    """
    size = srs.shape[0]  # include nan
    count = srs.count()  # exclude nan
    # nunique_approx() has error when type is datetime, so fall back to the
    # exact count. Only ordinary exceptions are caught: a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        uniq_count = srs.nunique_approx()
    except Exception:  # pylint: disable=broad-except
        uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Approximate Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(deep=True),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }

    return overview_dict
예제 #8
0
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)

    Parameters
    ----------
    srs
        the continuous column to analyse
    cfg
        configuration whose ``stats``, ``hist``, ``qqnorm``, ``insight``,
        ``kde``, ``box`` and ``value_table`` sections decide which entries
        are computed

    Returns
    -------
    Dict[str, Any]
        Only the entries required by the enabled sections of ``cfg``.
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values
    # `and` binds tighter than `or`; the parentheses only make that explicit
    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)
    # compute the density histogram
    if cfg.kde.enable:
        # The bounds are computed straight from the series: data["min"] and
        # data["max"] only exist when cfg.stats.enable is set, so reading them
        # here would raise KeyError for KDE-only configurations. One
        # dask.compute call evaluates both bounds together.
        minimum, maximum = dask.compute(srs.min(), srs.max())
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        if not math.isclose(minimum, maximum):
            data["dens"] = da.histogram(srs,
                                        cfg.kde.bins, (srs.min(), srs.max()),
                                        density=True)
            # gaussian kernel density estimate, fitted on at most 1000
            # sampled values per partition
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])),
                                   meta=srs))
        else:
            data["kde"] = None
    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))
    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
예제 #9
0
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Collect every computation needed for plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins used for both the count and density histograms
    """
    # total number of rows, counted before anything is dropped
    result: Dict[str, Any] = {"nrows": srs.shape[0]}

    # number of present (not null) values
    present = srs.dropna()
    result["npres"] = present.shape[0]

    # everything below works on the finite, non-null values only
    finite = present[~present.isin({np.inf, -np.inf})]

    # bounds shared by the count histogram and the density histogram
    minimum, maximum = finite.min(), finite.max()
    result["min"], result["max"] = minimum, maximum

    hist = da.histogram(finite, bins=bins, range=[minimum, maximum])
    counts = hist[0]
    result["hist"] = hist
    result["norm"] = normaltest(counts)

    # 99 evenly spaced quantiles from 0.01 to 0.99
    quantiles = finite.quantile(np.linspace(0.01, 0.99, 99))
    result["qntls"] = quantiles
    result["skew"] = skew(finite)

    # descriptive statistics
    result.update(
        {
            "nuniq": finite.nunique(),
            "nreals": finite.shape[0],
            "nzero": (finite == 0).sum(),
            "nneg": (finite < 0).sum(),
            "mean": finite.mean(),
            "std": finite.std(),
            "kurt": kurtosis(finite),
            "mem_use": finite.memory_usage(deep=True),
        }
    )

    result["chisq"] = chisquare(counts)

    # density histogram over the same bins and range
    result["dens"] = da.histogram(
        finite, bins=bins, range=[minimum, maximum], density=True
    )
    # gaussian kernel density estimate fitted on at most 1000 sampled values
    # per partition
    sampled = finite.map_partitions(
        lambda part: part.sample(min(1000, part.shape[0])), meta=finite
    )
    result["kde"] = gaussian_kde(sampled)

    # merge in the box plot entries computed from the quantiles
    result.update(calc_box(finite, quantiles))

    return result