Exemplo n.º 1
0
def histogram(
    arr: da.Array,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate a "histogram" for either a numerical or a categorical array.

    Parameters
    ----------
    arr
        The array to summarise; it must be one-dimensional.
    bins
        Number of bins. Required when the array is detected as continuous,
        unused when it is detected as nominal.
    return_edges
        Continuous case only: whether the bin edges are appended to the
        returned tuple.
    range
        Continuous case only: the ``(minimum, maximum)`` pair to bin over.
        When omitted, ``arr.min`` and ``arr.max`` are used instead.
    dtype
        Forwarded, together with ``arr``, to ``detect_dtype``.

    Returns
    -------
    Tuple[da.Array, ...]
        Continuous: ``(counts, centers, edges)``, or ``(counts, centers)``
        when ``return_edges`` is False. Nominal: ``(counts, centers)`` where
        ``centers`` are the distinct values and ``counts`` their frequencies.

    Raises
    ------
    ValueError
        If ``arr`` is not 1-d, if ``bins`` is None for a continuous array,
        or if the detected dtype is neither continuous nor nominal.
    """
    if len(arr.shape) != 1:
        raise ValueError("Histogram only supports 1-d array.")

    # Detect the dtype once and reuse the result for both branch checks.
    detected = detect_dtype(arr, dtype)

    if is_dtype(detected, Continuous()):
        # Reject a missing bin count before building any min/max reductions.
        if bins is None:
            raise ValueError(
                "bins cannot be None if calculating numerical histograms.")

        if range is not None:
            minimum, maximum = range
        else:
            minimum, maximum = arr.min(axis=0), arr.max(axis=0)

        counts, edges = da.histogram(arr, bins, range=[minimum, maximum])
        # Midpoint of each pair of neighbouring edges.
        centers = (edges[:-1] + edges[1:]) / 2

        if return_edges:
            return counts, centers, edges
        return counts, centers

    if is_dtype(detected, Nominal()):
        # Dask array's unique is way slower than value_counts on a Series,
        # so the array is routed through a dask Series instead.
        # See https://github.com/dask/dask/issues/2851
        value_counts = dd.from_dask_array(arr).value_counts()

        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return counts, centers

    raise ValueError(f"Unsupported dtype {arr.dtype}")
Exemplo n.º 2
0
def calc_hist_kde(
        data: da.Array, bins: int,
        bandwidth: float) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Calculate a density histogram and its corresponding kernel density
    estimate over a given series. The kernel is gaussian.

    Parameters
    ----------
    data: da.Array
        one numerical column over which to compute the histogram and kde
    bins : int
        number of bins to use in the histogram
    bandwidth: float
        bandwidth for the kde, passed to ``gaussian_kde`` as ``bw_method``

    Returns
    -------
    Tuple[pd.DataFrame, np.ndarray, np.ndarray]
        The histogram in a dataframe (columns ``intervals``, ``left``,
        ``right`` and ``freq``), the 1000 evenly spaced points between the
        minimum and maximum of the data at which the kde is evaluated,
        and the kde calculated at those points
    """
    # The extremes are needed as concrete values, both for the histogram
    # range and for the evaluation grid, so they are computed up front.
    minv, maxv = dask.compute(data.min(), data.max())
    hist_arr, bins_arr = da.histogram(data,
                                      range=[minv, maxv],
                                      bins=bins,
                                      density=True)
    # Materialise the densities and the raw values in a single compute call
    # so the graph producing ``data`` is not executed once for each of them.
    hist_arr, values = dask.compute(hist_arr, data)
    intervals = _format_bin_intervals(bins_arr)
    hist_df = pd.DataFrame({
        "intervals": intervals,
        "left": bins_arr[:-1],
        "right": bins_arr[1:],
        "freq": hist_arr,
    })
    pts_rng = np.linspace(minv, maxv, 1000)
    pdf = gaussian_kde(values, bw_method=bandwidth)(pts_rng)
    return hist_df, pts_rng, pdf