def uni_histogram(
    srs: dd.Series,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical.

    Parameters
    ----------
    srs
        The column to summarise.
    bins
        Number of bins; only used when the column is detected as continuous.
    dtype
        Optional dtype hint, forwarded unchanged to ``detect_dtype``.

    Returns
    -------
    Tuple[da.Array, ...]
        ``(counts, centers, edges)`` for a continuous column and
        ``(counts, centers)`` for a nominal one.

    Raises
    ------
    ValueError
        If the detected dtype is neither continuous nor nominal.
    """
    # Detect once and reuse the result for both branch tests instead of
    # re-running the detection for every non-continuous column.
    srs_dtype = detect_dtype(srs, dtype)

    if is_dtype(srs_dtype, Continuous()):
        counts, edges = da.histogram(srs, bins, range=[srs.min(), srs.max()])
        # the midpoint of each pair of neighbouring edges is the bin center
        centers = (edges[:-1] + edges[1:]) / 2
        return counts, centers, edges

    if is_dtype(srs_dtype, Nominal()):
        # Dask array's unique is way slower than the values_counts on Series
        # See https://github.com/dask/dask/issues/2851
        # centers, counts = da.unique(arr, return_counts=True)
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()
        centers = value_counts.index.to_dask_array()
        return (counts, centers)

    raise ValueError(f"Unsupported dtype {srs.dtype}")
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Gather the computations for one continuous column in plot(df)

    Parameters
    ----------
    srs
        The continuous column.
    cfg
        Configuration; ``cfg.hist.bins`` sets the bin count and
        ``cfg.insight.enable`` switches the insight entries on.

    Returns
    -------
    Dict[str, Any]
        Always holds ``"hist"``. With insights enabled it also holds
        ``"npres"``, ``"chisq"``, ``"norm"``, ``"skew"``, ``"nneg"``,
        ``"nuniq"``, ``"nzero"`` and ``"nreals"``.
    """
    want_insights = cfg.insight.enable
    data: Dict[str, Any] = {}

    if want_insights:
        # row count of the column as received, i.e. before infinities are
        # filtered out (presumably nulls were dropped by the caller - confirm)
        data["npres"] = srs.shape[0]

    # keep only the finite values for everything that follows
    infinite_mask = srs.isin({np.inf, -np.inf})
    srs = srs[~infinite_mask]

    hist = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))
    data["hist"] = hist

    if not want_insights:
        return data

    # the three tests all work on the bin counts, not on the raw values
    bin_counts = hist[0]
    data["chisq"] = chisquare(bin_counts)
    data["norm"] = normaltest(bin_counts)
    data["skew"] = skewtest(bin_counts)
    data["nneg"] = (srs < 0).sum()  # how many values are below zero
    data["nuniq"] = srs.nunique_approx()  # approximate distinct count
    data["nzero"] = (srs == 0).sum()  # how many values equal zero
    data["nreals"] = srs.shape[0]  # row count once infinities are gone
    return data
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Build the overview statistics of a datetime column

    Parameters
    ----------
    srs
        a datetime column

    Returns
    -------
    Dict[str, str]
        Overview entries keyed by their display label: distinct count,
        unique ratio, missing count and ratio, memory size, minimum and
        maximum.
    """
    n_total = len(srs)  # every row, missing ones included
    n_present = srs.count()  # rows holding a value
    n_distinct = srs.nunique()

    return {
        "Distinct Count": n_distinct,
        "Unique (%)": n_distinct / n_present,
        "Missing": n_total - n_present,
        "Missing (%)": 1 - (n_present / n_total),
        "Memory Size": srs.memory_usage(),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }
def uni_histogram(
    srs: dd.Series,
    srs_dtype: DType,
    cfg: Config,
) -> Tuple[da.Array, ...]:
    """Calculate "histogram" for both numerical and categorical.

    Parameters
    ----------
    srs
        The column to summarise.
    srs_dtype
        The already-detected dtype of ``srs``; it selects the branch.
    cfg
        Configuration; ``cfg.hist.bins`` is the bin count for the
        continuous case.

    Returns
    -------
    Tuple[da.Array, ...]
        ``(counts, centers, edges)`` for a ``Continuous`` dtype and
        ``(counts, centers)`` for ``Nominal``, ``GeoGraphy``,
        ``SmallCardNum`` or ``DateTime``.

    Raises
    ------
    ValueError
        For any other dtype.
    """
    if isinstance(srs_dtype, Continuous):
        counts, edges = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        left_edges, right_edges = edges[:-1], edges[1:]
        return counts, (left_edges + right_edges) / 2, edges

    if not isinstance(srs_dtype, (Nominal, GeoGraphy, SmallCardNum, DateTime)):
        raise ValueError(f"Unsupported dtype {srs.dtype}")

    # value_counts on the Series is used instead of da.unique, which is far
    # slower on dask arrays (https://github.com/dask/dask/issues/2851)
    frequencies = srs.value_counts()
    counts = frequencies.to_dask_array()
    centers = frequencies.index.to_dask_array()
    return (counts, centers)
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Gather the computations for a numerical column in plot(df)

    Every entry is produced unconditionally; per-insight config switches
    are not wired in here.

    Parameters
    ----------
    srs
        the column the histogram and insights are computed over
    bins
        number of bins in the histogram

    Returns
    -------
    Dict[str, Any]
        Entries ``"npres"``, ``"ninf"``, ``"hist"``, ``"chisq"``,
        ``"norm"``, ``"nneg"``, ``"skew"``, ``"nuniq"`` and ``"nzero"``.
    """
    # flag the infinite values once: the mask gives both the count and the filter
    inf_mask = srs.isin({np.inf, -np.inf})
    finite = srs[~inf_mask]

    hist = da.histogram(finite, bins=bins, range=[finite.min(), finite.max()])
    # the uniformity, normality and skewness tests all take the bin counts
    bin_counts = hist[0]

    return {
        "npres": srs.shape[0],  # row count of the column as received
        "ninf": inf_mask.sum(),
        "hist": hist,
        "chisq": chisquare(bin_counts),
        "norm": normaltest(bin_counts),
        "nneg": (finite < 0).sum(),
        "skew": skewtest(bin_counts),
        "nuniq": finite.nunique(),
        "nzero": (finite == 0).sum(),
    }
def histogram(
    srs: dd.Series,
    bins: Optional[int] = None,
    return_edges: bool = True,
    range: Optional[Tuple[int, int]] = None,  # pylint: disable=redefined-builtin
    dtype: Optional[DTypeDef] = None,
) -> Union[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array, da.Array]]:
    """
    Calculate "histogram" for both numerical and categorical

    Parameters
    ----------
    srs
        The column to summarise.
    bins
        Number of bins. Required when the column is continuous, unused
        otherwise.
    return_edges
        For a continuous column, whether the bin edges are returned as a
        third element.
    range
        Optional ``(minimum, maximum)`` for the continuous histogram. When
        omitted, the bounds are computed from ``srs``.
    dtype
        Optional dtype hint, forwarded unchanged to ``detect_dtype``.

    Returns
    -------
    tuple
        ``(counts, centers, edges)`` or ``(counts, centers)`` for a
        continuous column, depending on ``return_edges``;
        ``(counts, centers)`` for a nominal column.

    Raises
    ------
    AssertionError
        If the column is continuous and ``bins`` is None.
    UnreachableError
        If the detected dtype is neither continuous nor nominal.
    """
    # Detect once; both branch tests look at the same result.
    srs_dtype = detect_dtype(srs, dtype)

    if is_dtype(srs_dtype, Continuous()):
        # Validate before the eager min/max computation below so that a bad
        # call fails without touching the data. Raised explicitly (same
        # exception type as the former ``assert``) so the check also
        # survives ``python -O``.
        if bins is None:
            raise AssertionError(
                "num_bins cannot be None if calculating numerical histograms"
            )

        if range is not None:
            minimum, maximum = range
        else:
            # da.histogram is handed concrete bounds, so evaluate them now
            minimum, maximum = dask.compute(srs.min(axis=0), srs.max(axis=0))

        counts, edges = da.histogram(srs.to_dask_array(), bins, range=[minimum, maximum])
        centers = (edges[:-1] + edges[1:]) / 2

        if not return_edges:
            return counts, centers
        return counts, centers, edges

    if is_dtype(srs_dtype, Nominal()):
        value_counts = srs.value_counts()
        counts = value_counts.to_dask_array()

        # Dask array doesn't understand the pandas dtypes such as categorical type.
        # We convert these types into str before calling into `to_dask_array`.
        if is_pandas_categorical(value_counts.index.dtype):
            centers = value_counts.index.astype("str").to_dask_array()
        else:
            centers = value_counts.index.to_dask_array()
        return (counts, centers)

    raise UnreachableError()
def calc_stats_dt(srs: dd.Series) -> Dict[str, str]:
    """
    Calculate stats from a datetime column

    Parameters
    ----------
    srs
        a datetime column

    Returns
    -------
    Dict[str, str]
        Overview entries keyed by their display label. NOTE: the values are
        the raw results of the series calls (counts, ratios, minimum and
        maximum), not strings, despite the annotation.
    """
    size = srs.shape[0]  # include nan
    count = srs.count()  # exclude nan
    # nunique_approx() has error when type is datetime, so fall back to the
    # exact count. Only Exception is caught: a bare ``except`` would also
    # swallow KeyboardInterrupt and SystemExit.
    try:
        uniq_count = srs.nunique_approx()
    except Exception:  # pylint: disable=broad-except
        uniq_count = srs.nunique()
    overview_dict = {
        "Distinct Count": uniq_count,
        "Approximate Unique (%)": uniq_count / count,
        "Missing": size - count,
        "Missing (%)": 1 - (count / size),
        "Memory Size": srs.memory_usage(deep=True),
        "Minimum": srs.min(),
        "Maximum": srs.max(),
    }

    return overview_dict
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)

    Parameters
    ----------
    srs
        The continuous column to analyse.
    cfg
        Configuration whose ``*.enable`` switches (stats, hist, qqnorm,
        insight, kde, box, value_table) select which entries are produced.

    Returns
    -------
    Dict[str, Any]
        Mapping from intermediate name to its value. Only the entries
        needed by the enabled features are present.
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values

    # `and` binds tighter than `or`; the parentheses only make that explicit
    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])

    # compute only the required amount of quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])

    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)

    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        # The bounds are taken from the series itself: data["min"] and
        # data["max"] only exist when cfg.stats.enable is set, so reading
        # them here raised KeyError for kde-without-stats configurations.
        srs_min, srs_max = dask.compute(srs.min(), srs.max())
        if not math.isclose(srs_min, srs_max):
            data["dens"] = da.histogram(
                srs, cfg.kde.bins, (srs.min(), srs.max()), density=True
            )
            # gaussian kernel density estimate, fitted on at most 1000 rows
            # sampled from each partition
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
            )
        else:
            data["kde"] = None

    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))

    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            # the value counts are already there, so use them for an exact count
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Bundle every computation needed for plot(df, Continuous())

    All entries are produced unconditionally; the per-feature config
    switches (stats, hist, qqplot, insights, box) are not wired in here.

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram

    Returns
    -------
    Dict[str, Any]
        The entries listed in the body, followed by whatever ``calc_box``
        contributes.
    """
    total_rows = srs.shape[0]

    # strip missing values first, then the infinite ones
    present = srs.dropna()
    finite = present[~present.isin({np.inf, -np.inf})]

    # bounds and histogram are shared by several entries below
    lower, upper = finite.min(), finite.max()
    hist = da.histogram(finite, bins=bins, range=[lower, upper])
    bin_counts = hist[0]

    data: Dict[str, Any] = {
        "nrows": total_rows,
        "npres": present.shape[0],  # rows left after dropping missing values
        "min": lower,
        "max": upper,
        "hist": hist,
        "norm": normaltest(bin_counts),
        "qntls": finite.quantile(np.linspace(0.01, 0.99, 99)),
        "skew": skew(finite),
        "nuniq": finite.nunique(),
        "nreals": finite.shape[0],  # rows left after dropping infinities too
        "nzero": (finite == 0).sum(),
        "nneg": (finite < 0).sum(),
        "mean": finite.mean(),
        "std": finite.std(),
        "kurt": kurtosis(finite),
        "mem_use": finite.memory_usage(deep=True),
        "chisq": chisquare(bin_counts),
        # same bins and bounds as "hist", normalised to a density
        "dens": da.histogram(finite, bins=bins, range=[lower, upper], density=True),
        # kernel density estimate fitted on at most 1000 rows sampled from
        # each partition
        "kde": gaussian_kde(
            finite.map_partitions(
                lambda part: part.sample(min(1000, part.shape[0])), meta=finite
            )
        ),
    }

    data.update(calc_box(finite, data["qntls"]))
    return data