def test_percentile():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])
    assert_eq(
        da.percentile(d, qs, interpolation="midpoint"),
        np.array([1, 1, 1], dtype=d.dtype),
    )

    x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, interpolation="midpoint")
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))

    # Currently fails, tokenize(cupy.array(...)) is not deterministic.
    # See https://github.com/dask/dask/issues/6718
    # assert same_keys(
    #     da.percentile(d, qs),
    #     da.percentile(d, qs)
    # )

    assert not same_keys(
        da.percentile(d, qs, interpolation="midpoint"),
        da.percentile(d, [0, 50], interpolation="midpoint"),
    )

def test_unknown_chunk_sizes(method):
    x = da.random.random(1000, chunks=(100,))
    x._chunks = ((np.nan,) * 10,)

    result = da.percentile(x, 50, method=method).compute()
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60], method=method).compute()
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b

def test_unknown_chunk_sizes():
    x = da.random.random(1000, chunks=(100,))
    x._chunks = ((np.nan,) * 10,)

    result = da.percentile(x, 50).compute()
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60]).compute()
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b

def _calculate_summary_statistics(self):
    data = self._lazy_data()
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25]),
        da.percentile(_raveled, [50]),
        da.percentile(_raveled, [75]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max

def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])

def test_percentile():
    d = da.ones((16,), blockshape=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, blockshape=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, blockshape=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])

def test_percentiles_with_unknown_chunk_sizes():
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random(1000, chunks=(100,))
    x._chunks = ((np.nan,) * 10,)

    result = da.percentile(x, 50, interpolation="midpoint").compute()
    assert type(result) == cupy.core.core.ndarray
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60], interpolation="midpoint").compute()
    assert type(a) == cupy.core.core.ndarray
    assert type(b) == cupy.core.core.ndarray
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b

def test_percentiles_with_empty_arrays():
    x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),))
    res = da.percentile(x, [10, 50, 90], interpolation="midpoint")

    assert type(res._meta) == cupy.ndarray
    assert_eq(res, res)  # Check that _meta and computed arrays match types
    assert_eq(res, np.array([1, 1, 1], dtype=x.dtype), check_type=False)

def _calculate_summary_statistics(self, rechunk=True):
    if rechunk is True:
        # Use dask auto rechunk instead of HyperSpy's one, which should be
        # better suited for these operations
        rechunk = "dask_auto"
    data = self._lazy_data(rechunk=rechunk)
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25]),
        da.percentile(_raveled, [50]),
        da.percentile(_raveled, [75]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max

def test_percentile_with_categoricals():
    try:
        import pandas as pd
    except ImportError:
        return
    x0 = pd.Categorical(["Alice", "Bob", "Charlie", "Dennis", "Alice", "Alice"])
    x1 = pd.Categorical(["Alice", "Bob", "Charlie", "Dennis", "Alice", "Alice"])

    dsk = {("x", 0): x0, ("x", 1): x1}
    x = da.Array(dsk, "x", chunks=((6, 6),))

    p = da.percentile(x, [50])
    assert (p.compute().categories == x0.categories).all()
    assert (p.compute().codes == [0]).all()
    assert same_keys(da.percentile(x, [50]), da.percentile(x, [50]))

def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "RobustScaler":
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        partition_lengths = X.map_partitions(len).compute()
        dtype = np.find_common_type(X.dtypes, [])
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(block.values, shape=(length, n_columns), dtype=dtype)
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    quantiles: Any = [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()
    self.center_: List[float] = quantiles[:, 1]
    self.scale_: List[float] = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
    self.n_features_in_: int = X.shape[1]
    return self

def fit(self, X, y=None):
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" %
                         str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        partition_lengths = X.map_partitions(len).compute()
        dtype = np.find_common_type(X.dtypes, [])
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(block.values, shape=(length, n_columns),
                                dtype=dtype)
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()
    self.center_ = quantiles[:, 1]
    self.scale_ = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
    return self

def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------
    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)

    dsk = dict(((new_name, i),
                (dataframe_from_ctable, x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result

def test_percentiles_with_empty_q():
    x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),))
    result = da.percentile(x, [], interpolation="midpoint")

    assert type(result._meta) == cupy.core.core.ndarray
    assert_eq(result, result)  # Check that _meta and computed arrays match types
    assert_eq(result, np.array([], dtype=x.dtype))

def test_percentiles_with_scaler_percentile(internal_method, q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.ones((16,), chunks=(4,))
    assert_eq(
        da.percentile(d, q, internal_method=internal_method),
        np.array([1], dtype=d.dtype),
    )

def test_percentile(method):
    d = da.ones((16,), chunks=(4,))
    qs = [0, 50, 100]
    assert_eq(da.percentile(d, qs, method=method), np.array([1, 1, 1], dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, method=method)
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))
    assert same_keys(
        da.percentile(d, qs, method=method), da.percentile(d, qs, method=method)
    )
    assert not same_keys(
        da.percentile(d, qs, method=method), da.percentile(d, [0, 50], method=method)
    )

    if method != "tdigest":
        x = np.array(["a", "a", "d", "d", "d", "e"])
        d = da.from_array(x, chunks=(3,))
        assert_eq(
            da.percentile(d, [0, 50, 100]), np.array(["a", "d", "e"], dtype=x.dtype)
        )

def test_percentiles_with_scaler_percentile(q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    result = da.percentile(d, q, interpolation="midpoint")

    assert type(result._meta) == cupy.core.core.ndarray
    assert_eq(result, result)  # Check that _meta and computed arrays match types
    assert_eq(result, np.array([1], dtype=d.dtype))

def test_percentile():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])
    assert_eq(
        da.percentile(d, qs, interpolation="midpoint"),
        np.array([1, 1, 1], dtype=d.dtype),
    )

    x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, interpolation="midpoint")
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))

    assert not same_keys(
        da.percentile(d, qs, interpolation="midpoint"),
        da.percentile(d, [0, 50], interpolation="midpoint"),
    )

def _run_dask_numpy_quantile(data, k):
    w = 100.0 / k
    p = da.arange(w, 100 + w, w)

    if p[-1] > 100.0:
        p[-1] = 100.0

    q = da.percentile(data.flatten(), p)
    q = da.unique(q)
    return q

def statistics(self, data, pca_stats=None):
    # set headers
    if pca_stats:  # for pca
        if pca_stats["eigenvals"] is not None:
            self.stats_header.setText("Eigenvalue: {} ({}%)".format(
                round(pca_stats["eigenvals"][self.pc_id - 1], 2),
                round(pca_stats["eigenvals_%"][self.pc_id - 1], 2)))
            self.stats_header.setToolTip(
                "Shows the dispersion of the data with respect to this component")
        else:
            self.stats_header.setText("Eigenvalue: --")
            self.stats_header.setToolTip(
                "Only available when the components are computed with the plugin")
    else:  # for aoi
        self.stats_header.setText("Pixels in AOI: {}".format(
            round(data.size if data.size > 1 else 0, 2)))
        self.stats_header.setToolTip("")

    # restore or compute the statistics
    if self.QCBox_StatsLayer.currentText() == self.pc_name and self.stats_pc is not None:
        min, max, std, p25, p50, p75 = self.stats_pc
    else:
        da_data = da.from_array(data, chunks=(8000000,))
        min = da.min(da_data).compute()
        max = da.max(da_data).compute()
        std = da.std(da_data).compute()
        p25 = da.percentile(da_data, 25).compute()[0]
        p50 = da.percentile(da_data, 50).compute()[0]
        p75 = da.percentile(da_data, 75).compute()[0]
        if self.QCBox_StatsLayer.currentText() == self.pc_name:
            self.stats_pc = (min, max, std, p25, p50, p75)

    # set in dialog
    self.stats_min.setText(str(round(min, 2)))
    self.stats_max.setText(str(round(max, 2)))
    self.stats_std.setText(str(round(std, 2)))
    self.stats_p25.setText(str(round(p25, 2)))
    self.stats_p50.setText(str(round(p50, 2)))
    self.stats_p75.setText(str(round(p75, 2)))

def dasky_freedman_bin_width(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N`\ th percentile of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width, scotts_bin_width, astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()
    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx

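# A minimal usage sketch for dasky_freedman_bin_width above; the random input
# array is an illustrative assumption, not from the original source.
import dask.array as da

samples = da.random.random(250000, chunks=25000)
width = dasky_freedman_bin_width(samples, return_bins=False)
print(width)
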
def stadistics(self):
    headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max",
               "nonzero", "nonan", "unique", "dtype"]
    self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
    table = []
    for group, (dtype, _) in self.dtypes.fields.items():
        values = dict()
        values["dtype"] = dtype
        values["group"] = group
        darray = self.data[group].da
        if dtype == np.dtype(float) or dtype == np.dtype(int):
            da_mean = da.around(darray.mean(), decimals=3)
            da_std = da.around(darray.std(), decimals=3)
            da_min = da.around(darray.min(), decimals=3)
            da_max = da.around(darray.max(), decimals=3)
            result = dask.compute([da_mean, da_std, da_min, da_max])[0]
            values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
            values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
            values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
            values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
            if len(self.shape[group]) == 1:
                da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                result = da_percentile.compute()
                values["25%"] = result[0]
                values["50%"] = result[1]
                values["75%"] = result[2]
            else:
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
            values["nonzero"] = da.count_nonzero(darray).compute()
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = "-"
        else:
            values["mean"] = "-"
            values["std dev"] = "-"
            values["min"] = "-"
            values["max"] = "-"
            values["25%"] = "-"
            values["50%"] = "-"
            values["75%"] = "-"
            values["nonzero"] = "-"
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
            values["unique"] = vunique

        row = []
        for column in headers:
            row.append(values[column])
        table.append(row)

    print("# rows {}".format(self.shape[0]))
    return tabulate(table, headers)

def create_hv_dataset(ddf, stats, percentile=(1, 99)):
    _idNames = ("patch", "tract", "filter")
    _kdims = ("ra", "dec", "psfMag")
    _flags = [c for c in ddf.columns if ddf[c].dtype == np.dtype("bool")]
    kdims = []
    vdims = []
    for c in ddf.columns:
        if c in _kdims or c in _idNames or c in _flags:
            if c in ("ra", "dec", "psfMag"):
                cmin, cmax = stats[c]["min"].min(), stats[c]["max"].max()
                c = hv.Dimension(c, range=(cmin, cmax))
            elif c in ("filter", "patch"):
                cvalues = list(ddf[c].unique())
                c = hv.Dimension(c, values=cvalues)
            elif ddf[c].dtype.kind == "b":
                c = hv.Dimension(c, values=[True, False])
            kdims.append(c)
        else:
            if percentile is not None:
                p0, p1 = percentile
                if f"{p0}%" in stats.index and f"{p1}%" in stats.index:
                    cmin, cmax = stats[c][f"{p0}%"].min(), stats[c][f"{p1}%"].max()
                else:
                    print("percentiles not found in stats, computing")
                    darray = ddf[c].values
                    cmin, cmax = da.compute(
                        da.percentile(darray, p0)[0], da.percentile(darray, p1)[0])
            else:
                cmin, cmax = stats[c]["min"].min(), stats[c]["max"].max()
            c = hv.Dimension(c, range=(cmin, cmax))
            vdims.append(c)
    return hv.Dataset(ddf, kdims=kdims, vdims=vdims)

def gram_rbf(X, threshold=1.0):
    if type(X) == torch.Tensor:
        dot_products = X @ X.t()
        sq_norms = dot_products.diag()
        sq_distances = -2 * dot_products + sq_norms[:, None] + sq_norms[None, :]
        sq_median_distance = sq_distances.median()
        return torch.exp(-sq_distances / (2 * threshold**2 * sq_median_distance))
    elif type(X) == da.Array:
        dot_products = X @ X.T
        sq_norms = da.diag(dot_products)
        sq_distances = -2 * dot_products + sq_norms[:, None] + sq_norms[None, :]
        sq_median_distance = da.percentile(sq_distances.ravel(), 50)
        return da.exp(-sq_distances / (2 * threshold**2 * sq_median_distance))
    else:
        raise ValueError

def _band_hist(band_data):
    cdf = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
    if approximate:
        # need a 1D array
        flat_data = band_data.ravel()
        # replace with nanpercentile in the future, if available
        # dask < 0.17 returns all NaNs for this
        bins = da.percentile(flat_data[da.notnull(flat_data)], cdf * 100.)
    else:
        bins = dask.delayed(np.nanpercentile)(band_data, cdf * 100.)
        bins = da.from_delayed(bins, shape=(nwidth,), dtype=cdf.dtype)
    res = dask.delayed(np.interp)(band_data, bins, cdf)
    res = da.from_delayed(res, shape=band_data.shape, dtype=band_data.dtype)
    return res

def _freedman_bw_dask(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N`\ th percentile of the data, and
    :math:`n` is the number of data points.
    """
    if not isinstance(data, da.Array):
        raise TypeError("Expected a dask array")

    if data.ndim != 1:
        data = data.flatten()

    n = data.size
    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * n**(-1.0 / 3.0)
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = max(1, np.ceil((mx - mn) / c_dx))
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx

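# A minimal usage sketch for _freedman_bw_dask above; the normal sample and the
# histogram call are illustrative assumptions, not from the original source.
# da.percentile supplies the q25/q75 estimates that drive the bin width.
import numpy as np
import dask.array as da

x = da.random.normal(size=100000, chunks=10000)
width, edges = _freedman_bw_dask(x)
counts, _ = np.histogram(x.compute(), bins=edges)
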
def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])

    assert same_keys(da.percentile(d, [0, 50, 100]),
                     da.percentile(d, [0, 50, 100]))
    assert not same_keys(da.percentile(d, [0, 50, 100]),
                         da.percentile(d, [0, 50]))

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])

def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array([1, 1, 1], dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, [0, 50, 100])
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array([0, 5, 20], dtype=result.dtype))
    assert same_keys(da.percentile(d, [0, 50, 100]),
                     da.percentile(d, [0, 50, 100]))
    assert not same_keys(da.percentile(d, [0, 50, 100]),
                         da.percentile(d, [0, 50]))

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array(['a', 'd', 'e'], dtype=x.dtype))

def single_channel_percentile_norm(data: da.core.Array,
                                   min_p: float = 50.0,
                                   max_p: float = 99.8,
                                   **kwargs) -> da.core.Array:
    # Enforce shape
    if len(data.shape) > 4:
        raise exceptions.InvalidShapeError(len(data.shape), 4)

    # Get the norm by values
    norm_by = da.percentile(data.flatten(), [min_p, max_p]).compute()

    # Norm
    normed = (data - norm_by[0]) / (norm_by[1] - norm_by[0])

    # Clip any values outside of 0 and 1
    clipped = da.clip(normed, 0, 1)

    # Scale them between 0 and 255
    return clipped * 255

def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------
    x : bcolz.ctable
        Input data
    chunksize : int, optional
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable, x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result

def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size (rows) of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable, x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result

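# A minimal usage sketch for from_bcolz above; the on-disk path, chunksize, and
# index column are illustrative assumptions, not from the original source, and
# bcolz must be installed. da.percentile is used internally to pick the
# index-based partition divisions.
import bcolz

ct = bcolz.ctable(rootdir="/data/events.bcolz")  # open an existing on-disk ctable
ddf = from_bcolz(ct, chunksize=100000, index="timestamp")
print(ddf.head())
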
def _dense_fit(self, X, random_state):
    references = self.references_ * 100
    quantiles = [da.percentile(col, references) for col in X.T]
    self.quantiles_, = compute(da.vstack(quantiles).T)

def _dense_fit(self, X: Union[ArrayLike, DataFrameType], random_state: int) -> Union[ArrayLike, DataFrameType]: references = self.references_ * 100 quantiles = [da.percentile(col, references) for col in X.T] (self.quantiles_, ) = compute(da.vstack(quantiles).T) return None
def test_percentiles_with_empty_arrays(method):
    x = da.ones(10, chunks=((5, 0, 5),))
    assert_eq(
        da.percentile(x, [10, 50, 90], method=method),
        np.array([1, 1, 1], dtype=x.dtype),
    )

def test_percentiles_with_scaler_percentile(method, q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.ones((16,), chunks=(4,))
    assert_eq(da.percentile(d, q, method=method), np.array([1], dtype=d.dtype))

def test_percentile_tokenize():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])

    assert same_keys(da.percentile(d, qs), da.percentile(d, qs))

def test_percentiles_with_empty_arrays():
    x = da.ones(10, blockdims=((5, 0, 5),))
    assert da.percentile(x, [10, 50, 90]).compute().tolist() == [1, 1, 1]