Example #1
def test_percentile():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])

    assert_eq(
        da.percentile(d, qs, interpolation="midpoint"),
        np.array([1, 1, 1], dtype=d.dtype),
    )

    x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, interpolation="midpoint")
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))

    # Currently fails, tokenize(cupy.array(...)) is not deterministic.
    # See https://github.com/dask/dask/issues/6718
    # assert same_keys(
    #     da.percentile(d, qs),
    #     da.percentile(d, qs)
    # )

    assert not same_keys(
        da.percentile(d, qs, interpolation="midpoint"),
        da.percentile(d, [0, 50], interpolation="midpoint"),
    )
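
For context, da.percentile operates on one-dimensional arrays (the examples here ravel or flatten their inputs first), and the default method approximates the result: percentiles are taken per chunk and then merged. A minimal CPU-only sketch, assuming only numpy and dask are installed; the values match the test above:

import numpy as np
import dask.array as da

x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
d = da.from_array(x, chunks=(3,))

# Approximate percentiles: computed per chunk, then merged across chunks.
print(da.percentile(d, [0, 50, 100]).compute())  # [ 0.  5. 20.]
# Exact reference from numpy on the in-memory array:
print(np.percentile(x, [0, 50, 100]))            # [ 0.  5. 20.]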
Example #2
def test_unknown_chunk_sizes(method):
    x = da.random.random(1000, chunks=(100,))
    x._chunks = ((np.nan,) * 10,)

    result = da.percentile(x, 50, method=method).compute()
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60], method=method).compute()
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b
Example #3
def test_unknown_chunk_sizes():
    x = da.random.random(1000, chunks=(100, ))
    x._chunks = ((np.nan, ) * 10, )

    result = da.percentile(x, 50).compute()
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60]).compute()
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b
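
Both tests pass because the default method never needs chunk lengths up front: each chunk computes its own percentiles, and sizes are only resolved when the graph runs. The 0.1–0.9 bounds are loose on purpose; with 1000 uniform samples the median lands near 0.5. A small sketch of the same idea:

import numpy as np
import dask.array as da

x = da.random.random(1000, chunks=(100,))
x._chunks = ((np.nan,) * 10,)  # discard the chunk-size metadata, as above

# Percentiles still work; chunk sizes are discovered at compute time.
print(da.percentile(x, 50).compute())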
Example #4
 def _calculate_summary_statistics(self):
     data = self._lazy_data()
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #5
def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])
Example #7
def test_percentile():
    d = da.ones((16,), blockshape=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, blockshape=(3,))

    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, blockshape=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])
Example #8
def test_percentiles_with_unknown_chunk_sizes():
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random(1000, chunks=(100,))
    x._chunks = ((np.nan,) * 10,)

    result = da.percentile(x, 50, interpolation="midpoint").compute()
    assert type(result) == cupy.core.core.ndarray
    assert 0.1 < result < 0.9

    a, b = da.percentile(x, [40, 60], interpolation="midpoint").compute()
    assert type(a) == cupy.core.core.ndarray
    assert type(b) == cupy.core.core.ndarray
    assert 0.1 < a < 0.9
    assert 0.1 < b < 0.9
    assert a < b
Example #9
def test_percentiles_with_empty_arrays():
    x = da.from_array(cupy.ones(10), chunks=((5, 0, 5), ))
    res = da.percentile(x, [10, 50, 90], interpolation="midpoint")

    assert type(res._meta) == cupy.ndarray
    assert_eq(res, res)  # Check that _meta and computed arrays match types
    assert_eq(res, np.array([1, 1, 1], dtype=x.dtype), check_type=False)
Example #10
 def _calculate_summary_statistics(self, rechunk=True):
     if rechunk is True:
         # Use dask's auto rechunk instead of HyperSpy's, which should be
         # better for these operations
         rechunk = "dask_auto"
     data = self._lazy_data(rechunk=rechunk)
     _raveled = data.ravel()
     _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
         da.nanmean(data),
         da.nanstd(data),
         da.nanmin(data),
         da.percentile(_raveled, [25, ]),
         da.percentile(_raveled, [50, ]),
         da.percentile(_raveled, [75, ]),
         da.nanmax(data), )
     return _mean, _std, _min, _q1, _q2, _q3, _max
Example #11
def test_percentile_with_categoricals():
    try:
        import pandas as pd
    except ImportError:
        return
    x0 = pd.Categorical(["Alice", "Bob", "Charlie", "Dennis", "Alice", "Alice"])
    x1 = pd.Categorical(["Alice", "Bob", "Charlie", "Dennis", "Alice", "Alice"])

    dsk = {("x", 0): x0, ("x", 1): x1}

    x = da.Array(dsk, "x", chunks=((6, 6),))

    p = da.percentile(x, [50])
    assert (p.compute().categories == x0.categories).all()
    assert (p.compute().codes == [0]).all()
    assert same_keys(da.percentile(x, [50]), da.percentile(x, [50]))
Example #12
    def fit(
        self,
        X: Union[ArrayLike, DataFrameType],
        y: Optional[Union[ArrayLike, SeriesType]] = None,
    ) -> "RobustScaler":
        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" %
                             str(self.quantile_range))

        if isinstance(X, dd.DataFrame):
            n_columns = len(X.columns)
            partition_lengths = X.map_partitions(len).compute()
            dtype = np.find_common_type(X.dtypes, [])
            blocks = X.to_delayed()
            X = da.vstack([
                da.from_delayed(block.values,
                                shape=(length, n_columns),
                                dtype=dtype)
                for block, length in zip(blocks, partition_lengths)
            ])

        quantiles: Any = [
            da.percentile(col, [q_min, 50.0, q_max]) for col in X.T
        ]
        quantiles = da.vstack(quantiles).compute()
        self.center_: List[float] = quantiles[:, 1]
        self.scale_: List[float] = quantiles[:, 2] - quantiles[:, 0]
        self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
        self.n_features_in_: int = X.shape[1]
        return self
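
The heart of fit is the per-column percentile pass. A standalone sketch of just that step, assuming a 2-D dask array X; the names q_min, q_max, center, and scale mirror the attributes above:

import dask.array as da

X = da.random.random((1000, 3), chunks=(100, 3))
q_min, q_max = 25.0, 75.0

# One da.percentile call per column; iterating X.T yields 1-D columns.
quantiles = da.vstack(
    [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]
).compute()

center = quantiles[:, 1]                   # per-column median
scale = quantiles[:, 2] - quantiles[:, 0]  # per-column interquartile range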
Example #14
    def fit(self, X, y=None):
        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

        if isinstance(X, dd.DataFrame):
            n_columns = len(X.columns)
            partition_lengths = X.map_partitions(len).compute()
            dtype = np.find_common_type(X.dtypes, [])
            blocks = X.to_delayed()
            X = da.vstack(
                [
                    da.from_delayed(
                        block.values, shape=(length, n_columns), dtype=dtype
                    )
                    for block, length in zip(blocks, partition_lengths)
                ]
            )

        quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
        quantiles = da.vstack(quantiles).compute()
        self.center_ = quantiles[:, 1]
        self.scale_ = quantiles[:, 2] - quantiles[:, 0]
        self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
        return self
Example #16
def test_percentiles_with_empty_q():
    x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),))
    result = da.percentile(x, [], interpolation="midpoint")

    assert type(result._meta) == cupy.core.core.ndarray
    assert_eq(result, result)  # Check that _meta and computed arrays match types
    assert_eq(result, np.array([], dtype=x.dtype))
Example #17
File: io.py Project: ogrisel/dask
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #18
def test_percentile_with_categoricals():
    try:
        import pandas as pd
    except ImportError:
        return
    x0 = pd.Categorical(['Alice', 'Bob', 'Charlie', 'Dennis', 'Alice', 'Alice'])
    x1 = pd.Categorical(['Alice', 'Bob', 'Charlie', 'Dennis', 'Alice', 'Alice'])

    dsk = {('x', 0): x0, ('x', 1): x1}

    x = da.Array(dsk, 'x', chunks=((6, 6),))

    p = da.percentile(x, [50])
    assert (p.compute().categories == x0.categories).all()
    assert (p.compute().codes == [0]).all()
    assert same_keys(da.percentile(x, [50]),
                     da.percentile(x, [50]))
Example #19
def test_percentile_with_categoricals():
    try:
        import pandas as pd
    except ImportError:
        return
    x0 = pd.Categorical(['Alice', 'Bob', 'Charlie', 'Dennis', 'Alice', 'Alice'])
    x1 = pd.Categorical(['Alice', 'Bob', 'Charlie', 'Dennis', 'Alice', 'Alice'])

    dsk = {('x', 0): x0, ('x', 1): x1}

    x = da.Array(dsk, 'x', chunks=((6, 6),))

    p = da.percentile(x, [50])
    assert (p.compute().categories == x0.categories).all()
    assert (p.compute().codes == [0]).all()
    assert same_keys(da.percentile(x, [50]),
                     da.percentile(x, [50]))
Example #20
def test_percentiles_with_scaler_percentile(internal_method, q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.ones((16, ), chunks=(4, ))
    assert_eq(
        da.percentile(d, q, internal_method=internal_method),
        np.array([1], dtype=d.dtype),
    )
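
Note that a scalar q still produces a length-1 array rather than a scalar, which is why Example #25 further down indexes each computed result with [0]. A quick sketch:

import dask.array as da

d = da.ones((16,), chunks=(4,))
p = da.percentile(d, 50)  # scalar q -> dask array of shape (1,)
print(p.compute())        # [1.]
print(p.compute()[0])     # 1.0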
Example #21
def test_percentile(method):
    d = da.ones((16,), chunks=(4,))
    qs = [0, 50, 100]

    assert_eq(da.percentile(d, qs, method=method), np.array([1, 1, 1], dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, method=method)
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))

    assert same_keys(
        da.percentile(d, qs, method=method), da.percentile(d, qs, method=method)
    )
    assert not same_keys(
        da.percentile(d, qs, method=method), da.percentile(d, [0, 50], method=method)
    )

    if method != "tdigest":
        x = np.array(["a", "a", "d", "d", "d", "e"])
        d = da.from_array(x, chunks=(3,))
        assert_eq(
            da.percentile(d, [0, 50, 100]), np.array(["a", "d", "e"], dtype=x.dtype)
        )
Example #22
def test_percentiles_with_scaler_percentile(q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    result = da.percentile(d, q, interpolation="midpoint")

    assert type(result._meta) == cupy.core.core.ndarray
    assert_eq(result, result)  # Check that _meta and computed arrays match types
    assert_eq(result, np.array([1], dtype=d.dtype))
Example #23
def test_percentile():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])

    assert_eq(
        da.percentile(d, qs, interpolation="midpoint"),
        np.array([1, 1, 1], dtype=d.dtype),
    )

    x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, interpolation="midpoint")
    assert_eq(result, np.array([0, 5, 20], dtype=result.dtype))

    assert not same_keys(
        da.percentile(d, qs, interpolation="midpoint"),
        da.percentile(d, [0, 50], interpolation="midpoint"),
    )
Example #24
def _run_dask_numpy_quantile(data, k):
    w = 100.0 / k
    p = da.arange(w, 100 + w, w)

    if p[-1] > 100.0:
        p[-1] = 100.0

    q = da.percentile(data.flatten(), p)
    q = da.unique(q)
    return q
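
This helper derives k quantile class breaks. The same computation sketched with plain numpy percentile edges; k and the data are illustrative:

import numpy as np
import dask.array as da

data = da.random.random((100, 100), chunks=(50, 50))
k = 5

w = 100.0 / k
p = np.arange(w, 100.0 + w, w)  # [20., 40., 60., 80., 100.]
p[-1] = min(p[-1], 100.0)       # guard against float drift past 100

# Flatten to 1-D, take the percentiles, and drop duplicate edges.
breaks = da.unique(da.percentile(data.flatten(), p)).compute()
print(breaks)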
Example #25
 def statistics(self, data, pca_stats=None):
     # set headers
     if pca_stats:  # for pca
         if pca_stats["eigenvals"] is not None:
             self.stats_header.setText("Eigenvalue: {} ({}%)".format(
                 round(pca_stats["eigenvals"][self.pc_id - 1], 2),
                 round(pca_stats["eigenvals_%"][self.pc_id - 1], 2)))
             self.stats_header.setToolTip(
                 "It shows how are the dispersion of the data with respect to its component"
             )
         else:
             self.stats_header.setText("Eigenvalue: --")
             self.stats_header.setToolTip(
                 "Is only available when the components are computed with the plugin"
             )
     else:  # for aoi
         self.stats_header.setText("Pixels in AOI: {}".format(
             round(data.size if data.size > 1 else 0, 2)))
         self.stats_header.setToolTip("")
     # restore or compute the statistics
     if (self.QCBox_StatsLayer.currentText() == self.pc_name
             and self.stats_pc is not None):
         min, max, std, p25, p50, p75 = self.stats_pc
     else:
         da_data = da.from_array(data, chunks=(8000000, ))
         min = da.min(da_data).compute()
         max = da.max(da_data).compute()
         std = da.std(da_data).compute()
         p25 = da.percentile(da_data, 25).compute()[0]
         p50 = da.percentile(da_data, 50).compute()[0]
         p75 = da.percentile(da_data, 75).compute()[0]
         if self.QCBox_StatsLayer.currentText() == self.pc_name:
             self.stats_pc = (min, max, std, p25, p50, p75)
     # set in dialog
     self.stats_min.setText(str(round(min, 2)))
     self.stats_max.setText(str(round(max, 2)))
     self.stats_std.setText(str(round(std, 2)))
     self.stats_p25.setText(str(round(p25, 2)))
     self.stats_p50.setText(str(round(p50, 2)))
     self.stats_p75.setText(str(round(p75, 2)))
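
Each .compute() above walks the graph separately; Examples #4 and #10 batch everything into one da.compute call instead, which shares intermediate results. A sketch of the batched form of this block, with a random stand-in for the AOI pixel values:

import numpy as np
import dask.array as da

data = np.random.random(1000000)  # stand-in for the AOI pixel values
da_data = da.from_array(data, chunks=(8000000,))
min_, max_, std, p25, p50, p75 = da.compute(
    da.min(da_data),
    da.max(da_data),
    da.std(da_data),
    da.percentile(da_data, 25),
    da.percentile(da_data, 50),
    da.percentile(da_data, 75),
)
p25, p50, p75 = p25[0], p50[0], p75[0]  # percentiles come back as (1,) arrays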
Example #26
def dasky_freedman_bin_width(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N`-th percentile of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    scotts_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
Example #27
def dasky_freedman_bin_width(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N`-th percentile of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width,
    scotts_bin_width,
    astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()

    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * 1. / (n**(1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
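
A short usage sketch of the rule, calling the function as defined above on a normally distributed dask array:

import numpy as np
import dask.array as da

data = da.from_array(np.random.RandomState(0).normal(size=10000), chunks=1000)
width, bins = dasky_freedman_bin_width(data, return_bins=True)
print(width, len(bins))  # IQR-based bin width, plus the bin edges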
Example #28
File: ds.py Project: elaeon/ML
    def stadistics(self):
        headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max", "nonzero", "nonan", "unique", "dtype"]
        self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
        table = []
        for group, (dtype, _) in self.dtypes.fields.items():
            values = dict()
            values["dtype"] = dtype
            values["group"] = group
            darray = self.data[group].da
            if dtype == np.dtype(float) or dtype == np.dtype(int):
                da_mean = da.around(darray.mean(), decimals=3)
                da_std = da.around(darray.std(), decimals=3)
                da_min = da.around(darray.min(), decimals=3)
                da_max = da.around(darray.max(), decimals=3)
                result = dask.compute([da_mean, da_std, da_min, da_max])[0]
                values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
                values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
                values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
                values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
                if len(self.shape[group]) == 1:
                    da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                    result = da_percentile.compute()
                    values["25%"] = result[0]
                    values["50%"] = result[1]
                    values["75%"] = result[2]
                else:
                    values["25%"] = "-"
                    values["50%"] = "-"
                    values["75%"] = "-"
                values["nonzero"] = da.count_nonzero(darray).compute()
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                values["unique"] = "-"
            else:
                values["mean"] = "-"
                values["std dev"] = "-"
                values["min"] = "-"
                values["max"] = "-"
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
                values["nonzero"] = "-"
                values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
                vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
                values["unique"] = vunique

            row = []
            for column in headers:
                row.append(values[column])
            table.append(row)

        print("# rows {}".format(self.shape[0]))
        return tabulate(table, headers)
Example #29
def create_hv_dataset(ddf, stats, percentile=(1, 99)):

    _idNames = ("patch", "tract", "filter")
    _kdims = ("ra", "dec", "psfMag")
    _flags = [c for c in ddf.columns if ddf[c].dtype == np.dtype("bool")]

    kdims = []
    vdims = []

    for c in ddf.columns:
        if c in _kdims or c in _idNames or c in _flags:
            if c in ("ra", "dec", "psfMag"):
                cmin, cmax = stats[c]["min"].min(), stats[c]["max"].max()
                c = hv.Dimension(c, range=(cmin, cmax))
            elif c in ("filter", "patch"):
                cvalues = list(ddf[c].unique())
                c = hv.Dimension(c, values=cvalues)
            elif ddf[c].dtype.kind == "b":
                c = hv.Dimension(c, values=[True, False])
            kdims.append(c)
        else:
            if percentile is not None:
                p0, p1 = percentile
                if f"{p0}%" in stats.index and f"{p1}%" in stats.index:
                    cmin = stats[c][f"{p0}%"].min()
                    cmax = stats[c][f"{p1}%"].max()
                else:
                    print("percentiles not found in stats, computing")
                    darray = ddf[c].values
                    cmin, cmax = da.compute(
                        da.percentile(darray, p0)[0],
                        da.percentile(darray, p1)[0])
            else:
                cmin, cmax = stats[c]["min"].min(), stats[c]["max"].max()
            c = hv.Dimension(c, range=(cmin, cmax))
            vdims.append(c)

    return hv.Dataset(ddf, kdims=kdims, vdims=vdims)
Example #30
 def gram_rbf(X, threshold=1.0):
     if type(X) == torch.Tensor:
         dot_products = X @ X.t()
         sq_norms = dot_products.diag()
         sq_distances = -2*dot_products + sq_norms[:,None] + sq_norms[None,:]
         sq_median_distance = sq_distances.median()
         return torch.exp(-sq_distances / (2*threshold**2 * sq_median_distance))
     elif type(X) == da.Array:
         dot_products = X @ X.T
         sq_norms = da.diag(dot_products)
         sq_distances = -2*dot_products + sq_norms[:,None] + sq_norms[None,:]
         sq_median_distance = da.percentile(sq_distances.ravel(), 50)
         return da.exp((-sq_distances / (2*threshold**2 * sq_median_distance)))
     else:
         raise ValueError
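
One subtlety here: da.percentile(..., 50) returns a shape-(1,) array rather than a scalar, so the division inside da.exp works by broadcasting. A tiny check with illustrative shapes:

import dask.array as da

sq_distances = da.random.random((100, 100), chunks=(50, 50))
med = da.percentile(sq_distances.ravel(), 50)
print(med.shape)                   # (1,)
print((sq_distances / med).shape)  # (100, 100), by broadcasting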
Example #31
 def _band_hist(band_data):
     cdf = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
     if approximate:
         # need a 1D array
         flat_data = band_data.ravel()
         # replace with nanpercentile in the future, if available
         # dask < 0.17 returns all NaNs for this
         bins = da.percentile(flat_data[da.notnull(flat_data)],
                              cdf * 100.)
     else:
         bins = dask.delayed(np.nanpercentile)(band_data, cdf * 100.)
         bins = da.from_delayed(bins, shape=(nwidth,), dtype=cdf.dtype)
     res = dask.delayed(np.interp)(band_data, bins, cdf)
     res = da.from_delayed(res, shape=band_data.shape,
                           dtype=band_data.dtype)
     return res
Example #32
def _freedman_bw_dask(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::

        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N`-th percentile of the data, and
    :math:`n` is the number of data points.

    """
    if not isinstance(data, da.Array):
        raise TypeError("Expected a dask array")

    if data.ndim != 1:
        data = data.flatten()

    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * n**(-1.0 / 3.0)
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = max(1, np.ceil((mx - mn) / c_dx))
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
Example #33
def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert eq(da.percentile(d, [0, 50, 100]), [1, 1, 1])

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), [0, 5, 20])
    assert same_keys(da.percentile(d, [0, 50, 100]),
                     da.percentile(d, [0, 50, 100]))
    assert not same_keys(da.percentile(d, [0, 50, 100]),
                         da.percentile(d, [0, 50]))

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert eq(da.percentile(d, [0, 50, 100]), ['a', 'd', 'e'])
Example #34
 def _band_hist(band_data):
     cdf = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
     if approximate:
         # need a 1D array
         flat_data = band_data.ravel()
         # replace with nanpercentile in the future, if available
         # dask < 0.17 returns all NaNs for this
         bins = da.percentile(flat_data[da.notnull(flat_data)],
                              cdf * 100.)
     else:
         bins = dask.delayed(np.nanpercentile)(band_data, cdf * 100.)
         bins = da.from_delayed(bins, shape=(nwidth, ), dtype=cdf.dtype)
     res = dask.delayed(np.interp)(band_data, bins, cdf)
     res = da.from_delayed(res,
                           shape=band_data.shape,
                           dtype=band_data.dtype)
     return res
Example #35
def test_percentile():
    d = da.ones((16,), chunks=(4,))
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array([1, 1, 1], dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))
    result = da.percentile(d, [0, 50, 100])
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array([0, 5, 20], dtype=result.dtype))
    assert same_keys(da.percentile(d, [0, 50, 100]),
                     da.percentile(d, [0, 50, 100]))
    assert not same_keys(da.percentile(d, [0, 50, 100]),
                         da.percentile(d, [0, 50]))

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3,))
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array(['a', 'd', 'e'], dtype=x.dtype))
Example #36
def test_percentile():
    d = da.ones((16, ), chunks=(4, ))
    assert_eq(da.percentile(d, [0, 50, 100]), np.array([1, 1, 1],
                                                       dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3, ))
    result = da.percentile(d, [0, 50, 100])
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array([0, 5, 20], dtype=result.dtype))
    assert same_keys(da.percentile(d, [0, 50, 100]),
                     da.percentile(d, [0, 50, 100]))
    assert not same_keys(da.percentile(d, [0, 50, 100]),
                         da.percentile(d, [0, 50]))

    x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
    d = da.from_array(x, chunks=(3, ))
    assert_eq(da.percentile(d, [0, 50, 100]),
              np.array(['a', 'd', 'e'], dtype=x.dtype))
Example #37
def single_channel_percentile_norm(data: da.core.Array,
                                   min_p: float = 50.0,
                                   max_p: float = 99.8,
                                   **kwargs) -> da.core.Array:
    # Enforce shape
    if len(data.shape) > 4:
        raise exceptions.InvalidShapeError(len(data.shape), 4)

    # Get the norm by values
    norm_by = da.percentile(data.flatten(), [min_p, max_p]).compute()

    # Norm
    normed = (data - norm_by[0]) / (norm_by[1] - norm_by[0])

    # Clip any values outside of 0 and 1
    clipped = da.clip(normed, 0, 1)

    # Scale them between 0 and 255
    return clipped * 255
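
A quick usage sketch; the image shape and percentile bounds are illustrative:

import dask.array as da

img = da.random.random((10, 256, 256), chunks=(1, 256, 256))
normed = single_channel_percentile_norm(img, min_p=50.0, max_p=99.8)
print(float(normed.max().compute()))  # clipped and scaled into [0, 255]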
Example #38
def test_percentile(method):
    d = da.ones((16,), chunks=(4,))
    qs = [0, 50, 100]

    assert_eq(da.percentile(d, qs, method=method),
              np.array([1, 1, 1], dtype=d.dtype))

    x = np.array([0, 0, 5, 5, 5, 5, 20, 20])
    d = da.from_array(x, chunks=(3,))

    result = da.percentile(d, qs, method=method)
    assert_eq(result,
              np.array([0, 5, 20], dtype=result.dtype))

    assert same_keys(da.percentile(d, qs, method=method),
                     da.percentile(d, qs, method=method))
    assert not same_keys(da.percentile(d, qs, method=method),
                         da.percentile(d, [0, 50], method=method))

    if method != 'tdigest':
        x = np.array(['a', 'a', 'd', 'd', 'd', 'e'])
        d = da.from_array(x, chunks=(3,))
        assert_eq(da.percentile(d, [0, 50, 100]),
                  np.array(['a', 'd', 'e'], dtype=x.dtype))
Example #39
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock,
               **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int, optional
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                np.issubdtype(x.dtype[name], np.unicode_) or
                np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #40
def from_bcolz(x,
               chunksize=None,
               categorize=True,
               index=None,
               lock=lock,
               **kwargs):
    """ Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size(rows) of blocks to pull out from ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock: bool or Lock
        Lock to use when reading or False for no lock (not-thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import dask.array as da
    import bcolz

    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_)
                    or np.issubdtype(x.dtype[name], np.unicode_)
                    or np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize,
                         categorize, index, kwargs)
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = 'from_bcolz-' + token

    dsk = dict(((new_name, i), (dataframe_from_ctable, x,
                                (slice(i * chunksize, (i + 1) * chunksize), ),
                                columns, categories, lock))
               for i in range(0, int(ceil(len(x) / chunksize))))

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
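
The index branch at the end is where da.percentile matters here: it samples evenly spaced percentiles of the index column to choose partition divisions, so each output partition holds roughly the same number of rows. The same idea in isolation, with a numpy-backed array standing in for the bcolz column:

import numpy as np
import dask.array as da

index_col = np.random.RandomState(0).randint(0, 1000000, size=100000)
a = da.from_array(index_col, chunks=10000)

npartitions = 10
q = np.linspace(0, 100, npartitions + 1)
divisions = tuple(da.percentile(a, q).compute())  # sorted partition bounds
print(divisions[0], "...", divisions[-1])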
Example #41
 def _dense_fit(self, X, random_state):
     references = self.references_ * 100
     quantiles = [da.percentile(col, references) for col in X.T]
     self.quantiles_, = compute(da.vstack(quantiles).T)
Example #42
 def _dense_fit(self, X: Union[ArrayLike, DataFrameType],
                random_state: int) -> Union[ArrayLike, DataFrameType]:
     references = self.references_ * 100
     quantiles = [da.percentile(col, references) for col in X.T]
     (self.quantiles_, ) = compute(da.vstack(quantiles).T)
     return None
Example #43
def test_percentiles_with_empty_arrays(method):
    x = da.ones(10, chunks=((5, 0, 5),))
    assert_eq(da.percentile(x, [10, 50, 90], method=method), np.array([1, 1, 1], dtype=x.dtype))
Example #44
def test_percentiles_with_scaler_percentile(method, q):
    # Regression test to ensure da.percentile works with scalar percentiles
    # See #3020
    d = da.ones((16,), chunks=(4,))
    assert_eq(da.percentile(d, q, method=method), np.array([1], dtype=d.dtype))
Example #45
def test_percentile_tokenize():
    d = da.from_array(cupy.ones((16,)), chunks=(4,))
    qs = np.array([0, 50, 100])

    assert same_keys(da.percentile(d, qs), da.percentile(d, qs))
Example #46
def test_percentiles_with_empty_arrays():
    x = da.ones(10, blockdims=((5, 0, 5),))
    assert da.percentile(x, [10, 50, 90]).compute().tolist() == [1, 1, 1]