Example #1
import numpy as np
import dask.array as da


def weight_block(block, blocksize, block_info=None):
    """Weight a block for blending overlapping chunks.

    Interior faces taper linearly to zero over the overlap region
    (1/8 of the block size per face), while faces on the outer boundary
    of the block grid keep full weight. A trailing singleton axis is
    added so the weights broadcast against vector-valued data.
    """

    # compute fixed overlap size
    overlaps = np.array([int(round(x / 8)) for x in blocksize])

    # determine which faces need linear weighting
    core_shape = []
    pads = []
    block_index = block_info[0]['chunk-location']
    block_grid = block_info[0]['num-chunks']
    for i in range(3):
        p, bl = overlaps[i], blocksize[i]
        bi, bg = block_index[i], block_grid[i]
        pad, core = [2 * p + 1, 2 * p + 1], bl - 2 * p
        if bi == 0:
            pad[0], core = 0, core + 2 * p + 1
        if bi == bg - 1:
            pad[1], core = 0, core + 2 * p + 1
        pads.append(tuple(pad))
        core_shape.append(core)

    # create weights
    weights = da.ones(core_shape, dtype=np.float32)
    weights = da.pad(weights, pads, mode='linear_ramp', end_values=0)
    weights = weights[1:-1, 1:-1, 1:-1]
    weights = weights.reshape(weights.shape + (1, ))

    # multiply data by weights and return
    return da.multiply(block, weights)
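
This function is written to receive block_info from da.map_blocks. Outside that pipeline it can be exercised directly by faking the block_info dict for a single interior block of an overlapped decomposition (shapes below are illustrative; the block must carry the 2*overlap halo and a trailing vector axis):

import numpy as np
import dask.array as da

blocksize = (64, 64, 64)
p = int(round(64 / 8))                               # overlap per face = 8
block = da.ones((64 + 2 * p,) * 3 + (3,))            # one overlapped block of a 3-vector field
info = {0: {'chunk-location': (1, 1, 1), 'num-chunks': (4, 4, 4)}}
weighted = weight_block(block, blocksize, block_info=info)   # shape (80, 80, 80, 3)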
Example #2
import dask.array as da

# bw_mean, bw_std and repeat_block are blockwise helpers defined in the same module


def bw_corrcoef(image1, image2, block_shape, keep_shape=False):
    """
    Blockwise Pearson correlation coefficient.
    """
    # blockwise zero-mean
    image1_zm = image1 - bw_mean(image1, block_shape, keep_shape=True)
    image2_zm = image2 - bw_mean(image2, block_shape, keep_shape=True)
    # follow Pearson correlation coefficient formula
    numerator = bw_mean(da.multiply(image1_zm, image2_zm), block_shape)
    image1_std = bw_std(image1, block_shape)
    image2_std = bw_std(image2, block_shape)
    denominator = da.multiply(image1_std, image2_std)
    bwcc = da.divide(numerator, denominator)

    if keep_shape:
        bwcc = repeat_block(bwcc, block_shape)

    return bwcc
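
As a rough sketch of what bw_mean could look like (hypothetical stand-in, assuming non-overlapping blocks that tile the image exactly), da.coarsen reduces each block to a single value:

import numpy as np
import dask.array as da

def bw_mean_sketch(image, block_shape, keep_shape=False):
    # mean of every non-overlapping block_shape tile
    out = da.coarsen(np.mean, image, {i: b for i, b in enumerate(block_shape)})
    if keep_shape:
        # repeat each block mean back up to the original resolution
        for i, b in enumerate(block_shape):
            out = da.repeat(out, b, axis=i)
    return out

Passing keep_shape=True is what lets the per-block means be subtracted from the full-resolution images before the numerator is formed.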
Example #3
import numpy as np
import dask.array as da


def weight_block(block, blocksize):
    """Simpler variant of the block weighting above: every face is tapered
    linearly to zero over the overlap region (1/8 of the block size per
    face), regardless of the block's position in the block grid.
    """

    overlaps = np.array([int(round(x/8)) for x in blocksize])
    weights = da.ones(blocksize - 2*overlaps, dtype=np.float32)
    pads = [(2*p, 2*p) for p in overlaps]
    weights = da.pad(weights, pads, mode='linear_ramp', end_values=0)
    weights = weights.reshape(weights.shape + (1,))
    return da.multiply(block, weights)
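
To see what mode='linear_ramp' does here, a tiny 1-D sketch (the comment shows the computed output):

import numpy as np
import dask.array as da

w = da.ones(4, dtype=np.float32)
w = da.pad(w, (2, 2), mode='linear_ramp', end_values=0)
w.compute()   # [0. , 0.5, 1. , 1. , 1. , 1. , 0.5, 0. ]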
Example #4
import dask.array as da
import dask.array.fft as dff


def apply_filter_vector_dask_true(_filter, arr, chunk=5):
    # pad_next_square_size and reverse_padding are helpers from the same module
    out = arr.copy()
    tc, slc = pad_next_square_size(out)
    # chunk only along the last (stack) axis; keep each 2-D frame in one chunk
    tc = da.from_array(tc, (-1, -1, chunk))
    _filter = da.from_array(_filter)
    # filter in the frequency domain: multiply the spectrum of every frame by
    # the complement of the (shifted) filter mask, then transform back
    temp = dff.ifft2(
        da.multiply(dff.ifftshift(1 - _filter[:, :, None]),
                    dff.fft2(tc, axes=(0, 1))),
        axes=(0, 1),
    ).real
    return reverse_padding(arr, temp, slc)
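
The same frequency-domain pattern, stripped of the padding helpers, can be sketched on an already-square stack (the Gaussian low-pass mask and all names below are illustrative only):

import numpy as np
import dask.array as da
import dask.array.fft as dff

n = 256
stack = da.random.random((n, n, 10), chunks=(n, n, 2))          # ten square frames

# hypothetical centered Gaussian low-pass mask
y, x = np.mgrid[-n // 2:n // 2, -n // 2:n // 2]
lowpass = np.exp(-(x ** 2 + y ** 2) / (2 * 30.0 ** 2)).astype(np.float32)

spectrum = dff.fft2(stack, axes=(0, 1))
filtered = dff.ifft2(np.fft.ifftshift(lowpass)[:, :, None] * spectrum,
                     axes=(0, 1)).real
result = filtered.compute()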
Example #5
import dask.array as da


def _remove_bad_pixels(dask_array, bad_pixel_array):
    """Replace values in bad pixels with the mean of their neighbors.

    Parameters
    ----------
    dask_array : Dask array
        Must be at least two dimensions
    bad_pixel_array : array-like
        Must either have the same shape as dask_array,
        or the same shape as the two last dimensions of dask_array.

    Returns
    -------
    data_output : Dask array

    Examples
    --------
    >>> import pyxem as pxm
    >>> import pyxem.utils.dask_tools as dt
    >>> s = pxm.dummy_data.dummy_data.get_dead_pixel_signal(lazy=True)
    >>> dead_pixels = dt._find_dead_pixels(s.data)
    >>> data_output = dt._remove_bad_pixels(s.data, dead_pixels)

    """
    if len(dask_array.shape) < 2:
        raise ValueError(
            "dask_array must have at least 2 dimensions, got shape {0}".format(
                dask_array.shape))
    if bad_pixel_array.shape == dask_array.shape:
        pass
    elif bad_pixel_array.shape == dask_array.shape[-2:]:
        temp_array = da.zeros_like(dask_array)
        bad_pixel_array = da.add(temp_array, bad_pixel_array)
    else:
        raise ValueError(
            "bad_pixel_array {0} must either be 2-D with the same shape "
            "as the two last dimensions of dask_array {1}, or have the "
            "same shape as dask_array {2}".format(bad_pixel_array.shape,
                                                  dask_array.shape[-2:],
                                                  dask_array.shape))
    # neighbor average: shift the array by one pixel along each detector axis
    dif0 = da.roll(dask_array, shift=1, axis=-2)
    dif1 = da.roll(dask_array, shift=-1, axis=-2)
    dif2 = da.roll(dask_array, shift=1, axis=-1)
    dif3 = da.roll(dask_array, shift=-1, axis=-1)

    dif = (dif0 + dif1 + dif2 + dif3) / 4
    # keep the neighbor average only at the bad pixels
    dif = dif * bad_pixel_array

    # zero out the bad pixels, then add the neighbor averages back in
    data_output = da.multiply(dask_array, da.logical_not(bad_pixel_array))
    data_output = data_output + dif

    return data_output
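
A minimal way to exercise the function without pyxem is to build the bad-pixel mask by hand (the mask below is hypothetical, standing in for the output of _find_dead_pixels):

import numpy as np
import dask.array as da

data = da.random.random((4, 4, 32, 32), chunks=(2, 2, 32, 32))
bad = np.zeros((32, 32), dtype=bool)
bad[10, 17] = True                      # mark one detector pixel as bad
fixed = _remove_bad_pixels(data, bad).compute()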
Example #6
import numpy as np
import dask.array as da
import dask_distance


def matern32(coords, lambda0):
    """ Matern 3/2 covariance kernel.

    Parameters
    ----------
    coords: (n_pts, n_dims) dask.array or Future
        Point coordinates.
    lambda0: float
        Length scale of the kernel.

    Returns
    -------
    covs: (n_pts, n_pts) delayed dask.array
        Pairwise covariance matrix.

    """
    dists = dask_distance.euclidean(coords)
    res = da.multiply(
            1 + (np.sqrt(3) / lambda0) * dists,
            da.exp(-(np.sqrt(3) / lambda0) * dists))
    return res
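
The kernel being evaluated is k(r) = (1 + sqrt(3) * r / lambda0) * exp(-sqrt(3) * r / lambda0) for pairwise distances r. If dask_distance is not available, the same matrix can be sketched with plain dask broadcasting for moderate n_pts (shapes below are illustrative):

import numpy as np
import dask.array as da

coords = da.random.random((500, 3), chunks=(250, 3))
lambda0 = 0.5

# pairwise Euclidean distances via broadcasting: (n, 1, d) - (1, n, d)
diffs = coords[:, None, :] - coords[None, :, :]
dists = da.sqrt((diffs ** 2).sum(axis=-1))

covs = (1 + (np.sqrt(3) / lambda0) * dists) * da.exp(-(np.sqrt(3) / lambda0) * dists)
covs.compute()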
Example #7
    def _scale_x(self, x, sym: bool = False) -> da.core.Array:
        """ Scales the product of a matrix multiplication instead of the matrix itself

        Let A be a matrix of shape (n, p) with non-zero column standard
        deviations, stored in D of shape (p,).

        A column-standardized matrix B could be constructed as
            B = A*Inv(Diag(D))
        However, this is inefficient if only the matrix product of B with a
        matrix x is needed. Instead `_scale_x` implements:
            Ax*Inv(Diag(D))
            ^^
            x being passed in already computed as Ax
        with efficient broadcasting.

        Parameters
        ----------
        x : array_like
            Usually the product Ax that needs to be scaled
        sym : bool
            Flag for scaling twice, as in the case of AA'x, where the square
            of the column standard deviations must be removed

        Returns
        -------
        x_scaled : array_like

        """
        try:
            # self._array_moment.vector_width is not set until fit_x has been
            # called on the ScaledCenterArray.
            if x.ndim == 2 and self._array_moment.vector_width == x.shape[1]:
                scale_matrix = (self._array_moment.sym_scale_matrix
                                if sym else self._array_moment.scale_matrix)
                return da.multiply(scale_matrix, x)
        except ValueError:
            pass
        scale_vector = self._array_moment.sym_scale_vector if sym else self._array_moment.scale_vector
        x_scaled = diag_dot(scale_vector, x, return_diag=False)
        return x_scaled
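
The underlying broadcasting idea is that multiplying on the right by Inv(Diag(D)) is the same as scaling each column elementwise; a tiny check with plain dask (diag_dot is a module helper, not shown):

import numpy as np
import dask.array as da

ax = da.random.random((6, 3), chunks=(3, 3))      # stands in for the product Ax
d = np.array([2.0, 4.0, 8.0])                     # column standard deviations D

via_matmul = ax.dot(np.diag(1.0 / d))             # Ax @ Inv(Diag(D))
via_broadcast = ax * (1.0 / d)                    # same values, no (p, p) matrix built

assert np.allclose(via_matmul.compute(), via_broadcast.compute())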
Example #8
File: spectral.py  Project: akoury/dask-ml
    def fit(self, X, y=None):
        X = self._check_array(X)
        n_components = self.n_components
        metric = self.affinity
        rng = check_random_state(self.random_state)
        n_clusters = self.n_clusters

        # kmeans for final clustering
        if isinstance(self.assign_labels, six.string_types):
            if self.assign_labels == "kmeans":
                km = KMeans(
                    n_clusters=n_clusters,
                    random_state=draw_seed(rng,
                                           np.iinfo("i4").max,
                                           dtype="uint"),
                )
            elif self.assign_labels == "sklearn-kmeans":
                km = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                            random_state=rng)
            else:
                msg = "Unknown 'assign_labels' {!r}".format(self.assign_labels)
                raise ValueError(msg)
        elif isinstance(self.assign_labels, BaseEstimator):
            km = self.assign_labels
        else:
            raise TypeError("Invalid type {} for 'assign_labels'".format(
                type(self.assign_labels)))

        if self.kmeans_params:
            km.set_params(**self.kmeans_params)

        n = len(X)
        if n <= n_components:
            msg = ("'n_components' must be smaller than the number of samples."
                   " Got {} components and {} samples".format(n_components, n))
            raise ValueError(msg)

        params = self.kernel_params or {}
        params["gamma"] = self.gamma
        params["degree"] = self.degree
        params["coef0"] = self.coef0

        # indices for our exact / approximate blocks
        inds = np.arange(n)
        keep = rng.choice(inds, n_components, replace=False)
        keep.sort()
        rest = ~np.isin(inds, keep)

        # compute the exact blocks
        # these are done in parallel for dask arrays
        if isinstance(X, da.Array):
            X_keep = X[keep].rechunk(X.shape).persist()
        else:
            X_keep = X[keep]

        X_rest = X[rest]

        A, B = embed(X_keep, X_rest, n_components, metric, params)
        _log_array(logger, A, "A")
        _log_array(logger, B, "B")

        # now the approximation of C
        a = A.sum(0)  # (l,)
        b1 = B.sum(1)  # (l,)
        b2 = B.sum(0)  # (m,)

        # TODO: I think we have some unnecessary delayed wrapping of A here.
        A_inv = da.from_delayed(delayed(pinv)(A), A.shape, A.dtype)

        inner = A_inv.dot(b1)
        d1_si = 1 / da.sqrt(a + b1)

        d2_si = 1 / da.sqrt(b2 + B.T.dot(inner))  # (m,), dask array

        # d1, d2 are diagonal, so we can avoid large matrix multiplies
        # Equivalent to diag(d1_si) @ A @ diag(d1_si)
        A2 = d1_si.reshape(-1, 1) * A * d1_si.reshape(1, -1)  # (n, n)
        _log_array(logger, A2, "A2")
        # A2 = A2.rechunk(A2.shape)
        # Equivalent to diag(d1_si) @ B @ diag(d2_si)
        B2 = da.multiply(da.multiply(d1_si.reshape(-1, 1), B),
                         d2_si.reshape(1, -1))
        _log_array(logger, B2, "B2")

        U_A, S_A, V_A = delayed(svd, pure=True, nout=3)(A2)

        U_A = da.from_delayed(U_A, (n_components, n_components), A2.dtype)
        S_A = da.from_delayed(S_A, (n_components, ), A2.dtype)
        V_A = da.from_delayed(V_A, (n_components, n_components), A2.dtype)

        # Eq 16. This is OK when V2 is orthogonal
        V2 = da.sqrt(float(n_components) / n) * da.vstack(
            [A2, B2.T]).dot(U_A[:, :n_clusters]).dot(
                da.diag(1.0 / da.sqrt(S_A[:n_clusters])))  # (n, k)
        _log_array(logger, V2, "V2.1")

        if isinstance(B2, da.Array):
            V2 = V2.rechunk((B2.chunks[1][0], n_clusters))
            _log_array(logger, V2, "V2.2")

        # normalize (Eq. 4)
        U2 = (V2.T / da.sqrt((V2**2).sum(1))).T  # (n, k)

        _log_array(logger, U2, "U2.2")

        # Recover original indices
        U2 = _slice_mostly_sorted(U2, keep, rest, inds)  # (n, k)

        _log_array(logger, U2, "U2.3")

        if self.persist_embedding and isinstance(U2, da.Array):
            logger.info("Persisting array for k-means")
            U2 = U2.persist()
        elif isinstance(U2, da.Array):
            logger.info(
                "Consider persist_embedding. This will require %s",
                _format_bytes(U2.nbytes),
            )
            pass
        logger.info("k-means for assign_labels[starting]")
        km.fit(U2)
        logger.info("k-means for assign_labels[finished]")

        # Now... what to keep?
        self.assign_labels_ = km
        self.labels_ = km.labels_
        self.eigenvalues_ = S_A[:n_clusters]  # TODO: better name
        return self
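
In dask-ml this fit is reached through the public estimator; a minimal usage sketch (argument values are illustrative, and labels_ is a dask array when the default "kmeans" assigner is used):

import dask.array as da
from dask_ml.cluster import SpectralClustering

X = da.random.random((1000, 4), chunks=(250, 4))
sc = SpectralClustering(n_clusters=3, n_components=100, random_state=0)
sc.fit(X)
labels = sc.labels_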
Example #9
import numpy as np
import dask.array as da


def _center_of_mass_array(dask_array, threshold_value=None, mask_array=None):
    """Find center of mass of last two dimensions for a dask array.

    The center of mass can be calculated using a mask and threshold.

    Parameters
    ----------
    dask_array : Dask array
        Must have either 2, 3 or 4 dimensions.
    threshold_value : scalar, optional
    mask_array : NumPy array, optional
        Array with bool values. The True values will be masked
        (i.e. ignored). Must have the same shape as the two
        last dimensions in dask_array.

    Returns
    -------
    center_of_mass_dask_array : Dask array

    Examples
    --------
    >>> import numpy as np
    >>> import dask.array as da
    >>> import pyxem.utils.dask_tools as dt
    >>> data = da.random.random(
    ...     size=(64, 64, 128, 128), chunks=(16, 16, 128, 128))
    >>> output_dask = dt._center_of_mass_array(data)
    >>> output = output_dask.compute()

    Masking everything except the center of the image

    >>> mask_array = np.ones(shape=(128, 128), dtype=bool)
    >>> mask_array[64-10:64+10, 64-10:64+10] = False
    >>> output_dask = dt._center_of_mass_array(data, mask_array=mask_array)
    >>> output = output_dask.compute()

    Masking and thresholding

    >>> output_dask = dt._center_of_mass_array(
    ...     data, mask_array=mask_array, threshold_value=3)
    >>> output = output_dask.compute()

    """
    det_shape = dask_array.shape[-2:]
    y_grad, x_grad = np.mgrid[0:det_shape[0], 0:det_shape[1]]
    y_grad, x_grad = y_grad.astype(np.float64), x_grad.astype(np.float64)
    sum_array = np.ones_like(x_grad)

    if mask_array is not None:
        if not mask_array.shape == det_shape:
            raise ValueError(
                "mask_array ({0}) must have same shape as last two "
                "dimensions of the dask_array ({1})".format(
                    mask_array.shape, det_shape))
        x_grad = x_grad * np.invert(mask_array)
        y_grad = y_grad * np.invert(mask_array)
        sum_array = sum_array * np.invert(mask_array)
    if threshold_value is not None:
        dask_array = _threshold_array(dask_array,
                                      threshold_value=threshold_value,
                                      mask_array=mask_array)

    x_shift = da.multiply(dask_array, x_grad, dtype=np.float64)
    y_shift = da.multiply(dask_array, y_grad, dtype=np.float64)
    sum_array = da.multiply(dask_array, sum_array, dtype=np.float64)

    x_shift = np.sum(x_shift, axis=(-2, -1), dtype=np.float64)
    y_shift = np.sum(y_shift, axis=(-2, -1), dtype=np.float64)
    sum_array = np.sum(sum_array, axis=(-2, -1), dtype=np.float64)

    beam_shifts = da.stack((x_shift, y_shift))
    beam_shifts = da.divide(beam_shifts[:], sum_array, dtype=np.float64)
    return beam_shifts
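
The weighted-mean arithmetic behind the x_shift / y_shift lines, written out for a single frame with plain NumPy:

import numpy as np

frame = np.zeros((8, 8))
frame[2, 5] = 1.0                               # a single bright pixel
y_grad, x_grad = np.mgrid[0:8, 0:8]

com_x = (frame * x_grad).sum() / frame.sum()    # 5.0
com_y = (frame * y_grad).sum() / frame.sum()    # 2.0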
Example #10
def test(
    size_per_proc=1000,
    num_procs=1,
    num_runs=1,
    ty="int64",
    key_length=10,
    scale_lhs_only=False,
    package="legate",
):
    if package == "legate":
        from legate import numpy as np, pandas as pd
        from legate.numpy.random import randn

    elif package == "cudf":
        import cudf as pd
        import cupy as np
        from cupy.random import randn

    elif package == "pandas":
        import numpy as np
        import pandas as pd
        from numpy.random import randn

    elif package == "dask" or package == "daskcudf":
        import dask.array as da
        import dask.dataframe as df
        import numpy as np

        if package == "daskcudf":
            import cudf

    else:
        print("Unknown dataframe package: %s" % package)
        assert False

    if package == "legate":
        from legate.timing import time

        def block(*args):
            pass

        def get_timestamp():
            return time()

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) / 1000.0

    elif package == "dask" or package == "daskcudf":
        import time

        def block(*args):
            for arg in args:
                arg.compute()

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    else:
        import time

        def block(*args):
            pass

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    if scale_lhs_only:
        size = size_per_proc * num_procs
        size_rhs = size // 3

        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn so use arange
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(
                size_rhs,
                dtype=np.float64,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
        else:
            c1 = randn(size)
            c2 = randn(size_rhs)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = (
                da.arange(size, dtype=key_dtype, chunks=size_per_proc)
                % size_per_proc
            )
            key_right = da.arange(
                size_rhs,
                dtype=key_dtype,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
            da.multiply(key_right, 3, out=key_right)
        else:
            key_left = np.arange(size, dtype=key_dtype) % size_per_proc
            key_right = np.arange(size_rhs, dtype=key_dtype)
            np.multiply(key_right, 3, out=key_right)

    else:
        size = size_per_proc * num_procs
        size_rhs = size

        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn so use arange
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
        else:
            c1 = randn(size)
            c2 = randn(size)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
            key_right = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
        else:
            key_left = np.arange(size, dtype=key_dtype)
            key_right = np.arange(size, dtype=key_dtype)
        # np.floor_divide(key_right, 3, out=key_right)
        # np.multiply(key_right, 3, out=key_right)

    if package == "dask" or package == "daskcudf":
        df1 = df.multi.concat(
            [df.from_dask_array(a) for a in [c1, key_left]], axis=1
        )
        df1.columns = ["c1", "key"]
        df2 = df.multi.concat(
            [df.from_dask_array(a) for a in [c2, key_right]], axis=1
        )
        df2.columns = ["c2", "key"]
        if package == "daskcudf":
            df1 = df1.map_partitions(cudf.from_pandas)
            df2 = df2.map_partitions(cudf.from_pandas)
    else:
        df1 = pd.DataFrame({"c1": c1, "key": key_left})
        df2 = pd.DataFrame({"c2": c2, "key": key_right})
    df2["key"] = df2["key"] // 3 * 3

    if ty == "string":
        df1["key"] = (
            df1["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )
        df2["key"] = (
            df2["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )

    print(
        "Type: inner, Size: %u x %u, Key dtype: %s"
        % (size, size_rhs, str(key_dtype))
    )

    block(df1, df2)

    for i in range(num_runs):
        start_ts = get_timestamp()

        df_result = df1.merge(df2, on="key")

        block(df_result)

        stop_ts = get_timestamp()

        print(
            "[Run %d] Elapsed time: %lf ms"
            % (i + 1, compute_elapsed_time(start_ts, stop_ts))
        )

        del df_result
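
A minimal invocation of the benchmark with the dask backend might look like this (argument values are illustrative):

test(size_per_proc=100_000, num_procs=1, num_runs=3, package="dask")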