Example #1
def show_images():
    # assumes module-level `plt` (matplotlib.pyplot), `da` (dask.array) and
    # the `sky`, `psf` and `dirty` arrays defined elsewhere in the module
    plt.figure(1)
    plt.clf()
    plt.imshow(sky, vmin=da.min(sky), vmax=da.max(sky))
    plt.title('sky')
    plt.show(block=False)

    plt.figure(2)
    plt.clf()
    plt.imshow(psf, vmin=da.min(psf), vmax=da.max(psf))
    plt.title('psf')
    plt.show(block=False)

    plt.figure(3)
    plt.clf()
    plt.imshow(dirty, vmin=da.min(dirty), vmax=da.max(dirty))
    plt.title('dirty')
    plt.show(block=False)
    while (plt.fignum_exists(1) and plt.fignum_exists(2)
           and plt.fignum_exists(3)):
        try:
            plt.pause(10000000)
            plt.close("all")
        except Exception:
            break
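A note on the pattern above: `da.min(sky)` and `da.max(sky)` are lazy dask scalars, so matplotlib ends up triggering a separate computation for each. A minimal sketch (with a synthetic stand-in for `sky`) that evaluates both bounds explicitly before plotting:

import dask
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np

sky = da.from_array(np.random.rand(64, 64), chunks=32)  # stand-in image
vmin, vmax = dask.compute(da.min(sky), da.max(sky))     # both bounds evaluated together
plt.imshow(sky.compute(), vmin=vmin, vmax=vmax)
plt.title('sky')
plt.show(block=False)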
Example #2
        def load_data(statistic, axis):
            import dask.array as da
            import numpy as np
            from glue.utils import view_shape
            x = da.from_zarr('/mnt/cephfs/zarr_data_full')
            f = 1500
            scale = 2

            lh = []
            for k in range(scale):
                lc = []
                for i in range(scale):
                    lr = []
                    for j in range(scale):
                        lr.append(x[f % 3500])
                        f = f + 1
                    lc.append(da.concatenate(lr))
                lh.append(da.concatenate(lc, 1))
            z = da.concatenate(lh, 2)

            if statistic == 'minimum':
                return da.min(z, axis).compute()
            elif statistic == 'maximum':
                return da.max(z, axis).compute()
            elif statistic == 'mean' or statistic == 'median':
                # median is approximated by the mean here
                return da.mean(z, axis).compute()
            elif statistic == 'percentile':
                # `percentile` is assumed to be defined in the enclosing scope
                return percentile / 100
            elif statistic == 'sum':
                return da.sum(z, axis).compute()
            return 0
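The nested loops above stitch a `scale` x `scale` mosaic out of individual zarr slices with repeated `da.concatenate` calls. A hedged, simplified sketch (2-D tiles rather than the 3-D slices used above) of the same idea using `da.block`:

import dask.array as da
import numpy as np

# four 2x2 tiles arranged in a 2x2 grid -> one 4x4 dask array
tiles = [[da.from_array(np.full((2, 2), 10 * i + j)) for j in range(2)]
         for i in range(2)]
mosaic = da.block(tiles)
print(da.min(mosaic).compute())  # 0, evaluated lazily on demand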
Example #3
def extract(self):

        df_path = pd.read_csv('path_to_file.csv', sep=';')

        df_path = df_path.rename(columns={'Unnamed: 0': 'id'})
        df_path = df_path.set_index('id')

        print(df_path)

        ds_batch = xr.open_mfdataset(df_path['path'],
                                     parallel=True)  #loading ncdf files

        print(ds_batch)

        print("--- Total size (GB):")
        print(ds_batch.nbytes * (2**-30))  # get size of the dataset in GB

        #getting average albedos over whole time period (used for maps and scatter plots)
        darr = ds_batch['QFLAG']  #getting data for specific band
        print(darr)

        #res = darr.mean(['lon','lat'])
        #res = da.count_nonzero( da.bitwise_and(darr//2**5, 1), ['lon','lat'])
        #res = (darr==32).sum(['lon','lat'])
        #res = xr.ufunc.bitwise_and(darr, 0b100000).sum(['lon','lat'])
        # note: the second lambda overrides the first, so only the plain bit mask is applied
        func = lambda x: np.bitwise_and(np.right_shift(x, 5), np.uint64(1))
        func = lambda x: np.bitwise_and(x, np.uint64(1))
        res = xr.apply_ufunc(func,
                             darr,
                             input_core_dims=[['lon', 'lat']],
                             dask='parallelized',
                             vectorize=True)
        #res = bitwise_and(np.right_shift(darr, 5), 1).sum(['lon','lat'])
        #res = (darr==32).max(['lon','lat'])
        print(np.array(res))

        sys.exit()  # stop here while debugging; everything below is unreachable

        da_count = ((darr >> 5) & 1)  # per-pixel flag for QFLAG bit 5
        #da_mean_lowres = da_mean.sel(lat=slice(70, 30)).sel(lon=slice(-25, 70)) # this can be used to zoom in over Europe
        # `da_mean` (the time-averaged field) is assumed to be computed earlier
        da_mean_lowres = da_mean.isel(lat=slice(None, None, 10)).isel(
            lon=slice(None, None, 10))  # downsampling for faster plotting

        # getting average, min and max albedos for each time step (used to plot timeline)
        da_timeline_mean = darr.mean(['lon', 'lat'])
        da_timeline_max = darr.max(['lon', 'lat'])
        da_timeline_min = darr.min(['lon', 'lat'])

        # closing arrays to free memory
        ds_batch.close()
        darr.close()
        da_mean.close()

        return da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min

        da_mean_lowres.close()
        da_timeline_mean.close()
        da_timeline_max.close()
        da_timeline_min.close()
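The `apply_ufunc` call above extracts a single bit from the QFLAG band. For reference, the same bit test can be written directly with dask array operators; this is an illustrative sketch on a tiny synthetic flag array, not the dataset used above:

import dask.array as da
import numpy as np

qflag = da.from_array(np.array([[32, 0], [40, 7]], dtype=np.uint8), chunks=2)
bit5 = (qflag >> 5) & 1      # 1 where bit 5 of the flag is set
print(bit5.sum().compute())  # 2 pixels have the flag set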
Example #4
def plot_subfigure(X, Y, subplot, transform):
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError("transform must be 'pca' or 'cca'")

    min_x = da.min(X[:, 0])
    max_x = da.max(X[:, 0])

    min_y = da.min(X[:, 1])
    max_y = da.max(X[:, 1])

    classif = OneVsRestClassifier(LogisticRegression())
    classif.fit(X, Y)
    y_pred = classif.predict(X)

    print('{} + OneVsRestClassifier + LogisticRegression accuracy_score {}'.
          format(transform, accuracy_score(Y, y_pred)))

    plt.subplot(1, 2, subplot)
    plt.scatter(X[:, 0], X[:, 1], s=15, c='gray', edgecolors=(0, 0, 0))

    for i in da.unique(Y.argmax(axis=1)):
        class_ = da.where(Y[:, i])
        plt.scatter(X[class_, 0],
                    X[class_, 1],
                    s=25,
                    linewidths=2,
                    label='Class {}'.format(str(i)))

    for i in range(len(classif.estimators_)):
        plot_hyperplane(classif.estimators_[i], min_x, max_x, 'k--',
                        'Boundary\nfor class {}'.format(str(i)))

    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .1 * max_x, max_x + .1 * max_x)
    plt.ylim(min_y - .1 * max_y, max_y + .1 * max_y)
Example #5
def add_data(workspace: String, dataset: String):
    import dask.array as da
    from survos2.improc.utils import optimal_chunksize
    ws = get(workspace)
    with dataset_from_uri(dataset, mode='r') as data:
        chunk_size = optimal_chunksize(data, Config['computing.chunk_size'])
        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
    logger.info(type(ds))
    return ds
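`optimal_chunksize` and `Config` come from survos2 and are not shown here. A generic stand-in for the same normalization, letting dask choose the chunking; note that after subtracting the minimum, dividing by the new maximum is equivalent to dividing by the original peak-to-peak range:

import dask.array as da
import numpy as np

data = da.from_array(np.random.rand(64, 64, 64), chunks="auto")
data = data - da.min(data)
data = data / da.max(data)  # the max after the shift equals the original range
print(float(data.min().compute()), float(data.max().compute()))  # 0.0 1.0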
Example #6
def show_images():

    plt.figure(1)
    plt.clf()
    plt.imshow(quad, vmin=da.min(quad), vmax=da.max(quad))
    plt.title('quad')
    plt.show(block=False)
    while plt.fignum_exists(1):
        try:
            plt.pause(100000)
            plt.close("all")
        except Exception:
            break
Example #7
def show_results():

    plt.figure(1)
    plt.clf()
    plt.imshow(hub, vmin=da.min(hub), vmax=da.max(hub))
    plt.title('huber')
    plt.show(block=False)

    while plt.fignum_exists(1):
        try:
            plt.pause(10000000)
            plt.close("all")
        except Exception:
            break
Example #8
def show_images():

    for i in range(len(dirty)):
        plt.figure(i+1)
        plt.clf()
        plt.imshow(quad[i], vmin=da.min(quad[i]), vmax=da.max(quad[i]))
        plt.title('quad' + str(i))
    plt.show(block=False)
    while plt.fignum_exists(1):
        try:
            plt.pause(100000)
            plt.close("all")
        except Exception:
            break
Example #9
def _perlin_dask_numpy(data: da.Array, freq: tuple, seed: int) -> da.Array:
    np.random.seed(seed)
    p = np.random.permutation(2**20)
    p = np.append(p, p)

    height, width = data.shape
    linx = da.linspace(0, freq[0], width, endpoint=False, dtype=np.float32)
    liny = da.linspace(0, freq[1], height, endpoint=False, dtype=np.float32)
    x, y = da.meshgrid(linx, liny)

    _func = partial(_perlin, p)
    data = da.map_blocks(_func, x, y, meta=np.array((), dtype=np.float32))

    data = (data - da.min(data)) / da.ptp(data)
    return data
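`_perlin` belongs to the surrounding module; here is a small sketch of the same `map_blocks` pattern with a placeholder block function, where `meta` tells dask the output dtype without calling the function on a dummy block:

from functools import partial

import dask.array as da
import numpy as np

def _noise(p, x, y):  # placeholder for _perlin
    return ((x + y) * p).astype(np.float32)

x, y = da.meshgrid(da.linspace(0, 1, 4), da.linspace(0, 1, 3))
out = da.map_blocks(partial(_noise, 2.0), x, y,
                    meta=np.array((), dtype=np.float32))
out = (out - da.min(out)) / (da.max(out) - da.min(out))  # rescale to [0, 1]
print(out.compute().round(2))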
Example #10
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2, ))  # older dask API spelling of chunks=(2,)

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
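The `eq` helper is not part of the snippet. A plausible stand-in (an assumption, not dask's actual test utility) computes whichever side is lazy and compares numerically:

import numpy as np

def eq(a, b):
    a = a.compute() if hasattr(a, "compute") else a
    b = b.compute() if hasattr(b, "compute") else b
    return np.allclose(a, b, equal_nan=True)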
Example #11
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
Example #12
def add_data(workspace: String, data_fname: String):
    import dask.array as da

    from survos2.improc.utils import optimal_chunksize

    ws = get(workspace)
    logger.info(f"Adding data to workspace {ws}")

    with dataset_from_uri(data_fname, mode="r") as data:

        chunk_size = optimal_chunksize(data, Config["computing.chunk_size"])
        logger.debug(
            f'Calculating optimal chunk size using chunk_size {Config["computing.chunk_size"]}: {chunk_size}'
        )

        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
        # ds.set_attr("chunk_size", chunk_size)
    return ds
Example #13
def statistics(self, data, pca_stats=None):
    # set headers
    if pca_stats:  # for pca
        if pca_stats["eigenvals"] is not None:
            self.stats_header.setText("Eigenvalue: {} ({}%)".format(
                round(pca_stats["eigenvals"][self.pc_id - 1], 2),
                round(pca_stats["eigenvals_%"][self.pc_id - 1], 2)))
            self.stats_header.setToolTip(
                "It shows the dispersion of the data with respect to its component"
            )
        else:
            self.stats_header.setText("Eigenvalue: --")
            self.stats_header.setToolTip(
                "It is only available when the components are computed with the plugin"
            )
    else:  # for aoi
        self.stats_header.setText("Pixels in AOI: {}".format(
            round(data.size if data.size > 1 else 0, 2)))
        self.stats_header.setToolTip("")
    # restore or compute the statistics
    if (self.QCBox_StatsLayer.currentText() == self.pc_name
            and self.stats_pc is not None):
        min, max, std, p25, p50, p75 = self.stats_pc
    else:
        da_data = da.from_array(data, chunks=(8000000, ))
        min = da.min(da_data).compute()
        max = da.max(da_data).compute()
        std = da.std(da_data).compute()
        p25 = da.percentile(da_data, 25).compute()[0]
        p50 = da.percentile(da_data, 50).compute()[0]
        p75 = da.percentile(da_data, 75).compute()[0]
        if self.QCBox_StatsLayer.currentText() == self.pc_name:
            self.stats_pc = (min, max, std, p25, p50, p75)
    # set in dialog
    self.stats_min.setText(str(round(min, 2)))
    self.stats_max.setText(str(round(max, 2)))
    self.stats_std.setText(str(round(std, 2)))
    self.stats_p25.setText(str(round(p25, 2)))
    self.stats_p50.setText(str(round(p50, 2)))
    self.stats_p75.setText(str(round(p75, 2)))
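Each statistic above is evaluated with its own `.compute()`, which walks over the data once per statistic. A hedged refinement (same statistics, synthetic data) that lets dask share the work in a single call:

import dask
import dask.array as da
import numpy as np

da_data = da.from_array(np.random.rand(1_000_000), chunks=(250_000,))
mn, mx, std, (p25, p50, p75) = dask.compute(
    da.min(da_data), da.max(da_data), da.std(da_data),
    da.percentile(da_data, [25, 50, 75]))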
Example #14
def test_workspace():
    ws = Workspace(".")
    workspace_fpath = "./newws1"
    ws = ws.create(workspace_fpath)
    data_fname = "./tmp/testvol_4x4x4b.h5"

    with dataset_from_uri(data_fname, mode="r") as data:
        chunk_size = optimal_chunksize(data, Config["computing.chunk_size"])
        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
        # ds.set_attr("chunk_size", chunk_size)

    ws.add_dataset("testds", "float32")
    assert ws.exists(workspace_fpath)
    assert ws.has_data()
    assert ws.available_datasets() == ['testds']
    ws.add_session('newsesh')
    assert ws.has_session('newsesh')

    ws.delete()
Example #15
def nearestPD(A, threads=1):
    """
    Find the nearest positive-definite matrix to input

    A Python/Numpy port (via Ahmed Fasih) of John D'Errico's `nearestSPD`
    MATLAB code [1], which credits [2]

    [1] https://www.mathworks.com/matlabcentral/fileexchange/42885-nearestspd

    [2] N.J. Higham, "Computing a nearest symmetric positive semidefinite
    matrix" (1988): https://doi.org/10.1016/0024-3795(88)90223-6
    """
    isPD = lambda x: da.all(np.linalg.eigvals(x) > 0).compute()
    B = (A + A.T) / 2
    _, s, V = da.linalg.svd(B)
    H = da.dot(V.T, da.dot(da.diag(s), V))
    A2 = (B + H) / 2
    A3 = (A2 + A2.T) / 2
    if isPD(A3):
        return A3
    spacing = da.spacing(da.linalg.norm(A))
    # The above is different from [1]. It appears that MATLAB's `chol` Cholesky
    # decomposition will accept matrixes with exactly 0-eigenvalue, whereas
    # Numpy's will not. So where [1] uses `eps(mineig)` (where `eps` is Matlab
    # for `np.spacing`), we use the above definition. CAVEAT: our `spacing`
    # will be much larger than [1]'s `eps(mineig)`, since `mineig` is usually on
    # the order of 1e-16, and `eps(1e-16)` is on the order of 1e-34, whereas
    # `spacing` will, for Gaussian random matrixes of small dimension, be on
    # the order of 1e-16. In practice, both ways converge, as the unit test
    # below suggests.
    eye_chunk = estimate_chunks((A.shape[0], A.shape[0]), threads=threads)[0]
    I = da.eye(A.shape[0], chunks=eye_chunk)
    k = 1
    while not isPD(A3):
        mineig = da.min(da.real(np.linalg.eigvals(A3)))
        A3 += I * (-mineig * k**2 + spacing)
        k += 1
    return A3
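For readers unfamiliar with the Higham construction, here is the symmetrization/SVD step from the function above in isolation, on a tiny single-chunk array so `da.linalg.svd`'s chunking requirements are trivially satisfied; illustrative only:

import dask.array as da
import numpy as np

A = da.from_array(np.array([[4.0, 1.0], [0.0, 3.0]]), chunks=(2, 2))
B = (A + A.T) / 2                       # symmetric part
_, s, V = da.linalg.svd(B)
H = da.dot(V.T, da.dot(da.diag(s), V))  # symmetric polar factor of B
A2 = ((B + H) / 2).compute()
print(bool(np.all(np.linalg.eigvals(A2) > 0)))  # True for this input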
Example #16
def two_point_stats(arr1,
                    arr2,
                    periodic_boundary=True,
                    cutoff=None,
                    mask=None):
    r"""Calculate the 2-points stats for two arrays

    The discretized two point statistics are given by

    .. math::

       f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l']

    where :math:`f[r \; \vert \; l, l']` is the conditional
    probability of finding the local states :math:`l` and :math:`l'` at
    a distance and orientation away from each other defined by the
    vector :math:`r`. `See this paper for more details on the
    notation. <https://doi.org/10.1007/s40192-017-0089-0>`_

    The array ``arr1[i]`` (state :math:`l`) is correlated with
    ``arr2[i]`` (state :math:`l'`) for each sample ``i``. Both arrays
    must have the same number of samples and nominal states (integer
    value) or continuous variables.

    To calculate multiple different correlations for each sample, see
    :func:`~pymks.correlations_multiple`.

    To use ``two_point_stats`` as part of a Scikit-learn pipeline, see
    :class:`~pymks.TwoPointCorrelation`.

    Args:
      arr1: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      arr2: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      periodic_boundary: whether to assume a periodic boundary
        (default is ``True``)
      cutoff: the subarray of the 2 point stats to keep
      mask: array specifying confidence in the measurement at a pixel,
        shape ``(n_samples,n_x,n_y)``. In range [0,1].

    Returns:
      the snipped 2-points stats

    If both arrays are Dask arrays then a Dask array is returned.

    >>> out = two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... )
    >>> out.chunks
    ((2,), (5,))
    >>> out.shape
    (2, 5)

    If either of the arrays are Numpy then a Numpy array is returned.

    >>> two_point_stats(
    ...     np.arange(10).reshape(2, 5),
    ...     np.arange(10).reshape(2, 5),
    ... )
    array([[ 3.,  4.,  6.,  4.,  3.],
           [48., 49., 51., 49., 48.]])

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False)[:, 1:-1, 1:-1],
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask =  da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask=mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """  # noqa: #501

    n_is_even = 1 - np.array(arr1.shape[1:]) % 2
    padding = np.array(arr1.shape[1:]) // 2

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + list(zip(padding, padding + n_is_even)),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask

        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = sequence(
                lambda x: x / arr1[0].size,
                dapad(
                    pad_width=[(0, 0)] + list(zip(0 * n_is_even, n_is_even)),
                    mode="wrap",
                ),
                lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
            )
        else:
            normalize = lambda x: x / auto_correlation(
                padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
Example #17
def triclustering(Z,
                  nclusters_row,
                  nclusters_col,
                  nclusters_bnd,
                  errobj,
                  niters,
                  epsilon,
                  row_clusters_init=None,
                  col_clusters_init=None,
                  bnd_clusters_init=None):
    """
    Run the tri-clustering, Dask implementation

    :param Z: d x m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param nclusters_bnd: number of band clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param bnd_clusters_init: initial band cluster assignment
    :return: has converged, number of iterations performed, final row,
    column, and band clustering, and error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [d, m, n] = Z.shape
    bnd_chunks, row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    bnd_clusters = da.array(bnd_clusters_init) \
        if bnd_clusters_init is not None \
        else _initialize_clusters(d, nclusters_bnd, chunks=bnd_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)
    B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate number of elements in each tri-cluster
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        nel_bnd_clusters = da.bincount(bnd_clusters, minlength=nclusters_bnd)
        logger.debug(
            'num of populated clusters: row {}, col {}, bnd {}'.format(
                da.sum(nel_row_clusters > 0).compute(),
                da.sum(nel_col_clusters > 0).compute(),
                da.sum(nel_bnd_clusters > 0).compute()))
        nel_clusters = da.einsum('i,j->ij', nel_row_clusters, nel_col_clusters)
        nel_clusters = da.einsum('i,jk->ijk', nel_bnd_clusters, nel_clusters)

        # calculate tri-cluster averages (epsilon takes care of empty clusters)
        # first sum values in each tri-cluster ..
        TriCavg = da.einsum('ij,ilm->jlm', B, Z)  # .. along band axis
        TriCavg = da.einsum('ij,kim->kjm', R, TriCavg)  # .. along row axis
        TriCavg = da.einsum('ij,kli->klj', C, TriCavg)  # .. along col axis
        # finally divide by number of elements in each tri-cluster
        TriCavg = (TriCavg + Gavg * epsilon) / (nel_clusters + epsilon)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the row cluster assignment
        idx = (1, 0, 2)
        d_row = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,kjl->kil', R, avg_unpck)  # .. along row axis
        # use these for the col cluster assignment
        idx = (2, 0, 1)
        d_col = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,kjl->kil', R, TriCavg)  # .. along row axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the band cluster assignment
        d_bnd = _distance(Z, avg_unpck, epsilon)
        bnd_clusters = da.argmin(d_bnd, axis=1)
        B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

        # Error value (actually just the band component really)
        old_e = e
        minvals = da.min(d_bnd, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, bnd_clusters, B, e = client.persist(
            [row_clusters, R, col_clusters, C, bnd_clusters, B, e])
        e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Triclustering converged in {s} iterations')
    else:
        logger.debug(f'Triclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, bnd_clusters, e
Example #18
def plot_dataset(X,
                 y,
                 images=None,
                 labels=None,
                 gray=False,
                 save=None,
                 y_original=None):
    print('data size {}'.format(X.shape))
    uni_y = len(da.unique(y).compute())

    x_min, x_max = da.min(X, 0), da.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    #if save is not None:
    #plt.figure(figsize=(27,18), dpi=600)
    #else:
    fig = plt.figure(figsize=(27, 18), dpi=100)
    ax = plt.subplot(111)

    for i in tqdm(range(X.shape[0])):
        plt.text(X[i, 0],
                 X[i, 1],
                 str(y[i]),
                 color=plt.cm.Set1(y[i] / uni_y),
                 fontdict={
                     'weight': 'bold',
                     'size': 9
                 })

    if images is not None:
        if hasattr(offsetbox, 'AnnotationBbox'):
            # only print thumbnails with matplotlib > 1.0
            shown_images = da.array([[1., 1.]])  # just something big
            for i in range(X.shape[0]):
                dist = da.sum((X[i] - shown_images)**2, 1)
                if da.min(dist) < 4e-3:
                    # don't show points that are too close
                    continue

                if labels is not None:
                    if y_original is not None:
                        plt.text(X[i, 0] - 0.01,
                                 X[i, 1] - 0.033,
                                 labels[y_original[i]],
                                 fontdict={
                                     'weight': 'bold',
                                     'size': 15
                                 })
                    else:
                        plt.text(X[i, 0] - 0.01,
                                 X[i, 1] - 0.033,
                                 labels[y[i]],
                                 fontdict={
                                     'weight': 'bold',
                                     'size': 15
                                 })

                shown_images = da.concatenate([shown_images, X[i][None]])  # dask.array has no r_
                if gray:
                    image_ = offsetbox.OffsetImage(
                        da.expand_dims(util.invert(images[i]), axis=0))
                else:
                    image_ = offsetbox.OffsetImage(images[i],
                                                   cmap=plt.cm.gray_r)

                imagebox = offsetbox.AnnotationBbox(image_, X[i])

                ax.add_artist(imagebox)

    plt.xticks([]), plt.yticks([])

    for item in [fig, ax]:
        item.patch.set_visible(False)

    ax.axis('off')

    if save is not None:
        print('Saving Image {} ...'.format(save))
        plt.title('epoch ' + save.split('.')[0].split()[-1],
                  fontdict={'fontsize': 20},
                  loc='left')
        plt.savefig(save)
        plt.close()
    else:
        plt.show()
    del X, y, fig, ax

    gc.collect()
Example #19
def two_point_stats(arr1,
                    arr2,
                    mask=None,
                    periodic_boundary=True,
                    cutoff=None):
    """Calculate the 2-points stats for two arrays

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      mask: array specifying confidence in the measurement at a pixel
        (n_samples,n_x,n_y).  In range [0,1].
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False),
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask =  da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]
    """

    cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2)
    if cutoff is None:
        cutoff = cutoff_
    cutoff = min(cutoff, cutoff_)

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask

        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = lambda x: x / arr1[0].size
        else:
            normalize = lambda x: x / auto_correlation(
                padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
Example #20
def coclustering(Z,
                 nclusters_row,
                 nclusters_col,
                 errobj,
                 niters,
                 epsilon,
                 col_clusters_init=None,
                 row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
    column clustering, and error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster based averages
        # nel_clusters is a matrix with the number of elements per co-cluster
        # originally computed as:  da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
                 (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column components really)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])
        if run_on_worker:
            # this is workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
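`_setup_cluster_matrix`, `_initialize_clusters` and `_distance` are internal helpers not shown in these snippets. As a hedged illustration of what the cluster matrix holds, a one-hot membership matrix can be built like this (an assumption about the helper, not the package's code):

import dask.array as da
import numpy as np

def setup_cluster_matrix(nclusters, cluster_idx):
    # one row per element, one column per cluster; True marks membership
    return da.eye(nclusters, dtype=bool)[cluster_idx]

labels = da.from_array(np.array([0, 2, 1, 2]), chunks=2)
print(setup_cluster_matrix(3, labels).compute().astype(int))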
Example #21
def density_flux(population, total_population, carrying_capacity, distance,
                 csx, csy, **kwargs):
    """
    'density-based dispersion'

    Dispersal is calculated using the following sequence of methods:

    Portions of populations at each element (node, or grid cell) in the study area array (raster) are moved to
    surrounding elements (a neighbourhood) within a radius that is defined by the input distance (:math:`d`), as
    presented in the conceptual figure below.

        .. image:: images/density_flux_neighbourhood.png
            :align: center

    .. attention:: No dispersal will occur if the provided distance is less than the distance between elements (grid cells) in the model domain, as none will be included in the neighbourhood

    The mean density (:math:`\\rho`) of all elements in the neighbourhood is calculated as:

    .. math::
       \\rho=\\frac{\\sum_{i=1}^{n} \\frac{pop_T(i)}{k_T(i)}}{n}

    where,

    :math:`pop_T` is the total population (of the entire species) at each element (:math:`i`); and\n
    :math:`k_T` is the total carrying capacity for the species

    The density gradient at each element (:math:`\\Delta`) with respect to the mean is calculated as:

    .. math::
        \\Delta(i)=\\frac{pop_T(i)}{k_T(i)}-\\rho

    If the centroid element is above the mean :math:`[\\Delta(i_0) > 0]`, it is able to release a portion of its
    population to elements in the neighbourhood. The eligible population to be received by surrounding elements is equal
    to the sum of populations at elements with negative density gradients, the :math:`candidates`:

    .. math::
        candidates=\\sum_{i=1}^{n} \\Delta(i)[\\Delta(i) < 0]k_T(i)

    The minimum of either the population above the mean at the centroid element - :math:`source=\\Delta(i_0)*k_T(i_0)`,
    or the :math:`candidates` are used to determine the total population that is dispersed from the centroid element to
    the other elements in the neighbourhood:

    .. math::
        dispersal=\\min\\{source, candidates\\}

    The population at the centroid element becomes:

    .. math::
        pop_a(i_0)=pop_a(i_0)-\\frac{pop_a(i_0)}{pop_T(i_0)}dispersal

    where,

    :math:`pop_a` is the age (stage) group population, which is a sub-population of the total.

    The populations of the candidate elements in the neighbourhood become (a net gain due to negative gradients):

    .. math::
        pop_a(i)=pop_a(i)-\\frac{\\Delta(i)[\\Delta(i) < 0]k_T(i)}{candidates}dispersal\\frac{pop_a(i)}{pop_T(i)}

    :param da.Array population: Sub-population to redistribute (subset of the ``total_population``)
    :param da.Array total_population: Total population
    :param da.Array carrying_capacity: Total Carrying Capacity (k)
    :param float distance: Maximum dispersal distance
    :param float csx: Cell size of the domain in the x-direction
    :param float csy: Cell size of the domain in the y-direction

    .. Attention:: Ensure the cell sizes are in the same units as the specified direction

    :Keyword Arguments:
        **mask** (*array*) --
            A weighting mask that scales dispersal based on the normalized mask value (default: None)
    :return: Redistributed population
    """
    if any([
            not isinstance(a, da.Array)
            for a in [population, total_population, carrying_capacity]
    ]):
        raise DispersalError('Inputs must be dask arrays')

    if distance == 0:
        # Don't do anything
        return population

    chunks = tuple(c[0] if c else 0 for c in population.chunks)[:2]

    mask = kwargs.get('mask', None)
    if mask is None:
        mask = da.ones(shape=population.shape, dtype='float32', chunks=chunks)

    # Normalize the mask
    mask_min = da.min(mask)
    _range = da.max(mask) - mask_min
    mask = da.where(_range > 0, (mask - mask_min) / _range, 1.)

    # Calculate the kernel indices and shape
    kernel = calculate_kernel(distance, csx, csy)
    if kernel is None:
        # Not enough distance to cover a grid cell
        return population
    kernel, m, n = kernel
    m = int(m)
    n = int(n)

    a = da.pad(da.dstack(
        [population, total_population, carrying_capacity, mask]),
               ((m, m), (n, n), (0, 0)),
               'constant',
               constant_values=0)
    _m = -m
    if m == 0:
        _m = None
    _n = -n
    if n == 0:
        _n = None
    output = delayed(density_flux_task)(a, kernel, m, n)[m:_m, n:_n, 0]
    output = da.from_delayed(output, population.shape, np.float32)

    return output.rechunk(chunks)
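The mask handling above normalizes the optional weighting mask with `da.min`/`da.max` and falls back to 1 when the mask is constant. That step in isolation, on a small synthetic mask:

import dask.array as da
import numpy as np

mask = da.from_array(np.array([[2., 4.], [6., 8.]]), chunks=1)
mask_min = da.min(mask)
_range = da.max(mask) - mask_min
mask = da.where(_range > 0, (mask - mask_min) / _range, 1.)
print(mask.compute())  # values rescaled to [0, 1]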