Example #1
def position_grid(shape, blocksize):
    """
    """

    coords = da.meshgrid(*[range(x) for x in shape], indexing='ij')
    coords = da.stack(coords, axis=-1).astype(np.int16)
    return da.rechunk(coords, chunks=tuple(blocksize) + (3, ))
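A minimal usage sketch (not part of the original source): it assumes the position_grid definition above plus numpy and dask.array imported as np and da, and builds the coordinate grid of a small 3-D volume split into 2x2x2 blocks.

import numpy as np
import dask.array as da

# hypothetical 4x4x4 volume split into 2x2x2 blocks
coords = position_grid((4, 4, 4), (2, 2, 2))
print(coords.shape)     # (4, 4, 4, 3)
print(coords.chunks)    # ((2, 2), (2, 2), (2, 2), (3,))
print(coords[1, 2, 3].compute())  # [1 2 3]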
Example #2
    def __call__(self, projectables, optional_datasets=None, **info):
        """Get the corrected reflectance when removing Rayleigh scattering.

        Uses pyspectral.
        """
        from pyspectral.rayleigh import Rayleigh
        if not optional_datasets or len(optional_datasets) != 4:
            vis, red = self.match_data_arrays(projectables)
            sata, satz, suna, sunz = self.get_angles(vis)
            red.data = da.rechunk(red.data, vis.data.chunks)
        else:
            vis, red, sata, satz, suna, sunz = self.match_data_arrays(
                projectables + optional_datasets)
            sata, satz, suna, sunz = optional_datasets
            # get the dask array underneath
            sata = sata.data
            satz = satz.data
            suna = suna.data
            sunz = sunz.data

        # First make sure the two azimuth angles are in the range 0-360:
        sata = sata % 360.
        suna = suna % 360.
        ssadiff = da.absolute(suna - sata)
        ssadiff = da.minimum(ssadiff, 360 - ssadiff)
        del sata, suna

        atmosphere = self.attrs.get('atmosphere', 'us-standard')
        aerosol_type = self.attrs.get('aerosol_type', 'marine_clean_aerosol')
        rayleigh_key = (vis.attrs['platform_name'], vis.attrs['sensor'],
                        atmosphere, aerosol_type)
        logger.info(
            "Removing Rayleigh scattering with atmosphere '%s' and "
            "aerosol type '%s' for '%s'", atmosphere, aerosol_type,
            vis.attrs['name'])
        if rayleigh_key not in self._rayleigh_cache:
            corrector = Rayleigh(vis.attrs['platform_name'],
                                 vis.attrs['sensor'],
                                 atmosphere=atmosphere,
                                 aerosol_type=aerosol_type)
            self._rayleigh_cache[rayleigh_key] = corrector
        else:
            corrector = self._rayleigh_cache[rayleigh_key]

        try:
            refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff,
                                                      vis.attrs['name'],
                                                      red.data)
        except (KeyError, IOError):
            logger.warning(
                "Could not get the reflectance correction using band name: %s",
                vis.attrs['name'])
            logger.warning(
                "Will try use the wavelength, however, this may be ambiguous!")
            refl_cor_band = corrector.get_reflectance(
                sunz, satz, ssadiff, vis.attrs['wavelength'][1], red.data)
        proj = vis - refl_cor_band
        proj.attrs = vis.attrs
        self.apply_modifier_info(vis, proj)
        return proj
Example #3
def filt_blocks_da(dask_array, i_starts, i_end=None, func=None, *args):
    """
    Apply function to each block of numpy array separately (function is , can be provided other to, for example, filter array)
    :param dask_array: dask array, to filter, may be with unknown chunks as for dask series.values
    :param i_starts: numpy array, indexes of starts of bocks
    :param i_end: len(dask_array) if None then last element of i_starts must be equal to it else i_end should not be in i_starts
    # specifing this removes warning 'invalid value encountered in less'
    :param func: numpy.interp by default interp(NaNs) used if None
    returns: dask array of same size as x with func upplied

    >>> Pfilt = filt_blocks_da(a['P'].values, i_burst, i_end=len(a))
    ... sum(~isfinite(a['P'].values.compute())), sum(~isfinite(Pfilt))  # some nans was removed
    : (6, 0)
    # other values unchanged
    >>> allclose(Pfilt[isfinite(a['P'].values.compute())], a['P'].values[isfinite(a['P'].values)].compute())
    : True
    """
    if func is None:
        func = np.interp
    if i_end:
        i_starts = np.append(i_starts, i_end)
    else:
        i_end = i_starts[-1]

    if np.isnan(dask_array.size):  # unknown chunks delayed transformation
        dask_array = da.from_delayed(dask_array.to_delayed()[0],
                                     shape=(i_end, ),
                                     dtype=np.float64,
                                     name='filt')

    y = da.rechunk(dask_array, chunks=(tuple(np.diff(i_starts).tolist()), ))
    y_out = y.map_blocks(func, dtype=np.float64, name='blocks_da')
    return y_out
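The key step above is rechunking the 1-D array so that chunk boundaries coincide with the block starts (np.diff(i_starts)) before map_blocks. A self-contained sketch of that pattern with made-up data (not from the source):

import numpy as np
import dask.array as da

x = da.from_array(np.arange(10.0), chunks=5)
i_starts = np.array([0, 3, 7, 10])   # block starts; the last value equals len(x)
y = da.rechunk(x, chunks=(tuple(np.diff(i_starts).tolist()), ))
print(y.chunks)                      # ((3, 4, 3),)
# each block is now processed independently by map_blocks
print(y.map_blocks(lambda b: b - b.min(), dtype=np.float64).compute())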
Example #4
def zero_pad(arr, shape, chunks):
    """Zero pad an array with zeros

    Args:
      arr: the array to pad
      shape: the shape of the new array
      chunks: how to rechunk the new array

    Returns:
      the new padded version of the array

    >>> print(
    ...     zero_pad(
    ...         np.arange(4).reshape([1, 2, 2, 1]),
    ...         (1, 4, 5, 1),
    ...         None
    ...     )[0,...,0].compute()
    ... )
    [[0 0 0 0 0]
     [0 0 0 1 0]
     [0 0 2 3 0]
     [0 0 0 0 0]]
    >>> print(zero_pad(np.arange(4).reshape([2, 2]), (4, 5), None).compute())
    [[0 0 0 0 0]
     [0 0 0 1 0]
     [0 0 2 3 0]
     [0 0 0 0 0]]
    >>> zero_pad(zero_pad(np.arange(4).reshape([2, 2]), (4, 5, 1), None))
    Traceback (most recent call last):
    ...
    RuntimeError: length of shape is incorrect
    >>> zero_pad(zero_pad(np.arange(4).reshape([2, 2]), (1, 2), None))
    Traceback (most recent call last):
    ...
    RuntimeError: resize shape is too small

    >>> arr = da.from_array(np.arange(4).reshape((2, 2)), chunks=(2, 1))
    >>> out = zero_pad(arr, (4, 3), (-1, 1))
    >>> out.shape
    (4, 3)
    >>> out.chunks
    ((4,), (1, 1, 1))
    """
    if len(shape) != len(arr.shape):
        raise RuntimeError("length of shape is incorrect")

    if not np.all(shape >= arr.shape):
        raise RuntimeError("resize shape is too small")

    return pipe(
        np.array(shape) - np.array(arr.shape),
        lambda x: np.concatenate(
            ((x - (x // 2))[..., None], (x // 2)[..., None]), axis=1
        ),
        fmap(tuple),
        tuple,
        lambda x: da.pad(arr, x, "constant", constant_values=0),
        lambda x: da.rechunk(x, chunks=chunks or x.shape),
    )
Example #5
def comb_distance(spec_dist, temp_dist, spat_dist):
    if logWeight == True:
        spec_dist = da.log(spec_dist + 1)
        temp_dist = da.log(temp_dist + 1)

    comb_dist = da.rechunk(spec_dist * temp_dist * spat_dist,
                           chunks=spec_dist.chunksize)
    print("Done comb distance!", comb_dist)

    return comb_dist
Example #6
    def __call__(self, projectables, optional_datasets=None, **info):
        """Get the corrected reflectance when removing Rayleigh scattering.

        Uses pyspectral.
        """
        from pyspectral.rayleigh import Rayleigh
        if not optional_datasets or len(optional_datasets) != 4:
            vis, red = self.check_areas(projectables)
            sata, satz, suna, sunz = self.get_angles(vis)
            red.data = da.rechunk(red.data, vis.data.chunks)
        else:
            vis, red, sata, satz, suna, sunz = self.check_areas(
                projectables + optional_datasets)
            sata, satz, suna, sunz = optional_datasets
            # get the dask array underneath
            sata = sata.data
            satz = satz.data
            suna = suna.data
            sunz = sunz.data

        LOG.info('Removing Rayleigh scattering and aerosol absorption')

        # First make sure the two azimuth angles are in the range 0-360:
        sata = sata % 360.
        suna = suna % 360.
        ssadiff = da.absolute(suna - sata)
        ssadiff = da.minimum(ssadiff, 360 - ssadiff)
        del sata, suna

        atmosphere = self.attrs.get('atmosphere', 'us-standard')
        aerosol_type = self.attrs.get('aerosol_type', 'marine_clean_aerosol')
        rayleigh_key = (vis.attrs['platform_name'],
                        vis.attrs['sensor'], atmosphere, aerosol_type)
        if rayleigh_key not in self._rayleigh_cache:
            corrector = Rayleigh(vis.attrs['platform_name'], vis.attrs['sensor'],
                                 atmosphere=atmosphere,
                                 aerosol_type=aerosol_type)
            self._rayleigh_cache[rayleigh_key] = corrector
        else:
            corrector = self._rayleigh_cache[rayleigh_key]

        try:
            refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff,
                                                      vis.attrs['name'],
                                                      red.data)
        except (KeyError, IOError):
            LOG.warning("Could not get the reflectance correction using band name: %s", vis.attrs['name'])
            LOG.warning("Will try use the wavelength, however, this may be ambiguous!")
            refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff,
                                                      vis.attrs['wavelength'][1],
                                                      red.data)
        proj = vis - refl_cor_band
        proj.attrs = vis.attrs
        self.apply_modifier_info(vis, proj)
        return proj
Example #7
    def _call_pandas_groupby_statistics(self,
                                        scipy_method,
                                        data,
                                        fill_value=None,
                                        skipna=None):
        """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
        import dask.dataframe as dd
        import pandas as pd

        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()

        # Remove NaN values from the data when used as weights
        weights = da.where(np.isnan(data), 0, data)

        # Rechunk indices to match the data chunking
        if weights.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, weights.chunks)

        # Calculate the statistic of the data falling into each bin
        out_size = self.target_area.size

        # merge into one Dataframe
        df = dd.concat(
            [dd.from_dask_array(self.idxs),
             dd.from_dask_array(weights)],
            axis=1)
        df.columns = ['x', 'values']

        if scipy_method == 'min':
            statistics = df.map_partitions(lambda part: part.groupby(
                np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                    'values'].min())

        elif scipy_method == 'max':
            statistics = df.map_partitions(lambda part: part.groupby(
                np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                    'values'].max())

        # fill missing indices
        statistics = (statistics + pd.Series(np.zeros(out_size))).fillna(0)

        counts = self.get_sum(np.logical_not(np.isnan(data)).astype(
            np.int64)).ravel()

        # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed
        statistics = self._mask_bins_with_nan_if_not_skipna(
            skipna, data, out_size, statistics)

        # set bin without data to fill value
        statistics = da.where(counts == 0, fill_value, statistics)

        return statistics.reshape(self.target_area.shape)
Example #8
    def get_chunk_index(self, chunk_shape, nav_shape):
        if (nav_shape is not None
                and np.prod(nav_shape) != self.image_dict["NumFrames"]):
            num_frames = np.prod(nav_shape)
        else:
            num_frames = self.image_dict["NumFrames"]
        indexes = da.arange(num_frames)
        if nav_shape is not None:
            indexes = da.reshape(indexes, nav_shape)
        indexes = da.rechunk(indexes, chunks=chunk_shape)
        return indexes
Example #9
def partition(image, folder):
    # create a dask array from the image in chunks (31 x 150)
    image_da = da.from_array(image, chunks = (windowSize,image.shape[1]))
    # padding the array before and after with 15 pixels
    image_pad = da.pad(image_da, windowSize//2, mode='constant')
    
    for i in range(0,windowSize):
        row = str(i)
        block_i = image_pad[i:,:]
        block_i_da = da.rechunk(block_i, chunks=(windowSize,image_pad.shape[1]))
        block_i_da.map_blocks(block2row, dtype=int, row=row, folder=folder).compute()
Example #10
def partition(image, folder):
    image_da = da.from_array(image, chunks=(windowSize, image.shape[1]))
    image_pad = da.pad(image_da, windowSize // 2, mode='constant')

    for i in range(0, windowSize):
        row = str(i)
        block_i = image_pad[i:, :]
        block_i_da = da.rechunk(block_i,
                                chunks=(windowSize, image_pad.shape[1]))
        block_i_da.map_blocks(block2row, dtype=int, row=row,
                              folder=folder).compute()
Example #11
    def _permute(self, array):
        # Ensure only a single chunk on the baseline axis, so that we can do
        # the permutation on a chunk-by-chunk basis. This is more efficient
        # than using dask to do the permutation.
        #
        # Currently katdal already does this step, but the code is left here
        # in case that changes in future.
        if array.numblocks[2] != 1:
            array = da.rechunk(array, chunks={2: -1})  # pragma: nocover
        index = np.s_[:, :, self._corr_product_permutation]
        return da.map_blocks(lambda block: block[index],
                             array,
                             chunks=(array.chunks[0], array.chunks[1],
                                     (len(self._corr_product_permutation), )),
                             dtype=array.dtype)
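A standalone illustration of the rechunk-by-dict idiom used above: passing chunks={axis: -1} merges that axis into a single chunk so every block sees the full axis. The array and permutation below are made up for the sketch.

import numpy as np
import dask.array as da

arr = da.ones((4, 6, 8), chunks=(2, 3, 4))
arr = da.rechunk(arr, chunks={2: -1})      # merge only axis 2 into one chunk
print(arr.chunks)                          # ((2, 2), (3, 3), (8,))

perm = np.arange(8)[::-1]                  # toy permutation of the last axis
out = da.map_blocks(lambda block: block[:, :, perm], arr, dtype=arr.dtype)
print(out.shape)                           # (4, 6, 8)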
Example #12
def da_stack(folder, shape):
    da_list = [] 
    full_path = path + folder
    max_blocks = shape[0]//windowSize + 1 
    
    for block in range(1,max_blocks + 1):
        for row in range(0,windowSize):
            name = str(block) + 'r' + str(row)
            full_name = full_path + name + '.zarr'
            try:
                da_array = da.from_zarr(full_name)
                da_list.append(da_array) 
            except Exception:
                continue
      
    return da.rechunk(da.concatenate(da_list, axis=0), chunks = (shape[1],windowSize**2))
Example #13
def two_point_stats(arr1, arr2, periodic_boundary=True, cutoff=None):
    """Calculate the 2-points stats for two arrays

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)

    """
    cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2)
    if cutoff is None:
        cutoff = cutoff_
    cutoff = min(cutoff, cutoff_)

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    nonperiodic_normalize = lambda x: x / auto_correlation(
        padder(np.ones_like(arr1)))

    normalize = identity if periodic_boundary else nonperiodic_normalize

    return sequence(
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
Example #14
    def get_sum(self, data, skipna=True):
        """Calculate sums for each bin with drop-in-a-bucket resampling.

        Parameters
        ----------
        data : Numpy or Dask array
            Data to be binned and summed.
        skipna : boolean (optional)
                If True, skips NaN values for the sum calculation
                (similarly to Numpy's `nansum`). Buckets containing only NaN are set to zero.
                If False, sets the bucket to NaN if one or more NaN values are present in the bucket
                (similarly to Numpy's `sum`).
                In both cases, empty buckets are set to 0.
                Default: True

        Returns
        -------
        data : Numpy or Dask array
            Bin-wise sums in the target grid
        """
        LOG.info("Get sum of values in each location")

        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()

        # Remove NaN values from the data when used as weights
        weights = da.where(np.isnan(data), 0, data)

        # Rechunk indices to match the data chunking
        if weights.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, weights.chunks)

        # Calculate the sum of the data falling to each bin
        out_size = self.target_area.size
        sums, _ = da.histogram(self.idxs,
                               bins=out_size,
                               range=(0, out_size),
                               weights=weights,
                               density=False)

        # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed
        sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size,
                                                      sums)

        return sums.reshape(self.target_area.shape)
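The bucket sum above reduces to a weighted histogram over flat target-grid indices. A small self-contained sketch of that idea (idxs and data are toy values, not the resampler's internals):

import numpy as np
import dask.array as da

out_size = 4                                   # e.g. a 2x2 target grid
idxs = da.from_array(np.array([0, 1, 1, 3, 3, 3]), chunks=3)
data = da.from_array(np.array([1., 2., 3., 4., np.nan, 6.]), chunks=3)

weights = da.where(np.isnan(data), 0, data)    # NaNs contribute zero weight
sums, _ = da.histogram(idxs, bins=out_size, range=(0, out_size),
                       weights=weights, density=False)
print(sums.compute())                          # [ 1.  5.  0. 10.]
print(sums.reshape((2, 2)).compute())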
Example #15
def single_window(df,
                  rgeno,
                  tgeno,
                  threads=1,
                  max_memory=None,
                  justd=False,
                  extend=False):
    ridx = df.i_ref.values
    tidx = df.i_tar.values
    rg = rgeno[:, ridx]
    tg = tgeno[:, tidx]
    if extend:
        # extend the genotype at both ends to avoid edge effects
        ridx_a, ridx_b = np.array_split(ridx, 2)
        tidx_a, tidx_b = np.array_split(tidx, 2)
        rg = da.concatenate(
            [rgeno[:, (ridx_a[::-1][:-1])], rg, rgeno[:, (ridx_b[::-1][1:])]],
            axis=1)
        tg = da.concatenate(
            [tgeno[:, (tidx_a[::-1][:-1])], tg, tgeno[:, (tidx_b[::-1][1:])]],
            axis=1)
        D_r = da.dot(rg.T, rg) / rg.shape[0]
        D_t = da.dot(tg.T, tg) / tg.shape[0]
        # remove the extras
        D_r = D_r[:, (ridx_a.shape[0] + 1):][:, :(ridx.shape[0])]
        D_r = D_r[(ridx_a.shape[0] + 1):, :][:(ridx.shape[0]), :]
        D_t = D_t[:, (tidx_a.shape[0] + 1):][:, :(tidx.shape[0])]
        D_t = D_t[(tidx_a.shape[0] + 1):, :][:(tidx.shape[0]), :]
        assert D_r.shape[1] == ridx.shape[0]
        assert D_t.shape[1] == tidx.shape[0]
    else:
        D_r = da.dot(rg.T, rg) / rg.shape[0]
        D_t = da.dot(tg.T, tg) / tg.shape[0]
    if justd:
        return df.snp, D_r, D_t
    cot = da.diag(da.dot(D_r, D_t))
    ref = da.diag(da.dot(D_r, D_r))
    tar = da.diag(da.dot(D_t, D_t))
    stacked = da.stack([df.snp, ref, tar, cot], axis=1)
    c_h_u_n_k_s = estimate_chunks(stacked.shape, threads, max_memory)
    stacked = da.rechunk(stacked, chunks=c_h_u_n_k_s)
    columns = ['snp', 'ref', 'tar', 'cotag']
    return dd.from_dask_array(stacked, columns=columns).compute()
Example #16
    def executeLabeledTraining(self, client: Client = None):
        # Train Model over the labeled instances
        if client is not None:
            with joblib.parallel_backend("dask"):
                self._ml_technique.fit(
                    da.rechunk(self._X[self._label_idx.index, :]),
                    da.rechunk(self._Y[self._label_idx.index]))

                # predict the results over the labeled test instances
                if hasattr(self._ml_technique, 'predict_classes'):
                    label_pred = self._ml_technique.predict_classes(
                        da.rechunk(self._X[self._test_idx, :]))
                else:
                    label_pred = self._ml_technique.predict(
                        da.rechunk(self._X[self._test_idx, :]))
        else:
            self._ml_technique.fit(
                da.rechunk(self._X[self._label_idx.index, :]),
                da.rechunk(self._Y[self._label_idx.index]))
            # predict the results over the labeled test instances
            if hasattr(self._ml_technique, 'predict_classes'):
                label_pred = self._ml_technique.predict_classes(
                    da.rechunk(self._X[self._test_idx, :]))
            else:
                label_pred = self._ml_technique.predict(
                    da.rechunk(self._X[self._test_idx, :]))

        # performance calc for all metrics
        label_perf = []
        for metric in self._performance_metrics:
            value = delayed(
                metric.compute(y_true=self._Y[self._test_idx],
                               y_pred=label_pred))
            label_perf.append(
                delayed({
                    "name": metric.metric_name,
                    "value": value
                }))

        return label_pred, compute(label_perf)[0]
Example #17
    def get_sum(self, data, mask_all_nan=False):
        """Calculate sums for each bin with drop-in-a-bucket resampling.

        Parameters
        ----------
        data : Numpy or Dask array
        mask_all_nan : boolean (optional)
            Mask bins that have only NaN results, default: False

        Returns
        -------
        data : Numpy or Dask array
            Bin-wise sums in the target grid
        """
        LOG.info("Get sum of values in each location")
        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()
        # Remove NaN values from the data when used as weights
        weights = da.where(np.isnan(data), 0, data)

        # Rechunk indices to match the data chunking
        if weights.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, weights.chunks)

        # Calculate the sum of the data falling to each bin
        out_size = self.target_area.size
        sums, _ = da.histogram(self.idxs,
                               bins=out_size,
                               range=(0, out_size),
                               weights=weights,
                               density=False)

        if mask_all_nan:
            nans = np.isnan(data)
            nan_sums, _ = da.histogram(self.idxs[nans],
                                       bins=out_size,
                                       range=(0, out_size))
            counts = self.get_count().ravel()
            sums = da.where(nan_sums == counts, np.nan, sums)

        return sums.reshape(self.target_area.shape)
Example #18
    def dask_arr(self, videos, freq='S'):
        '''xarray representation of all videos in one folder (event)'''
        start_time = self._r_timestamp(videos[0])
        lazy = [self.read_video(video) for video in videos]
        sample = lazy[0].compute()
        _, h, w, c = sample.shape
        da_array = [
            da.from_delayed(arr, dtype=np.uint8, shape=sample.shape)
            for arr in lazy
        ]
        da_array = da.stack(da_array)
        # da_array= da.reshape(da_array, (da_array.shape[0]*da_array.shape[1], 1080,1920,3), chunks=(1,1080,1920,3))
        da_array = da_array.reshape(da_array.shape[0] * da_array.shape[1], h,
                                    w, c)
        da_array = da.rechunk(da_array, (1, h, w, c))
        print(da_array)
        end_time = start_time + datetime.timedelta(seconds=da_array.shape[0] -
                                                   1)

        return da_array, pd.date_range(start_time, end_time, freq='S')
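The essential rechunk here gives every video frame its own chunk so frames can be pulled out of the stack independently. A hedged sketch with a synthetic stack (no video reading involved):

import numpy as np
import dask.array as da

frames = da.zeros((10, 4, 6, 3), dtype=np.uint8, chunks=(10, 4, 6, 3))
frames = da.rechunk(frames, (1, 4, 6, 3))   # one chunk per frame
print(frames.chunks[0])                     # (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)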
Example #19
def weighting(spec_dist, temp_dist, comb_dist, similar_pixels_filtered):
    # Assign max weight (1) when the temporal or spectral distance is zero
    zero_spec_dist = da.where(spec_dist[:,mid_idx][:,None] == 1, 1, 0)
    zero_temp_dist = da.where(temp_dist[:,mid_idx][:,None] == 1, 1, 0)
    zero_dist_mid = da.where((zero_spec_dist == 1), 
                             zero_spec_dist, zero_temp_dist)
    shape = da.subtract(spec_dist.shape,(0,1))
    zero_dist = da.zeros(shape, chunks=(spec_dist.shape[0],shape[1]))
    zero_dist = da.insert(zero_dist, [mid_idx], zero_dist_mid, axis=1)
    weights = da.where((da.sum(zero_dist,1)[:,None] == 1), zero_dist, comb_dist)
    
    # Calculate weights only for the filtered spectrally similar pixels
    weights_filt = weights*similar_pixels_filtered
    
    # Normalize weights
    norm_weights = da.rechunk(weights_filt/(da.sum(weights_filt,1)[:,None]), 
                              chunks = spec_dist.chunksize)
    
    print ("Done weighting!", norm_weights)
    
    return norm_weights
Example #20
def dec10216(inbuf):
    """Decode 10 bits data into 16 bits words.

    ::

        /*
         * pack 4 10-bit words in 5 bytes into 4 16-bit words
         *
         * 0       1       2       3       4       5
         * 01234567890123456789012345678901234567890
         * 0         1         2         3         4
         */
        ip = &in_buffer[i];
        op = &out_buffer[j];
        op[0] = ip[0]*4 + ip[1]/64;
        op[1] = (ip[1] & 0x3F)*16 + ip[2]/16;
        op[2] = (ip[2] & 0x0F)*64 + ip[3]/4;
        op[3] = (ip[3] & 0x03)*256 +ip[4];

    """
    arr10 = inbuf.astype(np.uint16)
    arr16_len = int(len(arr10) * 4 / 5)
    arr10_len = int((arr16_len * 5) / 4)
    arr10 = arr10[:arr10_len]  # adjust size

    # dask is slow with indexing
    arr10_0 = arr10[::5]
    arr10_1 = arr10[1::5]
    arr10_2 = arr10[2::5]
    arr10_3 = arr10[3::5]
    arr10_4 = arr10[4::5]

    arr16_0 = (arr10_0 << 2) + (arr10_1 >> 6)
    arr16_1 = ((arr10_1 & 63) << 4) + (arr10_2 >> 4)
    arr16_2 = ((arr10_2 & 15) << 6) + (arr10_3 >> 2)
    arr16_3 = ((arr10_3 & 3) << 8) + arr10_4
    arr16 = da.stack([arr16_0, arr16_1, arr16_2, arr16_3], axis=-1).ravel()
    arr16 = da.rechunk(arr16, arr16.shape[0])

    return arr16
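A quick check of the decoder (assuming the dec10216 definition above and the usual numpy/dask.array imports): the four 10-bit words 1023, 0, 512 and 3 are hand-packed into five bytes following the C comment, then decoded back.

import numpy as np
import dask.array as da

packed = da.from_array(np.array([255, 192, 8, 0, 3], dtype=np.uint8), chunks=5)
print(dec10216(packed).compute())   # [1023    0  512    3]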
Example #21
    def _call_bin_statistic(self,
                            statistic_method,
                            data,
                            fill_value=None,
                            skipna=None):
        """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
        if isinstance(data, xr.DataArray):
            data = data.data
        data = data.ravel()

        # Rechunk indices to match the data chunking
        if data.chunks != self.idxs.chunks:
            self.idxs = da.rechunk(self.idxs, data.chunks)

        out_shape = self.target_area.shape

        statistics = da.from_delayed(_get_statistics(statistic_method, data,
                                                     self.idxs, out_shape),
                                     shape=out_shape,
                                     dtype=np.float64)

        return statistics
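The da.from_delayed call above wraps a lazy computation whose output shape is known up front. A minimal hedged sketch of that wrapping (the statistic function here is a stand-in, not the resampler's _get_statistics):

import numpy as np
import dask.array as da
from dask import delayed

@delayed
def toy_statistics(values):
    # stand-in for the delayed per-bin statistics computation
    return np.array([[values.min(), values.max()]], dtype=np.float64)

stats = da.from_delayed(toy_statistics(np.arange(10)), shape=(1, 2), dtype=np.float64)
print(stats.compute())   # [[0. 9.]]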
Example #22
def correlations_multiple(data,
                          correlations,
                          periodic_boundary=True,
                          cutoff=None):
    """Calculate 2-point stats for a multiple auto/cross correlation

    Args:
      data: the discretized data (n_samples,n_x,n_y,n_correlation)
      correlations: the correlation pairs
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the 2-points stats array

    >>> data = np.arange(18).reshape(1, 3, 3, 2)
    >>> out = correlations_multiple(data, [[0, 1], [1, 1]])
    >>> out
    dask.array<stack, shape=(1, 3, 3, 2), dtype=float64, chunksize=(1, 3, 3, 1)>
    >>> answer = np.array([[[58, 62, 58], [94, 98, 94], [58, 62, 58]]]) + 1. / 3.
    >>> assert(out.compute()[...,0], answer)
    """

    return pipe(
        range(data.shape[-1]),
        map_(lambda x: (0, x)),
        lambda x: correlations if correlations else x,
        map_(lambda x: two_point_stats(
            data[..., x[0]],
            data[..., x[1]],
            periodic_boundary=periodic_boundary,
            cutoff=cutoff,
        )),
        list,
        lambda x: da.stack(x, axis=-1),
        lambda x: da.rechunk(x, x.chunks[:-1] + (-1, )),
    )
Example #23
def _apply(func,
           datasets,
           chunk=CHUNK,
           pad=None,
           relabel=False,
           stack=False,
           compute=True,
           out=None,
           normalize=False,
           **kwargs):
    """
    Applies a function to a given set of datasets. Wraps a standard
    function call of the form:

        func(*datasets, **kwargs)

    Named parameters give extra functionality.

    Parameters
    ----------
    func: callable
        Function to be mapped across datasets.
    datasets: list of numpy array-like
        Input datasets.
    chunk: boolean
        If `True` then input datasets will be assumed to be `Dask.Array`s and
        the function will be mapped across arrays blocks.
    pad: None, int or iterable
        The padding to apply (only if `chunk = True`). If `pad != None` then
        `dask.array.ghost.map_overlap` will be used to map the function across
        overlapping blocks, otherwise `dask.array.map_blocks` will be used.
    relabel: boolean
        Some of the labelling functions will yield local labelling if `chunk=True`.
        If `func` is a labelling function, set `relabel = True` to map the result
        for global consistency. See `survos2.improc.utils.dask_relabel_chunks` for
        more details.
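    stack: boolean
        If `True` and more than one dataset is given, the datasets are stacked
        along a new leading axis and processed as a single dataset.
    normalize: boolean
        If `True` (and `relabel` is `False`), the computed result is divided by
        its maximum absolute value.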
    compute: boolean
        If `True` the result will be computed and returned in numpy array form,
        otherwise a `dask.delayed` will be returned if `chunk = True`.
    out: None or numpy array-like
        if `out != None` then the result will be stored in there.
    **kwargs: other keyword arguments
        Arguments to be passed to `func`.

    Returns
    -------
    result: numpy array-like
        The computed result if `compute = True` or `chunk = False`, the result
        of the lazy wrapping otherwise.
    """
    if stack and len(datasets) > 1:
        dataset = da.stack(datasets, axis=0)
        dataset = da.rechunk(dataset,
                             chunks=(dataset.shape[0], ) + dataset.chunks[1:])
        datasets = [dataset]

    if chunk == True:
        kwargs.setdefault('dtype', out.dtype if out else datasets[0].dtype)
        kwargs.setdefault('drop_axis', 0 if stack else None)
        if pad is None or pad == False:
            result = da.map_blocks(func, *datasets, **kwargs)
        elif len(datasets) == 1:
            if np.isscalar(pad):
                pad = [pad] * datasets[0].ndim

            if stack:
                pad[0] = 0  # don't pad feature channel
                depth = {i: d for i, d in enumerate(pad)}
                trim = {i: d for i, d in enumerate(pad[1:])}
            else:
                depth = trim = {i: d for i, d in enumerate(pad)}

            g = da.ghost.ghost(datasets[0], depth=depth, boundary='reflect')
            r = g.map_blocks(func, **kwargs)
            result = da.ghost.trim_internal(r, trim)
        else:
            raise ValueError('`pad` only works with a single dataset')

        rchunks = result.chunks

        if not relabel and normalize:
            result = result / da.nanmax(da.fabs(result))

        if out is not None:
            result.store(out, compute=True)
        elif compute:
            result = result.compute()

        if relabel:
            if out is not None:
                result = dask_relabel_chunks(da.from_array(out,
                                                           chunks=rchunks))
                result.store(out, compute=True)
            else:
                result = dask_relabel_chunks(
                    da.from_array(result, chunks=rchunks))
                if compute:
                    result = result.compute()
    else:
        result = func(*datasets, **kwargs)
        if out is not None:
            out[...] = result

    if out is None:
        return result
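Note that da.ghost.ghost / da.ghost.trim_internal come from older dask releases; current dask exposes the same overlap-then-trim pattern as map_overlap. A minimal hedged sketch of the equivalent idea (assuming scipy is available):

import dask.array as da
from scipy.ndimage import uniform_filter

x = da.random.random((100, 100), chunks=(50, 50))
# give each block a 2-pixel halo so block edges match the global filter result
smoothed = x.map_overlap(uniform_filter, depth=2, boundary='reflect',
                         size=5, dtype=x.dtype)
print(smoothed.shape)   # (100, 100)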
Example #24
    def image_tikhonov(self,
                       vis_arr,
                       sphere,
                       alpha,
                       scale=True,
                       usedask=False):
        n_s = sphere.pixels.shape[0]
        n_v = self.u_arr.shape[0]

        lambduh = alpha / np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("augmented: {}".format(gamma.shape))

            vis_aux = vis_to_real(vis_arr)
            logger.info("vis mean: {} shape: {}".format(
                np.mean(vis_aux), vis_aux.shape))

            tol = min(alpha / 1e4, 1e-10)
            logger.info("Solving tol={} ...".format(tol))

            # reg = linear_model.ElasticNet(alpha=alpha/np.sqrt(n_s),
            # tol=1e-6,
            # l1_ratio = 0.01,
            # max_iter=100000,
            # positive=True)
            if False:
                (
                    sky,
                    lstop,
                    itn,
                    r1norm,
                    r2norm,
                    anorm,
                    acond,
                    arnorm,
                    xnorm,
                    var,
                ) = scipy.sparse.linalg.lsqr(gamma,
                                             vis_aux,
                                             damp=alpha,
                                             show=True)
                logger.info(
                    "Alpha: {}: Iterations: {}: rnorm: {}: xnorm: {}".format(
                        alpha, itn, r2norm, xnorm))
            else:
                reg = linear_model.Ridge(alpha=alpha,
                                         tol=tol,
                                         solver="lsqr",
                                         max_iter=100000)

                reg.fit(gamma, vis_aux)
                logger.info("    Solve Complete, iter={}".format(reg.n_iter_))

                sky = da.from_array(reg.coef_)

                residual = vis_aux - gamma @ sky

                sky, residual_norm, solution_norm = da.compute(
                    sky,
                    np.linalg.norm(residual)**2,
                    np.linalg.norm(sky)**2)

                score = reg.score(gamma, vis_aux)
                logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
                    alpha, score, residual_norm, solution_norm))

        else:
            from dask_ml.linear_model import LinearRegression
            import dask_glm
            from dask.distributed import Client, LocalCluster
            from dask.diagnostics import ProgressBar
            import dask

            logger.info("Starting Dask Client")

            if True:
                cluster = LocalCluster(dashboard_address=":8231",
                                       processes=False)
                client = Client(cluster)
            else:
                client = Client("tcp://localhost:8786")

            logger.info("Client = {}".format(client))

            harmonic_list = []
            p2j = 2 * np.pi * 1.0j

            dl = sphere.l
            dm = sphere.m
            dn = sphere.n

            n_arr_minus_1 = dn - 1

            du = self.u_arr
            dv = self.v_arr
            dw = self.w_arr

            for u, v, w in zip(du, dv, dw):
                harmonic = da.from_array(
                    np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) /
                    np.sqrt(sphere.npix),
                    chunks=(n_s, ),
                )
                harmonic = client.persist(harmonic)
                harmonic_list.append(harmonic)

            gamma = da.stack(harmonic_list)
            logger.info("Gamma Shape: {}".format(gamma.shape))
            # gamma = gamma.reshape((n_v, n_s))
            gamma = gamma.conj()
            gamma = client.persist(gamma)

            logger.info("Gamma Shape: {}".format(gamma.shape))

            logger.info("Building Augmented Operator...")
            proj_operator_real = da.real(gamma)
            proj_operator_imag = da.imag(gamma)
            proj_operator = da.block([[proj_operator_real],
                                      [proj_operator_imag]])

            proj_operator = client.persist(proj_operator)

            logger.info("Proj Operator shape {}".format(proj_operator.shape))
            vis_aux = da.from_array(
                np.array(
                    np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                    dtype=np.float32,
                ))

            # logger.info("Solving...")

            en = dask_glm.regularizers.ElasticNet(weight=0.01)
            en = dask_glm.regularizers.L2()
            # dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            ##dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            # dv = da.from_array(vis_aux)

            dask.config.set({"array.chunk-size": "1024MiB"})
            A = da.rechunk(proj_operator, chunks=("auto", n_s))
            A = client.persist(A)
            y = vis_aux  # da.rechunk(vis_aux, chunks=('auto', n_s))
            y = client.persist(y)
            # sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000)

            logger.info("Rechunking completed.. A= {}.".format(A.shape))
            reg = LinearRegression(
                penalty=en,
                C=1.0 / lambduh,
                fit_intercept=False,
                solver="lbfgs",
                max_iter=1000,
                tol=1e-8,
            )
            sky = reg.fit(A, y)
            sky = reg.coef_
            score = reg.score(proj_operator, vis_aux)
            logger.info("Loss function: {}".format(score.compute()))

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        sphere.set_visible_pixels(sky, scale=False)
        return sky.reshape(-1, 1)
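The interesting rechunk in the dask branch is chunks=("auto", n_s): dask picks the row chunking from the configured array.chunk-size while the column axis is kept whole. A standalone hedged sketch of that call (sizes here are arbitrary):

import dask
import dask.array as da

dask.config.set({"array.chunk-size": "16MiB"})
A = da.random.random((20000, 500))
A = da.rechunk(A, chunks=("auto", 500))   # auto row chunks, columns kept whole
print(A.chunks[1])                        # (500,)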
Example #25
    def fft_dask(self,
                 src_fname,
                 src_dset,
                 dst_fname,
                 dst_dset,
                 axis,
                 background_subtraction=True,
                 window=False):
        """Perform an out of core FFT along a given axes using the DASK module.
        Requires the data to be in a .hdf5 file.
        Allows FFT to be performed on large datasets that do not fit into memory.
        Takes the source .hdf5 file name and dataset as well as the destination file name and dataset as inputs.
        """

        if (src_fname == dst_fname):
            print('must write to new .hdf5 file')
            return 1

        # open the hdf5 files
        with hd.File(src_fname, 'r', libver='latest') as s:
            with hd.File(dst_fname, 'w', libver='latest') as d:

                # create a destination dataset
                dshape = s[src_dset].shape
                cshape = s[src_dset].chunks
                d.create_dataset(dst_dset,
                                 dshape,
                                 chunks=cshape,
                                 dtype=complex)

        # CAN WE CLOSE THE FILES HERE AND REOPEN THEM LATER?

        with hd.File(src_fname, 'r', libver='latest') as s:

            # make a dask array from the dset
            data = da.from_array(s[src_dset], s[src_dset].chunks)

            # weld chunks together to span the fft axis
            newcshape = sp.array(cshape)
            newcshape[axis] = dshape[axis]
            newcshape = tuple(newcshape)

            # rechunk dask array in order to perform fft
            data = da.rechunk(data, newcshape)

            # make optional background subtraction
            if (background_subtraction == True):
                background = data[:, :, :, :, 0]
                data = data - background[:, :, :, :, None]

            # make optional windowing before fourier transform
            if (window != False):
                try:
                    w = eval('signal.' + window + '(data.shape[axis])')
                    dim_arr = sp.ones((1, w.ndim), int).ravel()
                    dim_arr[axis] = -1
                    window_reshaped = w.reshape(dim_arr)
                    data = data * window_reshaped
                except:
                    print(
                        'invalid window function, skipping windowing.\nLook up scipy.signal docs'
                    )
                    pass

            # fft and write to destination dataset on disk
            fft_data = da.fft.fft(data, axis=axis)
            fft_data = fft_data.astype('complex64')

            with ProgressBar():
                fft_data.to_hdf5(
                    dst_fname, dst_dset, libver='latest'
                )  #, chunks=cshape, dtype=complex, compression='lzf')
        return 0
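The core requirement here is that da.fft.fft only accepts an axis that lives in a single chunk, hence the rechunk that welds chunks together along the FFT axis. A standalone sketch of just that step (synthetic data, no HDF5 involved):

import dask.array as da

data = da.random.random((8, 8, 1024), chunks=(4, 4, 256))
data = da.rechunk(data, chunks=(4, 4, -1))   # FFT axis must be one chunk
spectrum = da.fft.fft(data, axis=-1)
print(spectrum.chunks)                       # ((4, 4), (4, 4), (1024,))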
Example #26
    def blocked_rank(array):
        chunks = list(array.chunks)
        chunks[axis] = -1
        array = array.rechunk(chunks)
        return dask.array.map_blocks(rank_along_axis, array)
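A self-contained version of the same pattern (illustrative only; rank_along_axis and axis come from the enclosing scope in the original): rechunk so the ranking axis is whole within each block, then rank per block.

import numpy as np
import dask.array as da

axis = 1
x = da.random.random((6, 10), chunks=(3, 5))
chunks = list(x.chunks)
chunks[axis] = -1                 # whole ranking axis inside every block
x = x.rechunk(chunks)
# ordinal 0-based ranks via a double argsort, applied block by block
ranks = x.map_blocks(lambda b: b.argsort(axis=axis).argsort(axis=axis))
print(ranks.shape)                # (6, 10)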
Example #27
def single_window(df,
                  rg,
                  tg,
                  threads=1,
                  max_memory=None,
                  justd=False,
                  extend=False):
    """
    Helper function to compute the correlation between variants from a genotype
    array
    :param df: Merged dataframe mapping of the positions in the genotypes
    :param rg: slice of Genotype array of the reference population
    :param tg: slice of Genotype array of the target population
    :param threads: Number of threads to estimate memory use
    :param max_memory: Memory limit
    :param justd: Return the raw LD matrices instead of their dot product
    :param extend: 'Circularize' the genome by extending both ends
    :return:
    """
    if not df.empty:
        # set Cache to protect memory spilling
        if max_memory is not None:
            available_memory = max_memory
        else:
            available_memory = psutil.virtual_memory().available / 2
        cache = Chest(available_memory=available_memory)
        # Make sure chunks make sense
        chunk_opts = dict(threads=threads, memory=available_memory)
        if not isinstance(rg, np.ndarray):
            rg = rg.rechunk(estimate_chunks(shape=rg.shape, **chunk_opts))
            tg = tg.rechunk(estimate_chunks(shape=tg.shape, **chunk_opts))
        # extend the genotype at both end to avoid edge effects
        if extend:
            # get the indices of the subset genotype array
            nidx = np.arange(rg.shape[1])
            # Split the array in half (approximately)
            idx_a, idx_b = np.array_split(nidx, 2)
            # Get the extended indices
            i = np.concatenate([idx_a[::-1][:-1], nidx, idx_b[::-1][1:]])
            # Re-subset the genotype arrays with the extensions
            rg, tg = rg[:, i], tg[:, i]
            assert rg.shape[1] == tg.shape[1]
            # Compute the correlation as X'X/N
            rho_r = da.dot(rg.T, rg) / rg.shape[0]
            rho_t = da.dot(tg.T, tg) / tg.shape[0]
            # remove the extras
            idx = np.arange(i.shape[0])[idx_a.shape[0] - 1:(nidx.shape[0] +
                                                            idx_b.shape[0])]
            rho_r, rho_t = rho_r[idx, :], rho_t[idx, :]
            rho_r, rho_t = rho_r[:, idx], rho_t[:, idx]
            # Make sure the shape match
            assert rho_r.shape[1] == rho_r.shape[0]
            assert rho_t.shape[1] == rho_t.shape[0]
        else:
            # Just compute the correlations
            rho_r = da.dot(rg.T, rg) / rg.shape[0]
            rho_t = da.dot(tg.T, tg) / tg.shape[0]
        if justd:
            # return the raw LD matrices
            return df.snp, rho_r, rho_t
        gc.collect()
        # compute the cotagging/tagging scores
        cot = da.diag(da.dot(rho_r, rho_t))
        ref = da.diag(da.dot(rho_r, rho_r))
        tar = da.diag(da.dot(rho_t, rho_t))
        stacked = da.stack([df.snp, ref, tar, cot], axis=1)
        c_h_u_n_k_s = estimate_chunks(stacked.shape, threads, max_memory)
        stacked = da.rechunk(stacked, chunks=c_h_u_n_k_s)
        columns = ['snp', 'ref', 'tar', 'cotag']
        return dd.from_dask_array(stacked,
                                  columns=columns).compute(cache=cache)
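A hedged sketch of the final conversion step above: stacking per-SNP dask columns, rechunking, and turning the 2-D array into a dask dataframe with named columns (toy numbers, and a plain chunk tuple instead of estimate_chunks):

import numpy as np
import dask.array as da
import dask.dataframe as dd

snp = da.from_array(np.array([101., 102., 103.]), chunks=3)   # numeric SNP ids for the sketch
ref = da.from_array(np.array([0.1, 0.2, 0.3]), chunks=3)
tar = da.from_array(np.array([0.4, 0.5, 0.6]), chunks=3)
cot = ref * tar
stacked = da.rechunk(da.stack([snp, ref, tar, cot], axis=1), chunks=(3, 4))
print(dd.from_dask_array(stacked, columns=['snp', 'ref', 'tar', 'cotag']).compute())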
Example #28
def dask_safeslice(data, indices, chunks=None):
    """
    COPIED FROM https://github.com/dask/dask/issues/5540#issuecomment-601150129
    Added fancy indexing xarray.core.indexing.DaskIndexingAdapter

    Return a subset of a dask array, but with indexing applied independently to
    each slice of the input array, *prior* to their recombination to produce
    the result array.

    Args:

    * data (dask array):
        input data
    * indices (int or slice or tuple(int or slice)):
        required sub-section of the data.

    Kwargs:

    * chunks (list of (int or "auto")):
        chunking argument for 'rechunk' applied to the input.
        If set, forces the input to be rechunked as specified.
        ( This replaces the normal operation, which is to rechunk the input
        making the indexed dimensions undivided ).
        Mainly for testing on small arrays.

    .. note::

        'indices' currently does not support Ellipsis or newaxis.

    """

    from collections.abc import Iterable
    import dask.array as da

    # The idea is to "push down" the indexing operation to "underneath" the
    # result concatenation, so it gets done _before_ that.
    # This 'result concatenation' is actually implicit: the _implied_
    # concatenation of all the result chunks into a single output array.
    # We assume that any *one* chunk *can* be successfully computed.
    # By applying the indexing operation to each chunk, prior to the
    # complete result (re-)construction, we hope to make this work.

    # Normalise input to a list over all data dimensions.

    # NOTE: FOR NOW, this does not support Ellipsis.
    # TODO: that could easily be fixed.

    # Convert the slicing indices to a list of (int or slice).
    # ( NOTE: not supporting Ellipsis. )
    if not isinstance(indices, Iterable):
        # Convert a single key (slice or integer) to a length-1 list.
        indices = [indices]
    else:
        # Convert other iterable types to lists.
        indices = list(indices)

    n_data_dims = data.ndim
    assert len(indices) <= n_data_dims

    # Extend with ":" in all the additional (trailing) dims.
    all_slice = slice(None)
    indices += (n_data_dims - len(indices)) * [all_slice]

    assert len(indices) == n_data_dims

    # Discriminate indexed and non-indexed dims.
    # An "indexed" dim is where input index is *anything* other than a ":".
    dim_is_indexed = [index != all_slice for index in indices]

    # Work out which indices are simple integer values.
    # ( by definition, all of these will be "indexed" dims )
    dim_is_removed = [isinstance(key, int) for key in indices]

    # Replace single-value indices with length-1 indices, so the indexing
    # preserves all dimensions (as this makes reconstruction easier).
    # ( We use the above 'dim_is_removed' to correct this afterwards. )
    indices = [slice(key, key + 1) if isinstance(key, int) else key for key in indices]

    # We will now rechunk to get "our chunks" : but these must not be divided
    # in dimensions affected by the requested indexing.
    # So we rechunk, but insist that those dimensions are kept whole.
    # ( Obviously, not always optimal ... )
    # As the indexed dimensions will always be _reduced_ by the indexing, this
    # is obviously over-conservative + may give chunks which are rather too
    # small.  Let's just ignore that problem for now!
    if chunks is not None:
        rechunk_dim_specs = list(chunks)
    else:
        rechunk_dim_specs = ["auto"] * n_data_dims
    for i_dim in range(n_data_dims):
        if dim_is_indexed[i_dim]:
            rechunk_dim_specs[i_dim] = -1
    data = da.rechunk(data, chunks=rechunk_dim_specs)

    # Calculate multidimensional indexings of the original data array which
    # correspond to all these chunks.
    # Note: following the "-1"s in the above rechunking spec, the indexed dims
    # should all have only one chunk in them.
    assert all(
        len(data.chunks[i_dim]) == 1
        for i_dim in range(n_data_dims)
        if dim_is_removed[i_dim]
    )

    # Make an array of multidimensional indexes corresponding to all chunks.
    chunks_shape = [len(chunk_lengths) for chunk_lengths in data.chunks]
    chunks_shape += [n_data_dims]
    chunk_indices = np.zeros(chunks_shape, dtype=object)
    # The chunk_indices array ...
    #     * has dimensions of n-data-dims + 1
    #     * has shape of "chunks-shape" + (n_data_dims,)
    #     * each entry[i0, i1, iN-1] --> n_data_dims * slice-objects.

    # Pre-fill indexes array with [:, :, ...]
    chunk_indices[...] = all_slice
    # Set slice ranges for each dimension at a time.
    for i_dim in range(n_data_dims):
        # Fix all keys for this data dimension : chunk_indices[..., i_dim]
        dim_inds = [all_slice] * n_data_dims + [i_dim]
        if dim_is_indexed[i_dim]:
            # This is a user-indexed dim, so should be un-chunked.
            assert len(data.chunks[i_dim]) == 1
            # Set keys for this dim to the user-requested indexing.
            if EMBED_INDEXES:
                chunk_indices[tuple(dim_inds)] = indices[i_dim]
        else:
            # Replace keys for this dim with the slice range for the
            # relevant chunk, for each chunk in the dim.
            startend_positions = np.cumsum([0] + list(data.chunks[i_dim]))
            starts, ends = startend_positions[:-1], startend_positions[1:]
            for i_key, (i_start, i_end) in enumerate(zip(starts, ends)):
                dim_inds[i_dim] = i_key
                chunk_indices[tuple(dim_inds)] = slice(i_start, i_end)
                # E.G. chunk_indices[:, :, 1, :][2] = slice(3,6)

    # Make actual addressed chunks by indexing the original array, arrange them
    # in the same pattern, and re-combine them all to make a result array.
    # This needs to be a list-of-lists construction, as da.block requires it.
    # ( an array of arrays is presumably too confusing ?!? )
    def get_chunks(multidim_indices):
        if multidim_indices.ndim > 1:
            # Convert the "array of chunks" dims --> lists-of-lists
            result = [
                get_chunks(multidim_indices[i_part])
                for i_part in range(multidim_indices.shape[0])
            ]
        else:
            # Innermost dim contains n-dims * slice-objects
            # Convert these into a slice of the data array.
            result = data.__getitem__(tuple(multidim_indices))

            if not EMBED_INDEXES:
                # Now *also* apply the required indexing to this chunk.
                # It initially seemed *essential* that this be an independent
                # operation, so that the memory associated with the whole chunk
                # can be released.
                # But ACTUALLY this is not so, given the next step (see on).
                try:
                    result = result.__getitem__(tuple(indices))
                except NotImplementedError:
                    result = data
                    for axis, subkey in reversed(list(enumerate(tuple(indices)))):
                        result = result[(slice(None),) * axis + (subkey,)]

            # AND FINALLY : apply a numpy copy to this indexed-chunk.
            # This is essential, to release the source chunks ??
            # see: https://github.com/dask/dask/issues/3595#issuecomment-449546228
            result = result.map_blocks(np.copy)

        return result

    listoflists_of_chunks = get_chunks(chunk_indices)
    result = da.block(listoflists_of_chunks)

    assert result.ndim == n_data_dims  # Unchanged as 'da.block' concatenates.

    # Finally remove the extra dimensions for single-value indices.
    assert all(
        result.shape[i_dim] == 1
        for i_dim in range(n_data_dims)
        if dim_is_removed[i_dim]
    )
    all_dim_indices = [
        0 if dim_is_removed[i_dim] else all_slice for i_dim in range(n_data_dims)
    ]
    result = result.__getitem__(tuple(all_dim_indices))
    return result
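A small usage sketch (illustrative, not from the source): EMBED_INDEXES is a module-level flag the function expects, so it is defined here before calling dask_safeslice on a toy array.

import numpy as np
import dask.array as da

EMBED_INDEXES = False
data = da.from_array(np.arange(24).reshape(4, 6), chunks=(2, 3))
sub = dask_safeslice(data, (1, slice(1, 5)))
print(sub.compute())   # [ 7  8  9 10]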
Example #29
def two_point_stats(arr1,
                    arr2,
                    mask=None,
                    periodic_boundary=True,
                    cutoff=None):
    """Calculate the 2-points stats for two arrays

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      mask: array specifying confidence in the measurement at a pixel
        (n_samples,n_x,n_y).  In range [0,1].
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False),
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask =  da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]
    """

    cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2)
    if cutoff is None:
        cutoff = cutoff_
    cutoff = min(cutoff, cutoff_)

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask

        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = lambda x: x / arr1[0].size
        else:
            normalize = lambda x: x / auto_correlation(
                padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
Example #30
def two_point_stats(arr1,
                    arr2,
                    periodic_boundary=True,
                    cutoff=None,
                    mask=None):
    r"""Calculate the 2-points stats for two arrays

    The discretized two point statistics are given by

    .. math::

       f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l']

    where :math:`f[r \; \vert \; l, l']` is the conditional
    probability of finding the local states :math:`l` and :math:`l'` at
    a distance and orientation away from each other defined by the
    vector :math:`r`. `See this paper for more details on the
    notation. <https://doi.org/10.1007/s40192-017-0089-0>`_

    The array ``arr1[i]`` (state :math:`l`) is correlated with
    ``arr2[i]`` (state :math:`l'`) for each sample ``i``. Both arrays
    must have the same number of samples and nominal states (integer
    value) or continuous variables.

    To calculate multiple different correlations for each sample, see
    :func:`~pymks.correlations_multiple`.

    To use ``two_point_stats`` as part of a Scikit-learn pipeline, see
    :class:`~pymks.TwoPointCorrelation`.

    Args:
      arr1: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      arr2: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      periodic_boundary: whether to assume a periodic boundary
        (default is ``True``)
      cutoff: the subarray of the 2 point stats to keep
      mask: array specifying confidence in the measurement at a pixel,
        shape ``(n_samples,n_x,n_y)``. In range [0,1].

    Returns:
      the snipped 2-points stats

    If both arrays are Dask arrays then a Dask array is returned.

    >>> out = two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... )
    >>> out.chunks
    ((2,), (5,))
    >>> out.shape
    (2, 5)

    If either of the arrays are Numpy then a Numpy array is returned.

    >>> two_point_stats(
    ...     np.arange(10).reshape(2, 5),
    ...     np.arange(10).reshape(2, 5),
    ... )
    array([[ 3.,  4.,  6.,  4.,  3.],
           [48., 49., 51., 49., 48.]])

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False)[:, 1:-1, 1:-1],
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask =  da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask=mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """  # noqa: #501

    n_is_even = 1 - np.array(arr1.shape[1:]) % 2
    padding = np.array(arr1.shape[1:]) // 2

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + list(zip(padding, padding + n_is_even)),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask

        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = sequence(
                lambda x: x / arr1[0].size,
                dapad(
                    pad_width=[(0, 0)] + list(zip(0 * n_is_even, n_is_even)),
                    mode="wrap",
                ),
                lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]),
            )
        else:
            normalize = lambda x: x / auto_correlation(
                padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
Example #31
def correlations_multiple(data,
                          correlations,
                          periodic_boundary=True,
                          cutoff=None):
    r"""Calculate 2-point stats for a multiple auto/cross correlation

    The discretized two point statistics are given by

    .. math::

       f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l']

    where :math:`f[r \; \vert \; l, l']` is the conditional
    probability of finding the local states :math:`l` and :math:`l'`
    at a distance and orientation away from each other defined by the
    vector :math:`r`. `See this paper for more details on the
    notation. <https://doi.org/10.1007/s40192-017-0089-0>`_

    The correlations are calculated based on pairs given in
    ``correlations`` for each sample.

    To calculate a single correlation for two arrays, see
    :func:`~pymks.two_point_stats`.

    To use ``correlations_multiple`` as part of a Scikit-learn
    pipeline, see :class:`~pymks.TwoPointCorrelation`.

    Args:
      data: the discretized data with shape ``(n_samples, n_x, n_y, n_state)``
      correlations: the correlation pairs, ``[[i0, j0], [i1, j1], ...]``
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the 2-points stats array

    If ``data`` is a Numpy array then ``correlations_multiple`` will
    return a Numpy array.

    >>> data = np.arange(18).reshape(1, 3, 3, 2)
    >>> out_np = correlations_multiple(data, [[0, 1], [1, 1]])
    >>> out_np.shape
    (1, 3, 3, 2)
    >>> answer = np.array([[[58, 62, 58], [94, 98, 94], [58, 62, 58]]]) + 2. / 3.
    >>> assert np.allclose(out_np[..., 0], answer)

    However, if ``data`` is a Dask array then a Dask array is
    returned.

    >>> data = da.from_array(data, chunks=(1, 3, 3, 2))
    >>> out = correlations_multiple(data, [[0, 1], [1, 1]])
    >>> out.shape
    (1, 3, 3, 2)
    >>> out.chunks
    ((1,), (3,), (3,), (2,))
    >>> assert np.allclose(out[..., 0], answer)

    """

    return pipe(
        range(data.shape[-1]),
        map_(lambda x: (0, x)),
        lambda x: correlations if correlations else x,
        map_(lambda x: two_point_stats(
            data[..., x[0]],
            data[..., x[1]],
            periodic_boundary=periodic_boundary,
            cutoff=cutoff,
        )),
        list,
        lambda x: da.stack(x, axis=-1),
        lambda x: da.rechunk(x, x.chunks[:-1] + (-1, )),
    )