Example No. 1
def pearson_influence(xarr: da.Array, yarr: da.Array) -> da.Array:
    """Calculating the influence for deleting a point on the pearson correlation"""

    if xarr.shape != yarr.shape:
        raise ValueError(
            f"The shape of xarr and yarr should be same, got {xarr.shape}, {yarr.shape}"
        )

    # Efficiently compute the influence of removing each single element on the correlation
    n = xarr.shape[0]

    x2, y2 = da.square(xarr), da.square(yarr)
    xy = xarr * yarr

    # The influence is vectorized over xarr and yarr, so each sum is repeated n times

    xsum = da.ones(n) * da.sum(xarr)
    ysum = da.ones(n) * da.sum(yarr)
    xysum = da.ones(n) * da.sum(xy)
    x2sum = da.ones(n) * da.sum(x2)
    y2sum = da.ones(n) * da.sum(y2)

    # Note: both the numerator and the denominator are scaled by (n - 1)
    # (each variance term under the sqrt by (n - 1)) to avoid divisions.
    numerator = (n - 1) * (xysum - xy) - (xsum - xarr) * (ysum - yarr)

    varx = (n - 1) * (x2sum - x2) - da.square(xsum - xarr)
    vary = (n - 1) * (y2sum - y2) - da.square(ysum - yarr)
    denominator = da.sqrt(varx * vary)

    # itruediv (operator.itruediv) divides each numerator block by the matching
    # denominator block in place
    return da.map_blocks(itruediv,
                         numerator,
                         denominator,
                         dtype=numerator.dtype)
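A minimal usage sketch for the function above, assuming `import dask.array as da` and `from operator import itruediv` at module level (as the excerpt implies); the data here is purely illustrative:

from operator import itruediv
import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
x = da.from_array(rng.normal(size=1000), chunks=250)
y = da.from_array(rng.normal(size=1000), chunks=250)

# One influence value per candidate point to delete; evaluation stays lazy until compute()
influence = pearson_influence(x, y)
print(influence.compute()[:5])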
Example No. 2
def average(a, weights, **kwargs):
    """
    Compute the weighted average.
    """
    avg = da.sum(a * weights, **kwargs)
    tot = da.sum(weights, **kwargs)
    res = avg / tot
    return res
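A hedged usage sketch: with uniform weights the result reduces to an ordinary mean along the chosen axis.

import dask.array as da

a = da.arange(12, chunks=4).reshape(3, 4)
w = da.ones_like(a)
# Column means, since the weights are uniform: [4. 5. 6. 7.]
print(average(a, w, axis=0).compute())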
Example No. 3
    def _joint_log_likelihood(self, X):
        jll = []
        for i in range(np.size(self.classes_)):
            jointi = da.log(self.class_prior_[i])
            n_ij = -0.5 * da.sum(da.log(2.0 * np.pi * self.sigma_[i, :]))
            n_ij -= 0.5 * da.sum(
                ((X - self.theta_[i, :])**2) / (self.sigma_[i, :]), 1)
            jll.append(jointi + n_ij)

        joint_log_likelihood = da.stack(jll).T
        return joint_log_likelihood
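A small sanity-check sketch (standalone, not taken from the source): the per-class term computed above equals the sum of independent Gaussian log-densities plus the log prior, where `sigma_` holds per-feature variances.

import numpy as np
from scipy.stats import norm

x = np.array([0.5, -1.0])
theta, sigma, prior = np.array([0.0, 0.0]), np.array([1.0, 2.0]), 0.3
manual = (np.log(prior)
          - 0.5 * np.sum(np.log(2.0 * np.pi * sigma))
          - 0.5 * np.sum((x - theta) ** 2 / sigma))
reference = np.log(prior) + norm(theta, np.sqrt(sigma)).logpdf(x).sum()
assert np.isclose(manual, reference)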
Example No. 4
def dask_getJtJdiag(self, m, W=None):
    """
        Return the diagonal of JtJ
    """
    if self.gtgdiag is None:

        # Need to check if multiplying weights makes sense
        if W is None:
            self.gtgdiag = da.sum(self.getJ(m)**2, axis=0).compute()
        else:
            w = da.from_array(W.diagonal())[:, None]
            self.gtgdiag = da.sum((w * self.getJ(m))**2, axis=0).compute()

    return self.gtgdiag
Example No. 5
    def get_snp_mask(self):
        # keep sites that pass the snp prevalence threshold
        # Together with self.general_mask, this filter should produce exactly the
        # same sites as parse_midas_data.parse_snps

        # Need to use the alt and depth arr again
        alt_arr = da.from_zarr('{}/full_alt.zarr'.format(self.data_dir))
        depth_arr = da.from_zarr('{}/full_depth.zarr'.format(self.data_dir))
        # increase chunk size to reduce overhead
        rechunked_alt_arr = alt_arr.rechunk((1000000, 10))
        rechunked_depth_arr = depth_arr.rechunk((1000000, 10))
        filtered_depth = rechunked_depth_arr[:, self.sample_mask]
        filtered_alt = rechunked_alt_arr[:, self.sample_mask]

        # Some snps need to be polarized according to pop_freqs
        from plos_bio_scripts import calculate_snp_prevalences
        population_freqs = calculate_snp_prevalences.parse_population_freqs(
            self.species_name, polarize_by_consensus=False)
        # Under Python 3, use a list comprehension (np.array over a map object
        # would produce a 0-d object array)
        all_pop_freqs = np.array(
            [population_freqs.get(x, 0)
             for x in zip(self.chromosomes, self.locations)])
        sites_to_flip = all_pop_freqs > 0.5

        # handle the sites that do not need to be polarized
        round_1_mask = self.general_mask & np.invert(sites_to_flip)
        alt_threshold = da.ceil(
            filtered_depth[round_1_mask, :] * config.parse_snps_min_freq) + 0.5
        passed_snp_mask1 = da.sum(
            filtered_alt[round_1_mask, :] > alt_threshold, axis=1) > 0

        # then flip alt
        round_2_mask = self.general_mask & sites_to_flip
        alt_threshold2 = da.ceil(
            filtered_depth[round_2_mask, :] * config.parse_snps_min_freq) + 0.5
        polarized_alts = filtered_depth[round_2_mask, :] - filtered_alt[
            round_2_mask, :]
        passed_snp_mask2 = da.sum(polarized_alts > alt_threshold2, axis=1) > 0

        # perform dask computation
        passed_snp_mask1 = passed_snp_mask1.compute()
        passed_snp_mask2 = passed_snp_mask2.compute()
        final_mask = self.general_mask.copy()
        final_mask[self.general_mask
                   & np.invert(sites_to_flip)] = passed_snp_mask1
        final_mask[self.general_mask & sites_to_flip] = passed_snp_mask2
        print("%d sites left after applying snp filter" % np.sum(final_mask))

        return final_mask[self.general_mask]
Example No. 6
        def load_data(statistic, axis):
            import dask.array as da
            import numpy as np
            from glue.utils import view_shape
            x = da.from_zarr('/mnt/cephfs/zarr_data_full')
            f = 1500
            scale = 2

            lh = []
            for k in range(scale):
                lc = []
                for i in range(scale):
                    lr = []
                    for j in range(scale):
                        lr.append(x[f % 3500])
                        f = f + 1
                    lc.append(da.concatenate(lr))
                lh.append(da.concatenate(lc, 1))
            z = da.concatenate(lh, 2)

            if statistic == 'minimum':
                return da.min(z, axis).compute()
            elif statistic == 'maximum':
                return da.max(z, axis).compute()
            elif statistic == 'mean' or statistic == 'median':
                return da.mean(z, axis).compute()
            elif statistic == 'percentile':
                # `percentile` is taken from the enclosing scope in the original source
                return percentile / 100
            elif statistic == 'sum':
                return da.sum(z, axis).compute()
            return 0
Example No. 7
def test_make_regression(n_samples, n_features, n_informative, n_targets, bias,
                         effective_rank, tail_strength, noise, shuffle, coef,
                         random_state, n_parts, cluster):
    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_regression

        result = make_regression(n_samples=n_samples,
                                 n_features=n_features,
                                 n_informative=n_informative,
                                 n_targets=n_targets,
                                 bias=bias,
                                 effective_rank=effective_rank,
                                 noise=noise,
                                 shuffle=shuffle,
                                 coef=coef,
                                 random_state=random_state,
                                 n_parts=n_parts)

        if coef:
            out, values, coefs = result
        else:
            out, values = result

        assert out.shape == (n_samples, n_features), "out shape mismatch"

        if n_targets > 1:
            assert values.shape == (n_samples, n_targets), \
                   "values shape mismatch"
        else:
            assert values.shape == (n_samples, ), "values shape mismatch"

        assert len(out.chunks[0]) == n_parts
        assert len(out.chunks[1]) == 1

        if coef:
            if n_targets > 1:
                assert coefs.shape == (n_features, n_targets), \
                       "coefs shape mismatch"
                assert len(coefs.chunks[1]) == 1
            else:
                assert coefs.shape == (n_features, ), "coefs shape mismatch"
                assert len(coefs.chunks[0]) == 1

            test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)

            std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

            test1, std_test2 = da.compute(test1, std_test2)

            diff = cp.abs(1.0 - std_test2)
            test2 = cp.all(diff < 1.5 * 10**(-1.))

            assert test1, \
                "Unexpected number of informative features"

            assert test2, "Unexpectedly incongruent outputs"

    finally:
        c.close()
Example No. 8
def _response(x_data, n_space, n_state):
    return pipe(
        np.linspace(0, 1, n_state),
        lambda h: da.maximum(1 - abs(x_data[:, :, None] - h) /
                             (h[1] - h[0]), 0), dafft(axis=1),
        lambda fx: da.sum(_fcoeff(n_space, n_state)[None] * fx, axis=-1),
        daifft(axis=1)).real
Example No. 9
    def test_get_bounding_corners_dask(self):
        """Test finding surrounding bounding corners."""
        import dask.array as da
        from pyresample.bilinear.xarr import (_get_input_xy_dask,
                                              _get_bounding_corners_dask)
        from pyresample._spatial_mp import Proj
        from pyresample import CHUNK_SIZE

        proj = Proj(self.target_def.proj_str)
        out_x, out_y = self.target_def.get_proj_coords(chunks=CHUNK_SIZE)
        out_x = da.ravel(out_x)
        out_y = da.ravel(out_y)
        in_x, in_y = _get_input_xy_dask(self.source_def, proj,
                                        da.from_array(self.valid_input_index),
                                        da.from_array(self.index_array))
        pt_1, pt_2, pt_3, pt_4, ia_ = _get_bounding_corners_dask(
            in_x, in_y, out_x, out_y, self.neighbours,
            da.from_array(self.index_array))

        self.assertTrue(pt_1.shape == pt_2.shape == pt_3.shape == pt_4.shape ==
                        (self.target_def.size, 2))
        self.assertTrue(ia_.shape == (self.target_def.size, 4))

        # Check which of the locations has four valid X/Y pairs by
        # finding where there are non-NaN values
        res = da.sum(pt_1 + pt_2 + pt_3 + pt_4, axis=1).compute()
        self.assertEqual(np.sum(~np.isnan(res)), 10)
Example No. 10
    def calc_dispersion(self,
                        src,
                        dst,
                        axis=2,
                        window=False,
                        save_frequencies=False):
        # t to f
        self.fft_dask(src, 'mag', 'f_mag.hdf5', 'fft_1', -1, window)
        # x to k
        self.fft_dask('f_mag.hdf5', 'fft_1', 'temp2.hdf5', 'fft_2', axis,
                      window)

        with hd.File('temp2.hdf5', 'r', libver='latest') as temp:
            disp_arr = da.from_array(temp['fft_2'],
                                     chunks=temp['fft_2'].chunks)
            dispersion = da.sum(
                sp.absolute(disp_arr),
                axis=tuple([a for a in range(5) if a not in (axis, 4)]))
            with hd.File(dst, 'w', libver='latest') as d:
                pass

            dispersion.to_hdf5(dst, 'disp')

        # delete the intermediate files from disk
        if save_frequencies:
            os.remove('temp2.hdf5')
        else:
            os.remove('temp1.hdf5')
            os.remove('temp2.hdf5')

        return 0
Example No. 11
def compute(fieldset):
    # Calculating vertical weighted average
    for f in [fieldset.U, fieldset.V]:
        for tind in f.loaded_time_indices:
            data = da.sum(f.data[tind, :] * DZ, axis=0) / sum(dz)
            data = da.broadcast_to(data, (1, f.grid.zdim, f.grid.ydim, f.grid.xdim))
            f.data = f.data_concatenate(f.data, data, tind)
Example No. 12
def compute_adjoint_dask(rays, g, dobs, i0, K_ne, m_tci, m_prior, CdCt,
                         sigma_m, Nkernel, size_cell):
    L_m = Nkernel * size_cell
    #     #i not eq i0 mask
    #     mask = np.ones(rays.shape[0],dtype=np.bool)
    #     mask[i0] = False
    #     rays = rays[mask,:,:,:,:]
    #     g = g[mask,:,:]
    #     dobs = dobs[mask,:,:]
    #     CdCt = CdCt[mask,:,:]
    #residuals
    #g.shape, dobs.shape [Na,Nt,Nd]
    dd = g - dobs
    #weighted residuals
    #Cd.shape [Na,Nt,Nd] i.e. diagonal
    #CdCt^-1 = 1./CdCt
    dd /= (CdCt + 1e-15)
    #get ray info
    Na, Nt, Nd, _, Ns = rays.shape
    #parallelize over directions
    gradient = da.sum(da.stack([
        da.from_delayed(delayed(do_adjoint)(
            rays[:, :, d, :, :], dd[:, :, d], K_ne, m_tci, sigma_m, Nkernel,
            size_cell, i0), (m_tci.nx, m_tci.ny, m_tci.nz),
                        dtype=np.double) for d in range(Nd)
    ],
                               axis=-1),
                      axis=-1)
    gradient = gradient.compute(get=get)
    gradient += m_tci.M
    gradient -= m_prior

    return gradient
Example No. 13
def predict(fine_image_t0, coarse_image_t0, coarse_image_t1, shape=None):
    spec = spectral_distance(fine_image_t0, coarse_image_t0)
    spec_diff = spec[0]
    spec_dist = spec[1]
    temp = temporal_distance(coarse_image_t0, coarse_image_t1)
    temp_diff = temp[0]
    temp_dist = temp[1]
    spat_dist = spatial_distance(fine_image_t0)
    print("spec_dist.shape: {} temp_dist.shape: {} spat_dist.shape: {}".format(
        spec_dist.shape, temp_dist.shape, spat_dist.shape))
    comb_dist = comb_distance(spec_dist, temp_dist, spat_dist)
    similar_pixels = filtering(fine_image_t0, spec_dist, temp_dist, spec_diff,
                               temp_diff)
    weights = weighting(spec_dist, temp_dist, comb_dist, similar_pixels)
    pred_refl = fine_image_t0 + temp_diff
    weighted_pred_refl = da.sum(pred_refl * weights, axis=1)

    if shape is None:
        prediction = weighted_pred_refl
    else:
        prediction = da.reshape(weighted_pred_refl, shape)

    print("Done prediction!")

    return prediction
Example No. 14
def wavg_full(data, flags, weights, axis=0, threshold=0.8):
    """Perform weighted average of data, flags and weights, applying flags, over axis.

    Parameters
    ----------
    data       : array of complex
    flags      : array of uint8 or boolean
    weights    : array of floats
    axis       : int
    threshold  : float

    Returns
    -------
    av_data    : weighted average of data
    av_flags   : flags set where the flagged fraction along `axis` reaches `threshold`
    av_weights : weighted average of weights
    """
    weighted_data, flagged_weights = weight_data(data, flags, weights)

    av_data, av_weights = _wavg_axis(weighted_data, flagged_weights, axis)
    # Update flags to include all invalid data, ie vis = 0j and weights > 1e15
    updated_flags = flagged_weights == 0
    n_flags = da.sum(updated_flags, axis)

    av_flags = n_flags >= flags.shape[axis] * threshold
    return av_data, av_flags, av_weights
Example No. 15
def logsumexp(arr, axis=0):
    """Computes the sum of arr assuming arr is in the log domain.
    Returns log(sum(exp(arr))) while minimizing the possibility of
    over/underflow.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.extmath import logsumexp
    >>> a = np.arange(10)
    >>> np.log(np.sum(np.exp(a)))
    9.4586297444267107
    >>> logsumexp(a)
    9.4586297444267107
    """
    if axis == 0:
        pass
    elif axis == 1:
        arr = arr.T
    else:
        raise NotImplementedError
    # Use the max to normalize; subtracting it before exponentiating
    # accumulates the least floating-point error
    vmax = arr.max(axis=0)
    out = da.log(da.sum(da.exp(arr - vmax), axis=0))
    out += vmax
    return out
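A hedged check of the function above against SciPy's reference implementation on a small dask array:

import numpy as np
import dask.array as da
from scipy.special import logsumexp as scipy_logsumexp

arr = da.from_array(np.random.randn(6, 4), chunks=(3, 4))
ours = logsumexp(arr, axis=0).compute()
ref = scipy_logsumexp(np.asarray(arr), axis=0)   # np.asarray() materializes the dask array
print(np.allclose(ours, ref))                    # True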
Example No. 16
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)

    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
Example No. 18
def compute_gradient_dask(rays,
                          g,
                          dobs,
                          i0,
                          K_ne,
                          m_tci,
                          m_prior,
                          CdCt,
                          sigma_m,
                          Nkernel,
                          size_cell,
                          cov_obj=None):
    L_m = Nkernel * size_cell
    #     #i not eq i0 mask
    #     mask = np.ones(rays.shape[0],dtype=np.bool)
    #     mask[i0] = False
    #     rays = rays[mask,:,:,:,:]
    #     g = g[mask,:,:]
    #     dobs = dobs[mask,:,:]
    #     CdCt = CdCt[mask,:,:]
    #residuals
    #g.shape, dobs.shape [Na,Nt,Nd]
    dd = g - dobs
    #weighted residuals
    #Cd.shape [Na,Nt,Nd] i.e. diagonal
    #CdCt^-1 = 1./CdCt
    dd /= (CdCt + 1e-15)
    #get ray info
    Na, Nt, Nd, _, Ns = rays.shape
    #     if Na < Nd:
    #         #parallelize over antennas
    #         gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(rays[i,:,:,:,:], dd[i,:,:], K_ne, m_tci,
    #                                          sigma_m, Nkernel, size_cell),(m_tci.nx,m_tci.ny,m_tci.nz),dtype=np.double) for i in range(Na)],axis=-1),axis=-1)
    #     else:
    #         #parallelize over directions
    #         gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(rays[:,:,d,:,:], dd[:,:,d], K_ne, m_tci,
    #                                           sigma_m, Nkernel, size_cell),(m_tci.nx,m_tci.ny,m_tci.nz),dtype=np.double) for d in range(Nd)],axis=-1),axis=-1)
    #parallelize over directions
    ne_tci = m_tci.copy()
    np.exp(ne_tci.M, out=ne_tci.M)
    ne_tci.M *= K_ne / TECU
    gradient = da.sum(da.stack([
        da.from_delayed(delayed(do_gradient)(
            rays[:, :, d, :, :], dd[:, :, d], ne_tci, sigma_m, Nkernel,
            size_cell, i0), (m_tci.nx, m_tci.ny, m_tci.nz),
                        dtype=np.double) for d in range(Nd)
    ],
                               axis=-1),
                      axis=-1)
    gradient = gradient.compute(get=get)
    gradient -= gradient[i0, ...]
    if cov_obj is not None:
        dm = m_tci.M - m_prior
        gradient += cov_obj.contract(dm)
    #gradient += m_tci.M
    #gradient -= m_prior

    return gradient
Example No. 19
    def grid_glm_data(self, flashes):
        """
        Aggregate the point flashes into a grid of flash counts occurring within each grid box.

        Args:
            flashes (:class:`pandas.DataFrame`): Contains the longitudes and latitudes of each flash

        Returns:
            :class:`numpy.ndarray` [y, x]: The number of flashes occurring at each grid point.
        """
        flash_x, flash_y = self.glm_proj(flashes["flash_lon"].values,
                                         flashes["flash_lat"].values)
        flash_x /= 1000
        flash_y /= 1000
        valid_flashes = np.where(
            (flash_x >= self.x_points.min() - self.dx_km / 2)
            & (flash_x <= self.x_points.max() + self.dx_km / 2)
            & (flash_y >= self.y_points.min() - self.dx_km / 2)
            & (flash_y <= self.y_points.max() + self.dx_km / 2))[0]
        if valid_flashes.size > 0:
            if PARALLEL:
                x_grid_flat = da.from_array(self.x_grid.reshape(
                    (self.x_grid.size, 1)),
                                            chunks=512)
                y_grid_flat = da.from_array(self.y_grid.reshape(
                    (self.x_grid.size, 1)),
                                            chunks=512)
                flash_x_flat = da.from_array(flash_x[valid_flashes].reshape(
                    1, valid_flashes.size),
                                             chunks=512)
                flash_y_flat = da.from_array(flash_y[valid_flashes].reshape(
                    1, valid_flashes.size),
                                             chunks=512)
                x_dist = da.fabs(x_grid_flat - flash_x_flat)
                y_dist = da.fabs(y_grid_flat - flash_y_flat)
                flash_grid_counts = da.sum(
                    (x_dist <= self.dx_km / 2) & (y_dist <= self.dx_km / 2),
                    axis=1)
                flash_grid = flash_grid_counts.reshape(
                    self.lon_grid.shape).astype(np.int32).compute()
            else:
                x_grid_flat = self.x_grid.reshape((self.x_grid.size, 1))
                y_grid_flat = self.y_grid.reshape((self.x_grid.size, 1))
                flash_x_flat = flash_x[valid_flashes].reshape(
                    1, valid_flashes.size)
                flash_y_flat = flash_y[valid_flashes].reshape(
                    1, valid_flashes.size)
                x_dist = np.abs(x_grid_flat - flash_x_flat)
                y_dist = np.abs(y_grid_flat - flash_y_flat)
                flash_grid_counts = np.sum(
                    (x_dist <= self.dx_km / 2) & (y_dist <= self.dx_km / 2),
                    axis=1)
                flash_grid = flash_grid_counts.reshape(
                    self.lon_grid.shape).astype(np.int32)
        else:
            flash_grid = np.zeros(self.lon_grid.shape, dtype=np.int32)
        return flash_grid
Example No. 20
def _run_dummy_task_on_dask(*, client):
    """
    Run a small task on the Dask client. Starting from v2021.7.0, Dask Distributed does not
    always close HDF5 files that are open in read-only mode for loading raw data. Submitting
    and computing a small unrelated task seems to prompt the client to release the resources
    from the previous task and close the files.
    """
    rfut = da.sum(da.random.random((1000, ),
                                   chunks=(10, ))).persist(scheduler=client)
    rfut.compute(scheduler=client)
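A minimal calling sketch, assuming a local in-process cluster (the cluster setup below is illustrative, not part of the source):

from dask.distributed import Client, LocalCluster

with LocalCluster(n_workers=1, processes=False) as cluster, Client(cluster) as client:
    _run_dummy_task_on_dask(client=client)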
Example No. 21
def pis_mVc(x, y, beta):
    '''
    Rewrite of mVc and mMSE; both share the same 'p' and 'dif'.
    '''
    p = logistic_func(beta, x)
    dif = da.absolute(y - p)
    xnorm = da.linalg.norm(x, axis=1)
    pis = dif * xnorm
    pi = pis / da.sum(pis)
    return pi
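A hedged usage sketch. `logistic_func` is defined elsewhere in the source; here it is assumed to return the logistic probability sigmoid(x @ beta):

import numpy as np
import dask.array as da

def logistic_func(beta, x):
    # Assumed form, for illustration only
    return 1.0 / (1.0 + da.exp(-x.dot(beta)))

x = da.from_array(np.random.randn(1000, 5), chunks=(250, 5))
y = da.from_array(np.random.randint(0, 2, 1000), chunks=250)
beta = np.zeros(5)
pi = pis_mVc(x, y, beta)       # subsampling probabilities
print(pi.sum().compute())      # ~1.0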
Example No. 22
def compute_spectrum(samples, freq, N=FFT_SIZE):
    spec = da.sum(np.abs(
        da.fft.fftshift(da.fft.fft(samples.reshape((-1, N))), axes=1)**2),
                  axis=0).compute()
    return xr.DataArray(spec,
                        dims='freq',
                        coords={
                            'freq':
                            freq +
                            np.fft.fftshift(np.fft.fftfreq(N, 1 / SAMPRATE))
                        })
Example No. 23
def compose_position_fields(fields,
                            spacing,
                            output,
                            blocksize=[
                                256,
                            ] * 3,
                            displacement=None):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(fields[0].shape[:-1]) /
                             blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays
        fields_da = da.stack(
            [da.from_array(f, chunks=blocksize + [
                3,
            ]) for f in fields])

        # accumulate
        composed = da.sum(fields_da, axis=0)

        # modify for multiple position fields
        if displacement is not None:
            raise NotImplementedError(
                "composing displacement fields not implemented yet")
        else:
            grid = position_grid_dask(composed.shape[:3],
                                      blocksize) * spacing.astype(np.float32)
            composed = composed - (len(fields) - 1) * grid

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output,
            'w',
            shape=composed.shape,
            chunks=composed.chunksize,
            dtype=composed.dtype,
            compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
Example No. 24
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5, ))
    y = da.sum(x + 1 + 2 + 3)

    a = y._optimize(y.dask, y._keys())

    with dask.set_options(fuse_ave_width=0):
        b = y._optimize(y.dask, y._keys())

    assert dask.get(a, y._keys()) == dask.get(b, y._keys())
    assert len(a) < len(b)
Example No. 25
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5, ))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.config.set({"optimization.fuse.ave-width": 0}):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
Example No. 26
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.config.set(fuse_ave_width=0):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
Example No. 27
def compute_power(samples, sample_time, bandwidth, avg_win=PWR_AVERAGE):
    samples_filt = filter_signal(samples, bandwidth)
    pwr = da.sum((np.abs(samples_filt[:samples_filt.size // avg_win *
                                      avg_win])**2).reshape((-1, avg_win)),
                 axis=1).compute()
    return xr.DataArray(pwr,
                        dims='time',
                        coords={
                            'time':
                            sample_time[:samples_filt.size // avg_win *
                                        avg_win][::avg_win].compute()
                        })
Example No. 28
def weighting(spec_dist, temp_dist, comb_dist, similar_pixels_filtered):
    # Assign max weight (1) when the temporal or spectral distance is zero
    zero_spec_dist = da.where(spec_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_temp_dist = da.where(temp_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_dist_mid = da.where((zero_spec_dist == 1),
                             zero_spec_dist, zero_temp_dist)
    shape = da.subtract(spec_dist.shape, (0, 1))
    zero_dist = da.zeros(shape, chunks=(spec_dist.shape[0], shape[1]))
    zero_dist = da.insert(zero_dist, [mid_idx], zero_dist_mid, axis=1)
    weights = da.where((da.sum(zero_dist, 1)[:, None] == 1), zero_dist, comb_dist)

    # Calculate weights only for the filtered spectrally similar pixels
    weights_filt = weights * similar_pixels_filtered

    # Normalize weights
    norm_weights = da.rechunk(weights_filt / (da.sum(weights_filt, 1)[:, None]),
                              chunks=spec_dist.chunksize)

    print("Done weighting!", norm_weights)

    return norm_weights
Example No. 29
def test_dask_yarn():
    try:
        from dask_yarn import YarnCluster
    except ImportError:
        return

    # Validate dask_yarn configuration
    cluster = YarnCluster()
    client = Client(cluster)

    cluster.scale(4)
    x = da.sum(np.ones(5))
    x.compute()
Example No. 30
    def agreement(self, estimators):
        """
        Implementation of Query By Committee strategy, variant: Vote entropy.

        The vote entropy approach is used for measuring the level of disagreement.

        I. Dagan and S. Engelson. Committee-based sampling for training probabilistic
        classifiers. In Proceedings of the International Conference on Machine
        Learning (ICML), pages 150–157. Morgan Kaufmann, 1995.

        :param estimators:
        :return:
        """
        score = []
        input_shape, committee_size = QueryByCommitteeStategy.check_committee_results(
            estimators)
        if len(input_shape) == 2:
            ele_uni = da.unique(estimators).compute()
            if not (len(ele_uni) == 2 and 0 in ele_uni and 1 in ele_uni):
                raise ValueError(
                    "The predicted label matrix must only contain 0 and 1")

            # calc each instance
            for i in range(input_shape[0]):
                instance_mat = da.from_array(
                    np.array([X[i, :] for X in estimators
                              if X is not None])).compute()
                voting = da.sum(instance_mat, axis=0)

                tmp = []
                for vote in voting:
                    if vote != 0:
                        tmp.append(
                            delayed(vote / len(estimators) *
                                    np.log(vote / len(estimators))))
                score.append(-delayed(sum)(tmp))
        else:
            input_mat = da.from_array(
                np.array([X for X in estimators if X is not None])).compute()
            # for each instance
            for i in range(input_shape[0]):
                count_dict = collections.Counter(input_mat[:, i])
                tmp = []
                for key in count_dict:
                    tmp.append(
                        delayed(count_dict[key] / committee_size *
                                np.log(count_dict[key] / committee_size)))
                score.append(-delayed(sum)(tmp))

        return compute(score)[0]
Example No. 31
def nearest_neighbour(test_images, train_images, train_labels, k=1):
    pred = np.zeros(test_images.shape[0])
    for i in range(test_images.shape[0]):
        test_image = test_images[i, :]
        nn = da.sum(np.abs(train_images - test_image), axis=1, keepdims=True)
        if k == 1:
            nn = da.argmin(nn, axis=0)
            pred[i] = train_labels[nn]
        else:
            nn = np.array(nn)
            min_idx = np.argsort(nn, 0)[:k]
            labels = np.array([train_labels[i] for i in min_idx])
            labels = np.reshape(labels, [-1])
            lab = Counter(labels).most_common()[0][0]
            pred[i] = lab
    return pred
Example No. 32
def calibrate_posterior_predictive(post_pred, qc):
    """ Function to calibrate posterior predictive.

    This allows the calibrated model to make predictions. This function is required to compute
    mean and log likelihood of the calibrated model.

    Args:
        post_pred: posterior predictive of shape (num samples, num X values)
        qc: calibration object as defined in class QuantileCalibration

    Returns:
        calibrated posterior predictive of shape (num samples, num X values)
    """

    # Need to convert from jax array to dask array to avoid
    # out of memory error (on a 32GB machine for 8000 samples) in the next step.
    # This also helps to parallelize the task to all cpu cores.
    post_pred_shape = post_pred.shape
    res_main_post_pred = da.from_array(
        np.array(post_pred),
        chunks=(
            1000,  # reduce this value if out of memory!
            np.ceil(post_pred_shape[1] / dask.system.cpu_count()),
        ),
    )
    # expand to 3D: axis 0: num observations; axis 1: num samples; axis 2: num samples
    uncalibrated_pp_quantiles = (
        da.sum(res_main_post_pred.T[:, :, np.newaxis] <=
               res_main_post_pred.T[:, np.newaxis, :],
               axis=1).T / post_pred_shape[0])

    # calculate inverse R
    inverse_calibrated_pp_quantiles = da.apply_along_axis(
        qc.inverse_transform, 0, uncalibrated_pp_quantiles)

    # inverse CDF by looking up existing samples with np.quantile()
    da_combined = da.vstack(
        [res_main_post_pred,
         inverse_calibrated_pp_quantiles.compute()])
    calibrated_post_pred = da.apply_along_axis(
        lambda q: np.quantile(
            q[:post_pred_shape[0]], q[post_pred_shape[0]:], axis=0),
        0,
        da_combined,
    ).compute()

    return calibrated_post_pred
Example No. 33
def _sum_of_squares(a, axis=0):
    """
    Squares each element of the input array, and returns the sum(s) of that.
    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.
    Returns
    -------
    sum_of_squares : ndarray
        The sum along the given axis for (a**2).
    See also
    --------
    _square_of_sums : The square(s) of the sum(s) (the opposite of
    `_sum_of_squares`).
    """
    return da.sum(a * a, axis)
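A minimal check of the helper above: the sum of squares of [1, 2, 3] is 14.

import numpy as np
import dask.array as da

a = da.from_array(np.array([1.0, 2.0, 3.0]), chunks=2)
print(_sum_of_squares(a).compute())   # 14.0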
Example No. 34
def _square_of_sums(a, axis=0):
    """
    Sums elements of the input array, and returns the square(s) of that sum.
    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.
    Returns
    -------
    square_of_sums : float or ndarray
        The square of the sum over `axis`.
    See also
    --------
    _sum_of_squares : The sum of squares (the opposite of `square_of_sums`).
    """
    s = da.sum(a, axis)
    return s * s
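And the companion check: the square of the sum of [1, 2, 3] is 36.

import numpy as np
import dask.array as da

a = da.from_array(np.array([1.0, 2.0, 3.0]), chunks=2)
print(_square_of_sums(a).compute())   # 36.0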
Example No. 35
    def get_bil_info(self):
        """Return neighbour info.

        Returns
        -------
        t__ : numpy array
            Vertical fractional distances from corner to the new points
        s__ : numpy array
            Horizontal fractional distances from corner to the new points
        input_idxs : numpy array
            Valid indices in the input data
        idx_arr : numpy array
            Mapping array from valid source points to target points

        """
        if self.source_geo_def.size < self.neighbours:
            warnings.warn('Searching for %s neighbours in %s data points' %
                          (self.neighbours, self.source_geo_def.size))

        # Create kd-tree
        valid_input_idx, resample_kdtree = self._create_resample_kdtree()
        # This is a numpy array
        self.valid_input_index = valid_input_idx

        if resample_kdtree.n == 0:
            # Handle if all input data is reduced away
            bilinear_t, bilinear_s, valid_input_index, index_array = \
                _create_empty_bil_info(self.source_geo_def,
                                       self.target_geo_def)
            self.bilinear_t = bilinear_t
            self.bilinear_s = bilinear_s
            self.valid_input_index = valid_input_idx
            self.index_array = index_array

            return bilinear_t, bilinear_s, valid_input_index, index_array

        target_lons, target_lats = self.target_geo_def.get_lonlats()
        valid_output_idx = ((target_lons >= -180) & (target_lons <= 180) &
                            (target_lats <= 90) & (target_lats >= -90))

        index_array, distance_array = self._query_resample_kdtree(
            resample_kdtree, target_lons, target_lats, valid_output_idx)

        # Reduce index reference
        input_size = da.sum(self.valid_input_index)
        index_mask = index_array == input_size
        index_array = da.where(index_mask, 0, index_array)

        # Get output projection as pyproj object
        proj = Proj(self.target_geo_def.proj_str)

        # Get output x/y coordinates
        out_x, out_y = _get_output_xy_dask(self.target_geo_def, proj)

        # Get input x/y coordinates
        in_x, in_y = _get_input_xy_dask(self.source_geo_def, proj,
                                        self.valid_input_index, index_array)

        # Get the four closest corner points around each output location
        pt_1, pt_2, pt_3, pt_4, index_array = \
            _get_bounding_corners_dask(in_x, in_y, out_x, out_y,
                                       self.neighbours, index_array)

        # Calculate vertical and horizontal fractional distances t and s
        t__, s__ = _get_ts_dask(pt_1, pt_2, pt_3, pt_4, out_x, out_y)
        self.bilinear_t, self.bilinear_s = t__, s__

        self.valid_output_index = valid_output_idx
        self.index_array = index_array
        self.distance_array = distance_array

        return (self.bilinear_t, self.bilinear_s, self.valid_input_index,
                self.index_array)