def weight_block(block, blocksize, block_info=None):
    """Multiply a block by a linear-ramp weight profile over its overlap regions."""
    # compute fixed overlap size
    overlaps = np.array([int(round(x / 8)) for x in blocksize])

    # determine which faces need linear weighting
    core_shape = []
    pads = []
    block_index = block_info[0]['chunk-location']
    block_grid = block_info[0]['num-chunks']
    for i in range(3):
        p, bl = overlaps[i], blocksize[i]
        bi, bg = block_index[i], block_grid[i]
        pad, core = [2 * p + 1, 2 * p + 1], bl - 2 * p
        if bi == 0:
            pad[0], core = 0, core + 2 * p + 1
        if bi == bg - 1:
            pad[1], core = 0, core + 2 * p + 1
        pads.append(tuple(pad))
        core_shape.append(core)

    # create weights: ones over the core, ramping linearly to zero across
    # the overlap on faces that adjoin a neighboring block
    weights = da.ones(core_shape, dtype=np.float32)
    weights = da.pad(weights, pads, mode='linear_ramp', end_values=0)
    weights = weights[1:-1, 1:-1, 1:-1]
    weights = weights.reshape(weights.shape + (1,))

    # multiply data by weights and return
    return da.multiply(block, weights)
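# A minimal usage sketch for the function above. The block_info dict is
# hand-built here in the shape da.map_blocks would supply it; the blocksize,
# overlap, and grid values are illustrative assumptions, not from the source.
import numpy as np
import dask.array as da

blocksize = (64, 64, 64)
overlap = int(round(64 / 8))
padded = tuple(s + 2 * overlap for s in blocksize)  # overlapped block shape
block = da.ones(padded + (1,), dtype=np.float32)
block_info = {0: {'chunk-location': (0, 1, 0), 'num-chunks': (2, 3, 2)}}
weighted = weight_block(block, blocksize, block_info=block_info)
print(weighted.shape)  # (80, 80, 80, 1)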
def bw_corrcoef(image1, image2, block_shape, keep_shape=False):
    """Blockwise Pearson correlation coefficient."""
    # blockwise zero-mean
    image1_zm = image1 - bw_mean(image1, block_shape, keep_shape=True)
    image2_zm = image2 - bw_mean(image2, block_shape, keep_shape=True)

    # follow the Pearson correlation coefficient formula
    numerator = bw_mean(da.multiply(image1_zm, image2_zm), block_shape)
    image1_std = bw_std(image1, block_shape)
    image2_std = bw_std(image2, block_shape)
    denominator = da.multiply(image1_std, image2_std)
    bwcc = da.divide(numerator, denominator)
    if keep_shape:
        bwcc = repeat_block(bwcc, block_shape)
    return bwcc
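# The bw_mean, bw_std, and repeat_block helpers used above are not shown in
# the source. A minimal sketch of what they might look like, built on
# da.coarsen (which reduces each non-overlapping block of size block_shape to
# a single value) and da.repeat; an assumption, not the original code.
import numpy as np
import dask.array as da

def repeat_block(blockwise, block_shape):
    # expand each blockwise value back to the full block it summarizes
    for axis, factor in enumerate(block_shape):
        blockwise = da.repeat(blockwise, factor, axis=axis)
    return blockwise

def bw_mean(image, block_shape, keep_shape=False):
    out = da.coarsen(np.mean, image, dict(enumerate(block_shape)))
    return repeat_block(out, block_shape) if keep_shape else out

def bw_std(image, block_shape, keep_shape=False):
    out = da.coarsen(np.std, image, dict(enumerate(block_shape)))
    return repeat_block(out, block_shape) if keep_shape else out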
def weight_block(block, blocksize):
    """Multiply a block by a linear-ramp weight profile over its overlap regions."""
    overlaps = np.array([int(round(x / 8)) for x in blocksize])
    weights = da.ones(blocksize - 2 * overlaps, dtype=np.float32)
    pads = [(2 * p, 2 * p) for p in overlaps]
    weights = da.pad(weights, pads, mode='linear_ramp', end_values=0)
    weights = weights.reshape(weights.shape + (1,))
    return da.multiply(block, weights)
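# Unlike the block_info-aware variant above, this version has no notion of
# where the block sits in the chunk grid: it ramps the weights to zero on
# every face, including faces on the image boundary, so it suits fully
# interior blocks or decompositions where boundary blocks need no special case.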
def apply_filter_vector_dask_true(_filter, arr, chunk=5):
    """Apply the complement (1 - _filter) of a 2-D frequency-domain filter to
    each slice of arr along its third axis."""
    out = arr.copy()
    tc, slc = pad_next_square_size(out)
    tc = da.from_array(tc, (-1, -1, chunk))
    _filter = da.from_array(_filter)
    temp = dff.ifft2(
        da.multiply(
            dff.ifftshift(1 - _filter[:, :, None]),
            dff.fft2(tc, axes=(0, 1)),
        ),
        axes=(0, 1),
    ).real
    return reverse_padding(arr, temp, slc)
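# Assumed imports for the function above; the source does not show them.
# pad_next_square_size and reverse_padding are project-local helpers that
# appear to pad the array to a square shape and crop the result back to the
# original shape, respectively.
import numpy as np
import dask.array as da
import dask.array.fft as dff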
def _remove_bad_pixels(dask_array, bad_pixel_array):
    """Replace values in bad pixels with mean of neighbors.

    Parameters
    ----------
    dask_array : Dask array
        Must be at least two dimensions
    bad_pixel_array : array-like
        Must either have the same shape as dask_array,
        or the same shape as the two last dimensions of dask_array.

    Returns
    -------
    data_output : Dask array

    Examples
    --------
    >>> import pyxem as pxm
    >>> import pyxem.utils.dask_tools as dt
    >>> s = pxm.dummy_data.dummy_data.get_dead_pixel_signal(lazy=True)
    >>> dead_pixels = dt._find_dead_pixels(s.data)
    >>> data_output = dt._remove_bad_pixels(s.data, dead_pixels)

    """
    if len(dask_array.shape) < 2:
        raise ValueError(
            "dask_array {0} must be at least 2 dimensions".format(dask_array.shape)
        )
    if bad_pixel_array.shape == dask_array.shape:
        pass
    elif bad_pixel_array.shape == dask_array.shape[-2:]:
        temp_array = da.zeros_like(dask_array)
        bad_pixel_array = da.add(temp_array, bad_pixel_array)
    else:
        raise ValueError(
            "bad_pixel_array {0} must either be 2-D and have the same shape "
            "as the two last dimensions in dask_array {1}, or have "
            "the same shape as dask_array {2}".format(
                bad_pixel_array.shape, dask_array.shape[-2:], dask_array.shape
            )
        )
    dif0 = da.roll(dask_array, shift=1, axis=-2)
    dif1 = da.roll(dask_array, shift=-1, axis=-2)
    dif2 = da.roll(dask_array, shift=1, axis=-1)
    dif3 = da.roll(dask_array, shift=-1, axis=-1)

    dif = (dif0 + dif1 + dif2 + dif3) / 4
    dif = dif * bad_pixel_array

    data_output = da.multiply(dask_array, da.logical_not(bad_pixel_array))
    data_output = data_output + dif

    return data_output
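# Note that da.roll wraps around, so for bad pixels on an image edge the
# neighbor average includes pixels from the opposite edge; for interior bad
# pixels this is simply the mean of the four directly adjacent pixels.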
def matern32(coords, lambda0):
    """Matern 3/2 covariance kernel.

    Parameters
    ----------
    coords: (n_pts, n_dims) dask.array or Future
        Point coordinates.
    lambda0: float
        Length scale of the kernel.

    Returns
    -------
    covs: (n_pts, n_pts) delayed dask.array
        Pairwise covariance matrix.

    """
    dists = dask_distance.euclidean(coords)
    res = da.multiply(
        1 + (np.sqrt(3) / lambda0) * dists,
        da.exp(-(np.sqrt(3) / lambda0) * dists))
    return res
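# The expression above is the standard Matern 3/2 form,
# k(d) = (1 + sqrt(3) * d / lambda0) * exp(-sqrt(3) * d / lambda0).
# A minimal usage sketch, assuming (as the code above does) that
# dask_distance.euclidean accepts a single coordinate array and returns the
# pairwise distance matrix:
import dask.array as da

coords = da.random.random((200, 3), chunks=(100, 3))
covs = matern32(coords, lambda0=0.5)
print(covs.shape)  # (200, 200)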
def _scale_x(self, x, sym: bool = False) -> da.core.Array:
    """Scale the product of a matrix multiplication instead of the matrix itself.

    Let A be a matrix of shape (n, p) with non-zero column standard
    deviations D of shape (p,). A column-scaled matrix B could be
    constructed as:

        B = A * Inv(Diag(D))

    However, forming B is inefficient if only the product of B with a
    matrix x is needed. Instead, `_scale_x` implements:

        (Ax) * Inv(Diag(D))

    where x is passed in already computed as Ax, using efficient
    broadcasting.

    Parameters
    ----------
    x : array_like
        Usually the product Ax that needs to be scaled.
    sym : bool
        Flag for scaling twice in the case of AA'x, where the square of
        the column standard deviations must be removed.

    Returns
    -------
    x_scaled : array_like
    """
    try:
        # self._array_moment.vector_width is not set until ScaledCenterArray is fit.
        if len(x.shape) == 2 and self._array_moment.vector_width == x.shape[1]:
            scale_matrix = (
                self._array_moment.sym_scale_matrix
                if sym
                else self._array_moment.scale_matrix
            )
            return da.multiply(scale_matrix, x)
    except ValueError:
        pass
    scale_vector = (
        self._array_moment.sym_scale_vector
        if sym
        else self._array_moment.scale_vector
    )
    x_scaled = diag_dot(scale_vector, x, return_diag=False)
    return x_scaled
def fit(self, X, y=None):
    X = self._check_array(X)
    n_components = self.n_components
    metric = self.affinity
    rng = check_random_state(self.random_state)
    n_clusters = self.n_clusters

    # kmeans for final clustering
    if isinstance(self.assign_labels, six.string_types):
        if self.assign_labels == "kmeans":
            km = KMeans(
                n_clusters=n_clusters,
                random_state=draw_seed(rng, np.iinfo("i4").max, dtype="uint"),
            )
        elif self.assign_labels == "sklearn-kmeans":
            km = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=rng)
        else:
            msg = "Unknown 'assign_labels' {!r}".format(self.assign_labels)
            raise ValueError(msg)
    elif isinstance(self.assign_labels, BaseEstimator):
        km = self.assign_labels
    else:
        raise TypeError(
            "Invalid type {} for 'assign_labels'".format(type(self.assign_labels))
        )

    if self.kmeans_params:
        km.set_params(**self.kmeans_params)

    n = len(X)
    if n <= n_components:
        msg = (
            "'n_components' must be smaller than the number of samples."
            " Got {} components and {} samples".format(n_components, n)
        )
        raise ValueError(msg)

    params = self.kernel_params or {}
    params["gamma"] = self.gamma
    params["degree"] = self.degree
    params["coef0"] = self.coef0

    # indices for our exact / approximate blocks
    inds = np.arange(n)
    keep = rng.choice(inds, n_components, replace=False)
    keep.sort()
    rest = ~np.isin(inds, keep)

    # compute the exact blocks
    # these are done in parallel for dask arrays
    if isinstance(X, da.Array):
        X_keep = X[keep].rechunk(X.shape).persist()
    else:
        X_keep = X[keep]

    X_rest = X[rest]

    A, B = embed(X_keep, X_rest, n_components, metric, params)
    _log_array(logger, A, "A")
    _log_array(logger, B, "B")

    # now the approximation of C
    a = A.sum(0)  # (l,)
    b1 = B.sum(1)  # (l,)
    b2 = B.sum(0)  # (m,)

    # TODO: I think we have some unnecessary delayed wrapping of A here.
    A_inv = da.from_delayed(delayed(pinv)(A), A.shape, A.dtype)
    inner = A_inv.dot(b1)
    d1_si = 1 / da.sqrt(a + b1)
    d2_si = 1 / da.sqrt(b2 + B.T.dot(inner))  # (m,), dask array

    # d1, d2 are diagonal, so we can avoid large matrix multiplies
    # Equivalent to diag(d1_si) @ A @ diag(d1_si)
    A2 = d1_si.reshape(-1, 1) * A * d1_si.reshape(1, -1)  # (n, n)
    _log_array(logger, A2, "A2")
    # A2 = A2.rechunk(A2.shape)
    # Equivalent to diag(d1_si) @ B @ diag(d2_si)
    B2 = da.multiply(da.multiply(d1_si.reshape(-1, 1), B), d2_si.reshape(1, -1))
    _log_array(logger, B2, "B2")

    U_A, S_A, V_A = delayed(svd, pure=True, nout=3)(A2)

    U_A = da.from_delayed(U_A, (n_components, n_components), A2.dtype)
    S_A = da.from_delayed(S_A, (n_components,), A2.dtype)
    V_A = da.from_delayed(V_A, (n_components, n_components), A2.dtype)

    # Eq 16. This is OK when V2 is orthogonal
    V2 = da.sqrt(float(n_components) / n) * da.vstack([A2, B2.T]).dot(
        U_A[:, :n_clusters]
    ).dot(da.diag(1.0 / da.sqrt(S_A[:n_clusters])))  # (n, k)
    _log_array(logger, V2, "V2.1")

    if isinstance(B2, da.Array):
        V2 = V2.rechunk((B2.chunks[1][0], n_clusters))
        _log_array(logger, V2, "V2.2")

    # normalize (Eq. 4)
    U2 = (V2.T / da.sqrt((V2 ** 2).sum(1))).T  # (n, k)
    _log_array(logger, U2, "U2.2")

    # Recover original indices
    U2 = _slice_mostly_sorted(U2, keep, rest, inds)  # (n, k)
    _log_array(logger, U2, "U2.3")

    if self.persist_embedding and isinstance(U2, da.Array):
        logger.info("Persisting array for k-means")
        U2 = U2.persist()
    elif isinstance(U2, da.Array):
        logger.info(
            "Consider persist_embedding. This will require %s",
            _format_bytes(U2.nbytes),
        )

    logger.info("k-means for assign_labels[starting]")
    km.fit(U2)
    logger.info("k-means for assign_labels[finished]")

    # Now... what to keep?
    self.assign_labels_ = km
    self.labels_ = km.labels_
    self.eigenvalues_ = S_A[:n_clusters]  # TODO: better name
    return self
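# This fit method matches the Nystrom-approximated spectral embedding used by
# dask-ml's SpectralClustering. A minimal usage sketch, assuming that import
# path; the parameter names come from the attributes referenced above:
import dask.array as da
from dask_ml.cluster import SpectralClustering

X = da.random.random((1000, 4), chunks=(250, 4))
sc = SpectralClustering(n_clusters=3, n_components=100)
sc.fit(X)
print(sc.labels_)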
def _center_of_mass_array(dask_array, threshold_value=None, mask_array=None):
    """Find center of mass of last two dimensions for a dask array.

    The center of mass can be calculated using a mask and threshold.

    Parameters
    ----------
    dask_array : Dask array
        Must have either 2, 3 or 4 dimensions.
    threshold_value : scalar, optional
    mask_array : NumPy array, optional
        Array with bool values. The True values will be masked
        (i.e. ignored). Must have the same shape as the two
        last dimensions in dask_array.

    Returns
    -------
    center_of_mass_dask_array : Dask array

    Examples
    --------
    >>> import numpy as np
    >>> import dask.array as da
    >>> import pyxem.utils.dask_tools as dt
    >>> data = da.random.random(
    ...     size=(64, 64, 128, 128), chunks=(16, 16, 128, 128))
    >>> output_dask = dt._center_of_mass_array(data)
    >>> output = output_dask.compute()

    Masking everything except the center of the image

    >>> mask_array = np.ones(shape=(128, 128), dtype=bool)
    >>> mask_array[64-10:64+10, 64-10:64+10] = False
    >>> output_dask = dt._center_of_mass_array(data, mask_array=mask_array)
    >>> output = output_dask.compute()

    Masking and thresholding

    >>> output_dask = dt._center_of_mass_array(
    ...     data, mask_array=mask_array, threshold_value=3)
    >>> output = output_dask.compute()

    """
    det_shape = dask_array.shape[-2:]
    y_grad, x_grad = np.mgrid[0:det_shape[0], 0:det_shape[1]]
    y_grad, x_grad = y_grad.astype(np.float64), x_grad.astype(np.float64)
    sum_array = np.ones_like(x_grad)

    if mask_array is not None:
        if not mask_array.shape == det_shape:
            raise ValueError(
                "mask_array ({0}) must have same shape as the last two "
                "dimensions of the dask_array ({1})".format(
                    mask_array.shape, det_shape
                )
            )
        x_grad = x_grad * np.invert(mask_array)
        y_grad = y_grad * np.invert(mask_array)
        sum_array = sum_array * np.invert(mask_array)
    if threshold_value is not None:
        dask_array = _threshold_array(
            dask_array, threshold_value=threshold_value, mask_array=mask_array
        )

    x_shift = da.multiply(dask_array, x_grad, dtype=np.float64)
    y_shift = da.multiply(dask_array, y_grad, dtype=np.float64)
    sum_array = da.multiply(dask_array, sum_array, dtype=np.float64)

    x_shift = np.sum(x_shift, axis=(-2, -1), dtype=np.float64)
    y_shift = np.sum(y_shift, axis=(-2, -1), dtype=np.float64)
    sum_array = np.sum(sum_array, axis=(-2, -1), dtype=np.float64)

    beam_shifts = da.stack((x_shift, y_shift))
    beam_shifts = da.divide(beam_shifts[:], sum_array, dtype=np.float64)
    return beam_shifts
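# The shifts computed above follow the standard intensity-weighted center of
# mass: com_x = sum(I * x) / sum(I) and com_y = sum(I * y) / sum(I), taken
# over the last two (detector) dimensions of each frame.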
def test(
    size_per_proc=1000,
    num_procs=1,
    num_runs=1,
    ty="int64",
    key_length=10,
    scale_lhs_only=False,
    package="legate",
):
    if package == "legate":
        from legate import numpy as np, pandas as pd
        from legate.numpy.random import randn
    elif package == "cudf":
        import cudf as pd
        import cupy as np
        from cupy.random import randn
    elif package == "pandas":
        import numpy as np
        import pandas as pd
        from numpy.random import randn
    elif package == "dask" or package == "daskcudf":
        import dask.array as da
        import dask.dataframe as df
        import numpy as np

        if package == "daskcudf":
            import cudf
    else:
        print("Unknown dataframe package: %s" % package)
        assert False

    if package == "legate":
        from legate.timing import time

        def block(*args):
            pass

        def get_timestamp():
            return time()

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) / 1000.0

    elif package == "dask" or package == "daskcudf":
        import time

        def block(*args):
            for arg in args:
                arg.compute()

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    else:
        import time

        def block(*args):
            pass

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    if scale_lhs_only:
        size = size_per_proc * num_procs
        size_rhs = size // 3
        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn, so use arange instead
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(
                size_rhs,
                dtype=np.float64,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
        else:
            c1 = randn(size)
            c2 = randn(size_rhs)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = (
                da.arange(size, dtype=key_dtype, chunks=size_per_proc)
                % size_per_proc
            )
            key_right = da.arange(
                size_rhs,
                dtype=key_dtype,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
            da.multiply(key_right, 3, out=key_right)
        else:
            key_left = np.arange(size, dtype=key_dtype) % size_per_proc
            key_right = np.arange(size_rhs, dtype=key_dtype)
            np.multiply(key_right, 3, out=key_right)
    else:
        size = size_per_proc * num_procs
        size_rhs = size
        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn, so use arange instead
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
        else:
            c1 = randn(size)
            c2 = randn(size)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
            key_right = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
        else:
            key_left = np.arange(size, dtype=key_dtype)
            key_right = np.arange(size, dtype=key_dtype)
        # np.floor_divide(key_right, 3, out=key_right)
        # np.multiply(key_right, 3, out=key_right)

    if package == "dask" or package == "daskcudf":
        df1 = df.multi.concat(
            [df.from_dask_array(a) for a in [c1, key_left]], axis=1
        )
        df1.columns = ["c1", "key"]
        df2 = df.multi.concat(
            [df.from_dask_array(a) for a in [c2, key_right]], axis=1
        )
        df2.columns = ["c2", "key"]
        if package == "daskcudf":
            df1 = df1.map_partitions(cudf.from_pandas)
            df2 = df2.map_partitions(cudf.from_pandas)
    else:
        df1 = pd.DataFrame({"c1": c1, "key": key_left})
        df2 = pd.DataFrame({"c2": c2, "key": key_right})
        df2["key"] = df2["key"] // 3 * 3

    if ty == "string":
        df1["key"] = (
            df1["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )
        df2["key"] = (
            df2["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )

    print(
        "Type: inner, Size: %u x %u, Key dtype: %s"
        % (size, size_rhs, str(key_dtype))
    )

    block(df1, df2)

    for i in range(num_runs):
        start_ts = get_timestamp()
        df_result = df1.merge(df2, on="key")
        block(df_result)
        stop_ts = get_timestamp()
        print(
            "[Run %d] Elapsed time: %lf ms"
            % (i + 1, compute_elapsed_time(start_ts, stop_ts))
        )
        del df_result
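# A minimal invocation sketch for the benchmark above, assuming it is run as
# a plain script; the sizes here are illustrative:
if __name__ == "__main__":
    test(size_per_proc=10000, num_procs=2, num_runs=3, package="dask")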