def prepare_dataset(X):
    len_ = X.shape[0]
    shape_ = X.shape
    d = int(da.sqrt(X.flatten().reshape(X.shape[0], -1).shape[1]))
    if len(shape_) == 4:
        d = int(da.sqrt(X.flatten().reshape(X.shape[0], -1).shape[1] / 3))
        X = da.reshape(X, [-1, d, d, 3])
    elif d == shape_[1] and len(shape_) == 3:
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(lambda x: grey2rgb(x), X)), dtype=da.float32)
    else:
        r = d ** 2 - X.shape[1]
        train_padding = da.zeros((shape_[0], r))
        X = da.vstack([X, train_padding])
        X = da.reshape(X, [-1, d, d])
        X = da.array(list(map(lambda x: grey2rgb(x), X)), dtype=da.float32)

    print('Scaling dataset')
    # Note: the original referenced an undefined name `scalar` here; `scaler`
    # (used in both branches below) is clearly what was meant.
    if scaler is not None:
        X = scaler.transform(
            X.flatten().reshape(-1, 1).astype(da.float32)).reshape(X.shape)
    else:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(
            X.flatten().reshape(-1, 1).astype(da.float32)).reshape(X.shape)
    return X
def xyz2lonlat(x__, y__, z__):
    """Get longitudes and latitudes from cartesian coordinates."""
    R = 6370997.0
    lons = da.rad2deg(da.arccos(x__ / da.sqrt(x__ ** 2 + y__ ** 2))) * da.sign(y__)
    lats = da.sign(z__) * (90 - da.rad2deg(da.arcsin(da.sqrt(x__ ** 2 + y__ ** 2) / R)))
    return lons, lats
def pearson_influence(xarr: da.Array, yarr: da.Array) -> da.Array:
    """Calculate the influence of deleting a point on the Pearson correlation."""
    if xarr.shape != yarr.shape:
        raise ValueError(
            f"The shape of xarr and yarr should be same, got {xarr.shape}, {yarr.shape}"
        )

    # Fast calculation of the influence of removing one element on the correlation
    n = xarr.shape[0]

    x2, y2 = da.square(xarr), da.square(yarr)
    xy = xarr * yarr

    # The influence is vectorized on xarr and yarr, so we need to repeat all the sums n times
    xsum = da.ones(n) * da.sum(xarr)
    ysum = da.ones(n) * da.sum(yarr)
    xysum = da.ones(n) * da.sum(xy)
    x2sum = da.ones(n) * da.sum(x2)
    y2sum = da.ones(n) * da.sum(y2)

    # Note: we multiply (n-1)^2 into both denominator and numerator to avoid divisions.
    numerator = (n - 1) * (xysum - xy) - (xsum - xarr) * (ysum - yarr)

    varx = (n - 1) * (x2sum - x2) - da.square(xsum - xarr)
    vary = (n - 1) * (y2sum - y2) - da.square(ysum - yarr)
    denominator = da.sqrt(varx * vary)

    return da.map_blocks(itruediv, numerator, denominator, dtype=numerator.dtype)
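# Illustrative usage sketch (added here, not part of the original snippet). It
# assumes the `itruediv` referenced above is `operator.itruediv` and that dask
# is installed; the call returns one influence value per observation.
import operator

import dask.array as da

itruediv = operator.itruediv  # assumed binding for the map_blocks call above
x = da.random.random(size=1000, chunks=250)
y = x + 0.1 * da.random.random(size=1000, chunks=250)
influence = pearson_influence(x, y)
print(influence[:5].compute())  # influence of dropping each of the first 5 points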
def corr(*args, axis=None, **kwargs):
    """Pearson's correlation coefficient."""
    c = cov(*args, axis=axis, **kwargs)
    std = da.sqrt(np.diagonal(c, axis1=-1, axis2=-2))[..., None]
    return c / std / std.swapaxes(-1, -2)
def test_PowerMethod_std_method():
    N = 1000
    P = 100
    k = 10
    array = da.random.randint(0, 3, size=(N, P))
    for method in ['norm', 'binom']:
        new_array = make_snp_array(array, std_method=method)
        PM = PowerMethod(k=k, scoring_method='q-vals', tol=1e-13, factor=None)
        U_PM, S_PM, V_PM = PM.svd(array=new_array)

        mean = array.mean(axis=0)
        if method == 'norm':
            std = array.std(axis=0)
        else:
            p = mean / 2
            std = da.sqrt(2 * p * (1 - p))

        x = (array - mean).dot(np.diag(1 / std))
        U, S, V = da.linalg.svd(x)
        U_k, S_k, V_k = svd_to_trunc_svd(U, S, V, k=k)

        np.testing.assert_almost_equal(subspace_dist(U_PM, U_k, S_k), 0, decimal=3)
        np.testing.assert_almost_equal(subspace_dist(V_PM, V_k, S_k), 0, decimal=3)
        np.testing.assert_array_almost_equal(S_k, S_PM, decimal=2)
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
                        X_norm_squared=None):
    if X_norm_squared is not None:
        XX = X_norm_squared
        if XX.shape == (1, X.shape[0]):
            XX = XX.T
        elif XX.shape != (X.shape[0], 1):
            raise ValueError("Incompatible dimensions for X and X_norm_squared")
    else:
        XX = row_norms(X, squared=True)[:, np.newaxis]

    if X is Y:
        YY = XX.T
    elif Y_norm_squared is not None:
        if Y_norm_squared.ndim < 2:
            YY = Y_norm_squared[:, np.newaxis]
        else:
            YY = Y_norm_squared
        if YY.shape != (1, Y.shape[0]):
            raise ValueError("Incompatible dimensions for Y and Y_norm_squared")
    else:
        YY = row_norms(Y, squared=True)[np.newaxis, :]

    # TODO: this often emits a warning. Silence it here?
    distances = -2 * X.dot(Y.T) + XX + YY
    distances = da.maximum(distances, 0)
    # TODO: scikit-learn sets the diagonal to 0 when X is Y.
    return distances if squared else da.sqrt(distances)
def euclidean(XA, XB):
    """Returns the distance between points using Euclidean distance (2-norm)
    as the distance metric between the points.

    Find the Euclidean distances between four 2-D coordinates:
    >>> coords = [(35.0456, -85.2672),
    ...           (35.1174, -89.9711),
    ...           (35.9728, -83.9422),
    ...           (36.1667, -86.7833)]
    >>> euclidean(coords, coords)
    array([[ 0.    ,  4.7044,  1.6172,  1.8856],
           [ 4.7044,  0.    ,  6.0893,  3.3561],
           [ 1.6172,  6.0893,  0.    ,  2.8477],
           [ 1.8856,  3.3561,  2.8477,  0.    ]])
    """
    mA = XA.shape[0]
    mB = XB.shape[0]
    distances = []
    for i in range(0, mA):
        dm = np.zeros(shape=(1, mB), dtype=np.double)
        for j in range(0, mB):
            XA_XB = XA[i, :] - XB[j, :]
            dm[0, j] = da.sqrt(da.dot(XA_XB, XA_XB))
        distances.append(
            da.from_array(dm, chunks=(mA + mB) / multiprocessing.cpu_count()))
    return da.concatenate(distances, axis=0)
def uirdftn(inarray, ndim=None, *args, **kwargs):
    """N-dim real unitary discrete Fourier transform.

    This transform considers the Hermitian property of the transform
    from complex to real input.

    Parameters
    ----------
    inarray : ndarray
        The array to transform.
    ndim : int, optional
        The `ndim` last axes along which to compute the transform. All
        axes by default.

    Returns
    -------
    outarray : array-like (the last axis has length (N - 1) * 2)
    """
    if not ndim:
        ndim = inarray.ndim

    return dask_irfftn(inarray, axes=range(-ndim, 0), *args, **kwargs) * da.sqrt(
        da.prod(da.asarray(inarray.shape[-ndim:-1])) * (inarray.shape[-1] - 1) * 2)
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "StandardScaler":
    self._reset()
    attributes = OrderedDict()
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values

    if self.with_mean:
        mean_ = nanmean(X, 0)
        attributes["mean_"] = mean_
    if self.with_std:
        var_ = nanvar(X, 0)
        scale_ = var_.copy()
        scale_[scale_ == 0] = 1
        scale_ = da.sqrt(scale_)
        attributes["scale_"] = scale_
        attributes["var_"] = var_

    attributes["n_samples_seen_"] = np.nan
    values = compute(*attributes.values())
    for k, v in zip(attributes, values):
        setattr(self, k, v)
    self.n_features_in_ = X.shape[1]
    return self
def da_linregress(x, y):
    """
    Refactor of the scipy linregress, with fewer checks for speed's sake,
    done with dask arrays.

    :param x: array for the independent variable
    :param y: array for the dependent variable
    :return: slope, r squared and two-sided p-value of the regression
    """
    TINY = 1.0e-20
    # x = np.asarray(x)
    # y = np.asarray(y)
    arr = da.stack([x, y], axis=1)
    n = len(x)
    # average sum of squares:
    ssxm, ssxym, ssyxm, ssym = (da.dot(arr.T, arr) / n).ravel()
    r_num = ssxym
    r_den = np.sqrt(ssxm * ssym)
    if r_den == 0.0:
        r = 0.0
    else:
        r = r_num / r_den
        # test for numerical error propagation
        if r > 1.0:
            r = 1.0
        elif r < -1.0:
            r = -1.0

    df = n - 2
    slope = r_num / ssxm
    r_t = r + TINY
    t = r * da.sqrt(df / ((1.0 - r_t) * (1.0 + r_t)))
    prob = 2 * stats.distributions.t.sf(np.abs(t), df)
    return slope, r**2, prob
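# Illustrative call (added; not part of the original snippet). It assumes
# `stats` above is `scipy.stats` and that x and y are equal-length 1-D dask
# arrays; the synthetic data here is purely for demonstration.
import numpy as np
import dask.array as da
from scipy import stats

rng = np.random.RandomState(0)
x = da.from_array(rng.rand(500), chunks=100)
y = 2.0 * x + da.from_array(0.05 * rng.randn(500), chunks=100)
slope, r2, p = da_linregress(x, y)
print(float(slope), float(r2), float(p))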
def mean_squared_error(
    y_true: ArrayLike,
    y_pred: ArrayLike,
    sample_weight: Optional[ArrayLike] = None,
    multioutput: Optional[str] = "uniform_average",
    squared: bool = True,
    compute: bool = True,
) -> ArrayLike:
    _check_sample_weight(sample_weight)
    output_errors = ((y_pred - y_true) ** 2).mean(axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None
        else:
            raise ValueError("Weighted 'multioutput' not supported.")
    result = output_errors.mean()
    if not squared:
        result = da.sqrt(result)
    if compute:
        result = result.compute()
    return result
def calc_lac(fcast, obs):
    """
    Method to calculate the Local Anomaly Correlation (LAC).  Uses numexpr
    for speed over larger datasets.

    Note: If necessary (memory concerns) in the future, the numexpr statements
    can be extended to use pytable arrays.  Would need to provide means to
    function, as summing over the dataset is still very slow it seems.

    Parameters
    ----------
    fcast: ndarray
        Time series of forecast data. M x N where M is the temporal dimension.
    obs: ndarray
        Time series of observations. M x N

    Returns
    -------
    lac: ndarray
        Local anomaly correlations for all locations over the time range.
    """
    # Calculate means of data
    f_mean = fcast.mean(axis=0)
    o_mean = obs.mean(axis=0)
    f_anom = fcast - f_mean
    o_anom = obs - o_mean

    # Calculate covariance between time series at each gridpoint
    cov = (f_anom * o_anom).sum(axis=0)

    # Calculate standardization terms
    f_std = (f_anom ** 2).sum(axis=0)
    o_std = (o_anom ** 2).sum(axis=0)

    if is_dask_array(f_std):
        f_std = da.sqrt(f_std)
    else:
        f_std = np.sqrt(f_std)

    if is_dask_array(o_std):
        o_std = da.sqrt(o_std)
    else:
        o_std = np.sqrt(o_std)

    std = f_std * o_std
    lac = cov / std

    return lac
def periodic_distance(a, b, periodic):
    '''Periodic distance between two arrays. Periodic is a 3 dimensional
    array containing the 3 box sizes.
    '''
    delta = abs(a - b)
    delta = da.where(delta > 0.5 * periodic, periodic - delta, delta)
    return da.sqrt((delta ** 2).sum(axis=-1))
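# Quick check (added; not part of the original snippet): with the minimum-image
# convention, two points near opposite faces of a 10 x 10 x 10 box should be
# close, not ~9 apart.
import numpy as np
import dask.array as da

box = da.from_array(np.array([10.0, 10.0, 10.0]))
p1 = da.from_array(np.array([[0.5, 5.0, 5.0]]))
p2 = da.from_array(np.array([[9.8, 5.0, 5.0]]))
print(periodic_distance(p1, p2, box).compute())  # ~0.7, wrapped across the boundary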
def get_distance(tan1, tan2, cos3):
    """Gets distance component of Li kernels."""
    temp = tan1 * tan1 + tan2 * tan2 - 2.0 * tan1 * tan2 * cos3
    return da.sqrt(da.maximum(temp, 0))
def get_array_moments(
    array: da.core.Array,
    mean: bool = True,
    std: bool = True,
    std_method: str = 'binom',
    axis: int = 0,
) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """Computes specified array moments.

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from
    mean : bool
        Flag whether to compute mean of "array" along "axis"
    std : bool
        Flag whether to compute std of "array" along "axis"
    std_method : str
        Method used to compute standard deviation.

        Possible methods are:
            'norm'  ===> Normal distribution standard deviation. See np.std
            'binom' ===> Binomial standard deviation, sqrt(2*p*(1-p)),
                         where p = "mean"/2
    axis : int
        Axis to compute mean and std along.

    Returns
    -------
    array_mean : da.core.array, optional
        If "mean" is False, returns None.
        Otherwise returns the array mean.
    array_std : da.core.array, optional
        If "std" is False, returns None.
        Otherwise returns the array std.
    """
    array_mean = None
    array_std = None

    if mean:
        array_mean = da.nanmean(array, axis=axis)

    if std:
        if std_method == 'binom':
            u = array_mean if mean else da.nanmean(array, axis=axis)
            u /= 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented')

    array_mean, array_std = persist(array_mean, array_std)
    return array_mean, array_std
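# Small demonstration (added; not part of the original snippet). It assumes
# `persist` above is `dask.persist`. For a 0/1/2 genotype-style matrix the
# 'binom' method returns sqrt(2*p*(1-p)) with p = column mean / 2.
import dask.array as da

genotypes = da.random.randint(0, 3, size=(100, 5), chunks=(50, 5))
col_mean, col_std = get_array_moments(genotypes, std_method='binom')
print(col_mean.compute(), col_std.compute())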
def _transformlng_dask(lng, lat):
    ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
        0.1 * lng * lat + 0.1 * da.sqrt(da.fabs(lng))
    ret += (20.0 * da.sin(6.0 * lng * pi) + 20.0 *
            da.sin(2.0 * lng * pi)) * 2.0 / 3.0
    ret += (20.0 * da.sin(lng * pi) + 40.0 *
            da.sin(lng / 3.0 * pi)) * 2.0 / 3.0
    ret += (150.0 * da.sin(lng / 12.0 * pi) + 300.0 *
            da.sin(lng / 30.0 * pi)) * 2.0 / 3.0
    return ret
def _transformlat_dask(lng, lat):
    ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
        0.1 * lng * lat + 0.2 * da.sqrt(da.fabs(lng))
    ret += (20.0 * da.sin(6.0 * lng * pi) + 20.0 *
            da.sin(2.0 * lng * pi)) * 2.0 / 3.0
    ret += (20.0 * da.sin(lat * pi) + 40.0 *
            da.sin(lat / 3.0 * pi)) * 2.0 / 3.0
    ret += (160.0 * da.sin(lat / 12.0 * pi) + 320 *
            da.sin(lat * pi / 30.0)) * 2.0 / 3.0
    return ret
def bw_std(image, block_shape, ddof=0, keep_shape=False):
    """Blockwise standard deviation."""
    # zero-mean
    bwm = bw_mean(image, block_shape, keep_shape=True)
    image_zm = image - bwm

    # follow standard deviation formula
    bws = bw_sum(image_zm**2, block_shape, keep_shape=keep_shape)
    return da.sqrt(bws / (np.prod(block_shape) - ddof))
def _unequal_var_ttest_denom(v1, n1, v2, n2):
    vn1 = v1 / n1
    vn2 = v2 / n2
    with np.errstate(divide="ignore", invalid="ignore"):
        df = (vn1 + vn2) ** 2 / (vn1 ** 2 / (n1 - 1) + vn2 ** 2 / (n2 - 1))

    # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
    # Hence it doesn't matter what df is as long as it's not NaN.
    df = da.where(da.isnan(df), 1, df)  # XXX: np -> da
    denom = da.sqrt(vn1 + vn2)
    return df, denom
def _solve_quadratic_dask(a__, b__, c__, min_val=0.0, max_val=1.0):
    """Solve quadratic equation and return the valid roots from interval
    [*min_val*, *max_val*]
    """
    discriminant = b__ * b__ - 4 * a__ * c__

    # Solve the quadratic polynomial
    x_1 = (-b__ + da.sqrt(discriminant)) / (2 * a__)
    x_2 = (-b__ - da.sqrt(discriminant)) / (2 * a__)

    # Find valid solutions, ie. 0 <= t <= 1
    idxs = (x_1 < min_val) | (x_1 > max_val)
    x__ = da.where(idxs, x_2, x_1)

    idxs = (x__ < min_val) | (x__ > max_val)
    x__ = da.where(idxs, np.nan, x__)

    return x__
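# Worked example (added; not part of the original snippet): t**2 - 1.5*t + 0.5
# has roots 1.0 and 0.5; both lie in [0, 1], and the function prefers the "+"
# root, so 1.0 is returned.
import numpy as np
import dask.array as da

a = da.from_array(np.array([1.0]))
b = da.from_array(np.array([-1.5]))
c = da.from_array(np.array([0.5]))
print(_solve_quadratic_dask(a, b, c).compute())  # [1.]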
def _unequal_var_ttest_denom(v1, n1, v2, n2):
    vn1 = v1 / n1
    vn2 = v2 / n2
    with np.errstate(divide='ignore', invalid='ignore'):
        df = (vn1 + vn2) ** 2 / (vn1 ** 2 / (n1 - 1) + vn2 ** 2 / (n2 - 1))

    # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
    # Hence it doesn't matter what df is as long as it's not NaN.
    df = da.where(da.isnan(df), 1, df)  # XXX: np -> da
    denom = da.sqrt(vn1 + vn2)
    return df, denom
def filter(array: xr.DataArray, settings: dict,
           manager: LarvikManager = LarvikManager()) -> xr.DataArray:
    it = array

    prewx = dask_image.ndfilters.prewitt(it.data, axis=0)
    prewy = dask_image.ndfilters.prewitt(it.data, axis=1)
    prewittfiltered = da.sqrt(prewx * prewx + prewy * prewy)

    c = manager.meta.prepend(it, string="Prewitt of")
    channels = xr.DataArray(da.array(c), dims="c")

    x = xr.DataArray(prewittfiltered, dims=it.dims,
                     coords={**it.coords, "channels": channels})
    return x
def inside_circle(total_count, chunk_size=-1):
    x = da.random.uniform(size=(total_count), chunks=(chunk_size))
    y = da.random.uniform(size=(total_count), chunks=(chunk_size))
    radii = da.sqrt(x * x + y * y)

    filtered = da.where(radii <= 1.0)
    indices = np.array(filtered[0])
    count = len(radii[indices])

    return count
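# Monte Carlo pi estimate (added; not part of the original snippet): the
# fraction of uniform random points that land inside the unit quarter-circle
# approaches pi/4, so 4 * hits / total approximates pi.
import numpy as np
import dask.array as da

total = 1_000_000
hits = inside_circle(total, chunk_size=100_000)
print(4 * hits / total)  # ~3.14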
def _calculate_f(coeffs, points, x, y, sumcol, type='dask'):
    """Calculate the thin-plate energy function."""
    w = coeffs[:-3]
    a1, ax, ay = coeffs[-3:]

    for wi, Pi in zip(w, points):
        sumcol += wi * _U_dask(da.sqrt((x - Pi[0]) ** 2 + (y - Pi[1]) ** 2))

    return a1 + ax * x + ay * y + sumcol
def initialize_da(X, k, init='random', W=None, H=None):
    n_components = k
    n_samples, n_features = X.shape

    if init == 'random':
        avg = da.sqrt(X.mean() / n_components)
        H = avg * da.random.RandomState(42).normal(
            0, 1, size=(n_components, n_features),
            chunks=(n_components, X.chunks[1][0]))
        W = avg * da.random.RandomState(42).normal(
            0, 1, size=(n_samples, n_components),
            chunks=(n_samples, n_components))
        H = da.fabs(H)
        W = da.fabs(W)
        return W, H

    if init == 'nndsvd' or init == 'nndsvda':
        # not converted to da yet
        raise NotImplementedError

    if init == 'custom':
        return W, H

    if init == 'random_vcol':
        import math
        # p_c = options.get('p_c', int(ceil(1. / 5 * X.shape[1])))
        # p_r = options.get('p_r', int(ceil(1. / 5 * X.shape[0])))
        p_c = int(math.ceil(1. / 5 * X.shape[1]))
        p_r = int(math.ceil(1. / 5 * X.shape[0]))
        prng = np.random.RandomState(42)
        # W = da.zeros((X.shape[0], n_components), chunks=(X.shape[0], n_components))
        # H = da.zeros((n_components, X.shape[1]), chunks=(n_components, X.chunks[1][0]))
        W = []
        H = []
        for i in range(k):
            W.append(X[:, prng.randint(low=0, high=X.shape[1], size=p_c)].mean(
                axis=1).compute())
            H.append(X[prng.randint(low=0, high=X.shape[0], size=p_r), :].mean(
                axis=0).compute())
        W = np.stack(W, axis=1)
        H = np.stack(H, axis=0)
        return W, H
def fit(self) -> None:
    """
    Returns
    -------
    None
    """
    self._center_vector = self._array.mean(axis=self._axis)
    if self._std_dist == 'normal':
        self._scale_vector = self._std_inverter(
            self._array.std(axis=self._axis))
    else:
        p = self._center_vector / 2
        self._scale_vector = self._std_inverter(da.sqrt(2 * p * (1 - p)))

    self._sym_scale_vector = self._scale_vector ** 2
def haversine_distance(pickup_latitude, pickup_longitude,
                       dropoff_latitude, dropoff_longitude):
    x_1 = pi / 180 * pickup_latitude
    y_1 = pi / 180 * pickup_longitude
    x_2 = pi / 180 * dropoff_latitude
    y_2 = pi / 180 * dropoff_longitude

    dlon = y_2 - y_1
    dlat = x_2 - x_1
    a = sin(dlat / 2) ** 2 + cos(x_1) * cos(x_2) * sin(dlon / 2) ** 2

    c = 2 * arcsin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r
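# Example call (added; not part of the original snippet). It assumes the bare
# `pi`, `sin`, `cos`, `arcsin` and `sqrt` used above come from dask.array and
# numpy as imported below, so the result is a lazy dask array of great-circle
# distances in kilometers.
import numpy as np
import dask.array as da
from dask.array import sin, cos, arcsin, sqrt
from numpy import pi

pickup_lat = da.from_array(np.array([40.7484]))
pickup_lon = da.from_array(np.array([-73.9857]))
dropoff_lat = da.from_array(np.array([40.6892]))
dropoff_lon = da.from_array(np.array([-74.0445]))
print(haversine_distance(pickup_lat, pickup_lon,
                         dropoff_lat, dropoff_lon).compute())  # roughly 8 km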
def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_1sampResult, nout=2)(t, prob)
def ttest_1samp(a, popmean, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_1sampResult, nout=2)(t, prob)
def do_compute(seed, size=int(4e4), radius=300):
    with dask.set_options(get=dask.threaded.get):
        # da.random.seed(seed)
        # arr = (da.random.normal(0.01, 1, (size, 3), chunks=size//24) - 0.5) * radius
        np.random.seed(seed)
        c = (np.random.normal(0.01, 1, (size, 3)) - 0.5) * radius
        arr = da.from_array(c, chunks=c.shape[0] // NCPUS)

        diff = arr[:, np.newaxis, :] - arr[np.newaxis, :, :]
        mat = da.sqrt((diff * diff).sum(-1))
        inv6 = (1. / mat) ** 6
        pot = 4. * (inv6 * inv6 - inv6)
        e = da.nansum(pot) / 2.
        return e.compute(num_workers=NCPUS)
def test_gradient(shape, varargs, axis, edge_order):
    a = np.random.randint(0, 10, shape)
    d_a = da.from_array(a, chunks=(len(shape) * (5,)))

    r_a = np.gradient(a, *varargs, axis=axis, edge_order=edge_order)
    r_d_a = da.gradient(d_a, *varargs, axis=axis, edge_order=edge_order)

    if isinstance(axis, Number):
        assert_eq(r_d_a, r_a)
    else:
        assert len(r_d_a) == len(r_a)

        for e_r_d_a, e_r_a in zip(r_d_a, r_a):
            assert_eq(e_r_d_a, e_r_a)

        assert_eq(da.sqrt(sum(map(da.square, r_d_a))),
                  np.sqrt(sum(map(np.square, r_a))))
def ttest_rel(a, b, axis=0, nan_policy="propagate"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented.")

    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_relResult, nout=2)(t, prob)
def ttest_rel(a, b, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")

    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_relResult, nout=2)(t, prob)
def test_gradient(shape, varargs, axis, edge_order):
    a = np.random.randint(0, 10, shape)
    d_a = da.from_array(a, chunks=(len(shape) * (5,)))

    r_a = np.gradient(a, *varargs, axis=axis, edge_order=edge_order)
    r_d_a = da.gradient(d_a, *varargs, axis=axis, edge_order=edge_order)

    if isinstance(axis, Number):
        assert_eq(r_d_a, r_a)
    else:
        assert len(r_d_a) == len(r_a)

        for e_r_d_a, e_r_a in zip(r_d_a, r_a):
            assert_eq(e_r_d_a, e_r_a)

        assert_eq(
            da.sqrt(sum(map(da.square, r_d_a))),
            np.sqrt(sum(map(np.square, r_a)))
        )
def _equal_var_ttest_denom(v1, n1, v2, n2):
    df = n1 + n2 - 2.0
    svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
    denom = da.sqrt(svar * (1.0 / n1 + 1.0 / n2))  # XXX: np -> da
    return df, denom
def test_arithmetic():
    x = np.arange(5).astype('f4') + 2
    y = np.arange(5).astype('i8') + 2
    z = np.arange(5).astype('i4') + 2
    a = da.from_array(x, chunks=(2,))
    b = da.from_array(y, chunks=(2,))
    c = da.from_array(z, chunks=(2,))

    assert eq(a + b, x + y)
    assert eq(a * b, x * y)
    assert eq(a - b, x - y)
    assert eq(a / b, x / y)
    assert eq(b & b, y & y)
    assert eq(b | b, y | y)
    assert eq(b ^ b, y ^ y)
    assert eq(a // b, x // y)
    assert eq(a ** b, x ** y)
    assert eq(a % b, x % y)
    assert eq(a > b, x > y)
    assert eq(a < b, x < y)
    assert eq(a >= b, x >= y)
    assert eq(a <= b, x <= y)
    assert eq(a == b, x == y)
    assert eq(a != b, x != y)

    assert eq(a + 2, x + 2)
    assert eq(a * 2, x * 2)
    assert eq(a - 2, x - 2)
    assert eq(a / 2, x / 2)
    assert eq(b & True, y & True)
    assert eq(b | True, y | True)
    assert eq(b ^ True, y ^ True)
    assert eq(a // 2, x // 2)
    assert eq(a ** 2, x ** 2)
    assert eq(a % 2, x % 2)
    assert eq(a > 2, x > 2)
    assert eq(a < 2, x < 2)
    assert eq(a >= 2, x >= 2)
    assert eq(a <= 2, x <= 2)
    assert eq(a == 2, x == 2)
    assert eq(a != 2, x != 2)

    assert eq(2 + b, 2 + y)
    assert eq(2 * b, 2 * y)
    assert eq(2 - b, 2 - y)
    assert eq(2 / b, 2 / y)
    assert eq(True & b, True & y)
    assert eq(True | b, True | y)
    assert eq(True ^ b, True ^ y)
    assert eq(2 // b, 2 // y)
    assert eq(2 ** b, 2 ** y)
    assert eq(2 % b, 2 % y)
    assert eq(2 > b, 2 > y)
    assert eq(2 < b, 2 < y)
    assert eq(2 >= b, 2 >= y)
    assert eq(2 <= b, 2 <= y)
    assert eq(2 == b, 2 == y)
    assert eq(2 != b, 2 != y)

    assert eq(-a, -x)
    assert eq(abs(a), abs(x))
    assert eq(~(a == b), ~(x == y))
    assert eq(~(a == b), ~(x == y))

    assert eq(da.logaddexp(a, b), np.logaddexp(x, y))
    assert eq(da.logaddexp2(a, b), np.logaddexp2(x, y))
    assert eq(da.exp(b), np.exp(y))
    assert eq(da.log(a), np.log(x))
    assert eq(da.log10(a), np.log10(x))
    assert eq(da.log1p(a), np.log1p(x))
    assert eq(da.expm1(b), np.expm1(y))
    assert eq(da.sqrt(a), np.sqrt(x))
    assert eq(da.square(a), np.square(x))

    assert eq(da.sin(a), np.sin(x))
    assert eq(da.cos(b), np.cos(y))
    assert eq(da.tan(a), np.tan(x))
    assert eq(da.arcsin(b / 10), np.arcsin(y / 10))
    assert eq(da.arccos(b / 10), np.arccos(y / 10))
    assert eq(da.arctan(b / 10), np.arctan(y / 10))
    assert eq(da.arctan2(b * 10, a), np.arctan2(y * 10, x))
    assert eq(da.hypot(b, a), np.hypot(y, x))
    assert eq(da.sinh(a), np.sinh(x))
    assert eq(da.cosh(b), np.cosh(y))
    assert eq(da.tanh(a), np.tanh(x))
    assert eq(da.arcsinh(b * 10), np.arcsinh(y * 10))
    assert eq(da.arccosh(b * 10), np.arccosh(y * 10))
    assert eq(da.arctanh(b / 10), np.arctanh(y / 10))
    assert eq(da.deg2rad(a), np.deg2rad(x))
    assert eq(da.rad2deg(a), np.rad2deg(x))

    assert eq(da.logical_and(a < 1, b < 4), np.logical_and(x < 1, y < 4))
    assert eq(da.logical_or(a < 1, b < 4), np.logical_or(x < 1, y < 4))
    assert eq(da.logical_xor(a < 1, b < 4), np.logical_xor(x < 1, y < 4))
    assert eq(da.logical_not(a < 1), np.logical_not(x < 1))
    assert eq(da.maximum(a, 5 - a), np.maximum(a, 5 - a))
    assert eq(da.minimum(a, 5 - a), np.minimum(a, 5 - a))
    assert eq(da.fmax(a, 5 - a), np.fmax(a, 5 - a))
    assert eq(da.fmin(a, 5 - a), np.fmin(a, 5 - a))

    assert eq(da.isreal(a + 1j * b), np.isreal(x + 1j * y))
    assert eq(da.iscomplex(a + 1j * b), np.iscomplex(x + 1j * y))
    assert eq(da.isfinite(a), np.isfinite(x))
    assert eq(da.isinf(a), np.isinf(x))
    assert eq(da.isnan(a), np.isnan(x))
    assert eq(da.signbit(a - 3), np.signbit(x - 3))
    assert eq(da.copysign(a - 3, b), np.copysign(x - 3, y))
    assert eq(da.nextafter(a - 3, b), np.nextafter(x - 3, y))
    assert eq(da.ldexp(c, c), np.ldexp(z, z))
    assert eq(da.fmod(a * 12, b), np.fmod(x * 12, y))
    assert eq(da.floor(a * 0.5), np.floor(x * 0.5))
    assert eq(da.ceil(a), np.ceil(x))
    assert eq(da.trunc(a / 2), np.trunc(x / 2))

    assert eq(da.degrees(b), np.degrees(y))
    assert eq(da.radians(a), np.radians(x))

    assert eq(da.rint(a + 0.3), np.rint(x + 0.3))
    assert eq(da.fix(a - 2.5), np.fix(x - 2.5))

    assert eq(da.angle(a + 1j), np.angle(x + 1j))
    assert eq(da.real(a + 1j), np.real(x + 1j))
    assert eq((a + 1j).real, np.real(x + 1j))
    assert eq(da.imag(a + 1j), np.imag(x + 1j))
    assert eq((a + 1j).imag, np.imag(x + 1j))
    assert eq(da.conj(a + 1j * b), np.conj(x + 1j * y))
    assert eq((a + 1j * b).conj(), (x + 1j * y).conj())

    assert eq(da.clip(b, 1, 4), np.clip(y, 1, 4))
    assert eq(da.fabs(b), np.fabs(y))
    assert eq(da.sign(b - 2), np.sign(y - 2))

    l1, l2 = da.frexp(a)
    r1, r2 = np.frexp(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    l1, l2 = da.modf(a)
    r1, r2 = np.modf(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    assert eq(da.around(a, -1), np.around(x, -1))
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm='svd',
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=False,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise
    algorithm : str
        One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
        lazy SVD decomposition from dask.
    output_dimension : int
        the number of significant components to keep. If None, keep all
        (only valid for SVD)
    get : dask scheduler
        the dask scheduler to use for computations;
        default `dask.threaded.get`
    num_chunks : int
        the number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least output_dimension signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    **kwargs
        passed to the partial_fit/fit functions.

    Notes
    -----
    Various algorithm parameters and their default values:
        ONMF:
            lambda1=1,
            kappa=1,
            robust=False,
            store_r=False,
            batch_size=None
        ORPCA:
            fast=True,
            lambda1=None,
            lambda2=None,
            method=None,
            learning_rate=None,
            init=None,
            training_samples=None,
            momentum=None
        PCA:
            batch_size=None,
            copy=True,
            white=False
    """
    if bounds:
        msg = (
            "The `bounds` keyword is deprecated and will be removed "
            "in v2.0. Since version > 1.3 this has no effect.")
        warnings.warn(msg, VisibleDeprecationWarning)
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if algorithm != "svd" and output_dimension is None:
        raise ValueError("With the %s the output_dimension "
                         "must be specified" % algorithm)
    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        reproject = True
    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)
    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    elif algorithm != "svd":
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            nm = da.logical_not(
                da.zeros(
                    self.axes_manager.navigation_shape[::-1],
                    chunks=nav_chunks)
                if navigation_mask is None else to_array(
                    navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(
                    self.axes_manager.signal_shape[::-1],
                    chunks=sig_chunks)
                if signal_mask is None else to_array(
                    signal_mask, chunks=sig_chunks))
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            bH, aG = da.compute(
                data.sum(axis=tuple(range(ndim))),
                data.sum(axis=tuple(range(ndim, ndim + sdim))))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = raG[(..., ) + (None, ) * rbH.ndim] * \
                rbH[(None, ) * raG.ndim + (...,)]
            coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "svd":
            reproject = False
            from dask.array.linalg import svd
            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                # Note: the original raised the non-exception `NotImplemented`;
                # `NotImplementedError` is what was intended.
                if navigation_mask or signal_mask:
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD.")
                U, S, V = svd(self.data)
                factors = V.T
                explained_variance = S ** 2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    # Note: the original had `is False` (a no-op comparison);
                    # an assignment is clearly what was meant.
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T
        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S ** 2 / len(factors)
        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)
            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []

                def post(a):
                    return obj.finish()[4]
            elif algorithm == 'ONMF':
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(lambda thing: method(thing),
                       self._block_iterator(
                           flat_signal=True,
                           get=get,
                           signal_mask=signal_mask,
                           navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(
                        _map, total=nblocks, desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "svd":  # Only needed for online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
    finally:
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "svd":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio

    # Rescale the results if the noise was normalized
    if normalize_poissonian_noise is True:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
def decomposition(self,
                  output_dimension,
                  normalize_poissonian_noise=False,
                  algorithm='PCA',
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=True,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    output_dimension : int
        the number of significant components to keep
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise
    algorithm : str
        One of ('PCA', 'ORPCA', 'ONMF'). By default 'PCA':
        IncrementalPCA from scikit-learn is run.
    get : dask scheduler
        the dask scheduler to use for computations;
        default `dask.threaded.get`
    num_chunks : int
        the number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least output_dimension signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    bounds : {tuple, bool}
        The (min, max) values of the data to normalize before learning.
        If tuple (min, max), those values will be used for normalization.
        If True, extremes will be looked up (expensive), default.
        If False, no normalization is done (learning may be very slow).
        If normalize_poissonian_noise is True, this cannot be True.
    **kwargs
        passed to the partial_fit/fit functions.

    Notes
    -----
    Various algorithm parameters and their default values:
        ONMF:
            lambda1=1,
            kappa=1,
            robust=False,
            store_r=False,
            batch_size=None
        ORPCA:
            fast=True,
            lambda1=None,
            lambda2=None,
            method=None,
            learning_rate=None,
            init=None,
            training_samples=None,
            momentum=None
        PCA:
            batch_size=None,
            copy=True,
            white=False
    """
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        reproject = True
    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)
    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    else:
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            if bounds is True:
                bounds = False
                # warnings.warn?
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            nm = da.logical_not(
                da.zeros(
                    self.axes_manager.navigation_shape[::-1],
                    chunks=nav_chunks)
                if navigation_mask is None else to_array(
                    navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(
                    self.axes_manager.signal_shape[::-1],
                    chunks=sig_chunks)
                if signal_mask is None else to_array(
                    signal_mask, chunks=sig_chunks))
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            bH, aG = da.compute(
                data.sum(axis=range(ndim)),
                data.sum(axis=range(ndim, ndim + sdim)))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = raG[(..., ) + (None, ) * rbH.ndim] * \
                rbH[(None, ) * raG.ndim + (...,)]
            coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # normalize the data for learning algs:
        if bounds:
            if bounds is True:
                _min, _max = da.compute(self.data.min(), self.data.max())
            else:
                _min, _max = bounds
            self.data = (self.data - _min) / (_max - _min)

        # LEARN
        this_data = []
        try:
            for chunk in progressbar(
                    self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask),
                    total=nblocks,
                    leave=True,
                    desc='Learn'):
                this_data.append(chunk)
                if len(this_data) == num_chunks:
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
                    this_data = []
            if len(this_data):
                thedata = np.concatenate(this_data, axis=0)
                method(thedata)
        except KeyboardInterrupt:
            pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T
        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S ** 2 / len(factors)
        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform
                post = lambda a: np.concatenate(a, axis=0)
            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []
                post = lambda a: obj.finish()[4]
            elif algorithm == 'ONMF':
                method = obj.project
                post = lambda a: np.concatenate(a, axis=1).T

            _map = map(lambda thing: method(thing),
                       self._block_iterator(
                           flat_signal=True,
                           get=get,
                           signal_mask=signal_mask,
                           navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(
                        _map, total=nblocks, desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        try:
            loadings = _reshuffle_mixed_blocks(
                loadings,
                ndim,
                (output_dimension,),
                nav_chunks).reshape((-1, output_dimension))
        except ValueError:
            # In case the projection step was not finished, it's left
            # as scrambled
            pass
    finally:
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio