def get_array_moments(
        array: da.core.Array,
        mean: bool = True,
        std: bool = True,
        std_method: str = 'binom',
        axis: int = 0
) -> Tuple[Optional[da.core.Array], Optional[da.core.Array]]:
    """ Computes the specified moments of an array

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that moments will be computed from
    mean : bool
        Flag whether to compute the mean of "array" along "axis"
    std : bool
        Flag whether to compute the std of "array" along "axis"
    std_method : str
        Method used to compute the standard deviation. Possible methods are:
            'norm' ---> normal-distribution standard deviation, see np.std
            'binom' ---> binomial standard deviation, sqrt(2*p*(1-p)),
                         where p = "mean"/2
    axis : int
        Axis to compute mean and std along.

    Returns
    -------
    array_mean : da.core.Array, optional
        None if "mean" is False, otherwise the array mean
    array_std : da.core.Array, optional
        None if "std" is False, otherwise the array std
    """
    array_mean = None
    array_std = None

    if mean:
        array_mean = da.nanmean(array, axis=axis)

    if std:
        if std_method == 'binom':
            u = array_mean if mean else da.nanmean(array, axis=axis)
            u = u / 2
            array_std = da.sqrt(2 * u * (1 - u))
        elif std_method == 'norm':
            array_std = da.nanstd(array, axis=axis)
        else:
            raise NotImplementedError(
                f'std_method, {std_method}, is not implemented')

    array_mean, array_std = persist(array_mean, array_std)

    return array_mean, array_std
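# A minimal standalone sketch of the 'binom' standard deviation above,
# assuming the input holds genotype dosages in {0, 1, 2}, so that p = mean/2
# is an allele frequency and sqrt(2*p*(1-p)) is the Hardy-Weinberg-style
# spread. The toy array here is hypothetical, not from the original source.
import numpy as np
import dask.array as da

genotypes = da.from_array(
    np.array([[0., 1., 2.],
              [2., 1., 0.],
              [1., np.nan, 2.]]),
    chunks=(2, 3))
p = da.nanmean(genotypes, axis=0) / 2      # per-column allele frequency
binom_std = da.sqrt(2 * p * (1 - p))       # binomial spread per column
print(binom_std.compute())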
def make_plot_CTTH(scores, optf, crs, dnt, var, cosfield):
    fig = plt.figure(figsize=(16, 12))
    for cnt, s in enumerate(scores.keys()):
        values = scores[s]
        masked_values = np.ma.array(values[0], mask=np.isnan(values[0]))
        cmap = plt.get_cmap(values[3])
        cmap.set_bad('grey', 1.)
        ax = fig.add_subplot(4, 3, cnt + 1, projection=crs)  # ccrs.Robinson()
        ims = ax.imshow(masked_values, transform=crs,
                        extent=crs.bounds,
                        vmin=values[1], vmax=values[2],
                        cmap=cmap,
                        origin='upper',
                        interpolation='none')
        ax.coastlines(color='black')
        mean = '{:.2f}'.format(da.nanmean(values[0]).compute())
        ax.set_title(var + ' ' + s + ' ' + dnt + ' {}'.format(mean))
        plt.colorbar(ims)
    plt.tight_layout()
    plt.savefig(optf)
    plt.close()
    print('SAVED ', os.path.basename(optf))
def power_spectrum(filter, time):
    """Compute the mean power spectrum over all particles at a given time.

    This routine gives the power spectrum (power spectral density) for
    each of the sampled variables within `filter`, as a mean over all
    particles. It will run a single advection step at the specified time.
    The resulting dictionary contains a `freq` item, with the FFT
    frequency bins for the output spectra.

    Args:
        filter (filtering.LagrangeFilter): The pre-configured filter
            object to use for running the analysis.
        time (float): The time at which to perform the analysis.

    Returns:
        Dict[str, numpy.ndarray]: A dictionary of power spectra for each
            of the sampled variables on the filter.
    """
    psds = {}
    advection_data = filter.advection_step(time, output_time=True)
    time_series = advection_data.pop("time")

    for v, a in advection_data.items():
        spectra = da.fft.fft(a[1].rechunk((-1, "auto")), axis=0)
        mean_spectrum = da.nanmean(da.absolute(spectra) ** 2, axis=1)
        psds[v] = mean_spectrum.compute()

    psds["freq"] = 2 * np.pi * np.fft.fftfreq(time_series.size, filter.output_dt)

    return psds
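# The core of the spectrum calculation above as a standalone sketch with
# made-up data: a (time, particle) dask array is FFT'd along time (after
# rechunking time into a single chunk, which dask's FFT requires), and |F|^2
# is averaged over particles with da.nanmean.
import numpy as np
import dask.array as da

series = da.random.random((256, 100), chunks=(64, 50))  # (time, particle)
spectra = da.fft.fft(series.rechunk((-1, "auto")), axis=0)
mean_psd = da.nanmean(da.absolute(spectra) ** 2, axis=1).compute()
freq = 2 * np.pi * np.fft.fftfreq(256, d=1.0)  # angular frequency bins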
def _hotspots_dask_numpy(raster, kernel):
    # apply kernel to raster values
    mean_array = convolve_2d(raster.data, kernel / kernel.sum())

    # calculate z-scores
    global_mean = da.nanmean(raster.data)
    global_std = da.nanstd(raster.data)

    # commented out to avoid an early compute just to check
    # whether global_std is zero
    # if global_std == 0:
    #     raise ZeroDivisionError(
    #         "Standard deviation of the input raster values is 0."
    #     )

    z_array = (mean_array - global_mean) / global_std

    _func = partial(_calc_hotspots_numpy)
    pad_h = kernel.shape[0] // 2
    pad_w = kernel.shape[1] // 2

    out = z_array.map_overlap(_func,
                              depth=(pad_h, pad_w),
                              boundary=np.nan,
                              meta=np.array(()))
    return out
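# The z-score step above in isolation, on made-up data. The global nan-aware
# mean and std stay lazy, so the z-array is a single graph that only computes
# when asked.
import dask.array as da

raster = da.random.random((200, 200), chunks=(100, 100))
z = (raster - da.nanmean(raster)) / da.nanstd(raster)
print(float(da.nanmean(z).compute()))  # ~0 by construction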
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "StandardScaler":
    self._reset()
    attributes = OrderedDict()
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values

    if self.with_mean:
        mean_ = nanmean(X, 0)
        attributes["mean_"] = mean_
    if self.with_std:
        var_ = nanvar(X, 0)
        scale_ = var_.copy()
        scale_[scale_ == 0] = 1
        scale_ = da.sqrt(scale_)
        attributes["scale_"] = scale_
        attributes["var_"] = var_

    attributes["n_samples_seen_"] = np.nan
    values = compute(*attributes.values())
    for k, v in zip(attributes, values):
        setattr(self, k, v)
    self.n_features_in_ = X.shape[1]
    return self
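# The nan-aware scaling this fit() sets up, sketched without the estimator
# plumbing and on hypothetical data. The da.where guard mirrors the
# scale_[scale_ == 0] = 1 line above, which keeps constant columns from
# dividing by zero.
import dask.array as da

X = da.random.random((1000, 3), chunks=(250, 3))
var_ = da.nanvar(X, axis=0)
scale_ = da.where(var_ == 0, 1, da.sqrt(var_))
X_scaled = (X - da.nanmean(X, axis=0)) / scale_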
def nanmean(a, axis=None, dtype=None, out=None):
    if a.dtype.kind == "O":
        return _nanmean_ddof_object(0, a, axis=axis, dtype=dtype)

    if isinstance(a, dask_array_type):
        return dask_array.nanmean(a, axis=axis, dtype=dtype)

    return np.nanmean(a, axis=axis, dtype=dtype)
def semivar(*args, **kwargs):
    """ semivariance """
    args = uf.broadcast(*args)
    X = da.stack([x.task.flatten() for x in args])
    out = 0.5 * da.nanmean((X[0] - X[1])**2)
    return out
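# The semivariance formula gamma = 0.5 * E[(Z1 - Z2)^2] applied to two flat
# dask arrays directly, without the uf.broadcast wrapper used above; the toy
# inputs are hypothetical.
import numpy as np
import dask.array as da

z1 = da.from_array(np.array([1.0, 2.0, np.nan, 4.0]), chunks=2)
z2 = da.from_array(np.array([1.5, 2.5, 3.0, np.nan]), chunks=2)
gamma = 0.5 * da.nanmean((z1 - z2) ** 2)
print(float(gamma.compute()))  # pairs containing a NaN are ignored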
def nanmean(a, axis=None, dtype=None, out=None):
    if a.dtype.kind == 'O':
        return _nanmean_ddof_object(0, a, axis=axis, dtype=dtype)

    if isinstance(a, dask_array_type):
        return dask_array.nanmean(a, axis=axis, dtype=dtype)

    return np.nanmean(a, axis=axis, dtype=dtype)
def stripsel_ana(img_in):
    """
    Analyze data of the stripsel JF detector
    (incomplete: still needs to integrate etc.)
    """
    if len(img_in.shape) == 3:
        if isinstance(img_in, da.Array):
            img_in = da.nanmean(img_in, axis=0)
            img_in = img_in.compute()
        else:
            img_in = np.nanmean(img_in, axis=0)
    img_corr = correct_stripeJF(img_in)
    return {'img_corr': img_corr, 'img_init': img_in}
def _fit_array(self, X):
    if self.strategy not in {"mean", "constant"}:
        msg = "Can only use strategy='mean' or 'constant' with Dask Array."
        raise ValueError(msg)

    if self.strategy == "mean":
        statistics = da.nanmean(X, axis=0).compute()
    else:
        statistics = np.full(X.shape[1], self.fill_value, dtype=X.dtype)

    self.statistics_, = da.compute(statistics)
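# Applying "mean" statistics like those computed above to actually impute
# missing entries, using da.where; a sketch on hypothetical data rather than
# the estimator's own transform step.
import numpy as np
import dask.array as da

X = da.from_array(np.array([[1., np.nan],
                            [3., 4.],
                            [np.nan, 6.]]), chunks=(2, 2))
col_means = da.nanmean(X, axis=0)
X_imputed = da.where(da.isnan(X), col_means, X)  # broadcast means per column
print(X_imputed.compute())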
def _calculate_summary_statistics(self):
    data = self._lazy_data()
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25, ]),
        da.percentile(_raveled, [50, ]),
        da.percentile(_raveled, [75, ]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max
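# Passing several lazy reductions to a single da.compute call, as above, lets
# dask share the underlying graph and read the data once rather than once per
# statistic; a sketch on made-up data.
import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 250))
mean, std, lo, hi = da.compute(
    da.nanmean(x), da.nanstd(x), da.nanmin(x), da.nanmax(x))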
def nanmean(a, axis=None, dtype=None, out=None):
    if a.dtype.kind == "O":
        return _nanmean_ddof_object(0, a, axis=axis, dtype=dtype)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", r"Mean of empty slice", category=RuntimeWarning
        )

        if isinstance(a, dask_array_type):
            return dask_array.nanmean(a, axis=axis, dtype=dtype)

        return np.nanmean(a, axis=axis, dtype=dtype)
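# Why the warning filter above exists: an all-NaN slice makes np.nanmean emit
# a "Mean of empty slice" RuntimeWarning and return nan, and the
# catch_warnings block silences exactly that message.
import warnings
import numpy as np

with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore", r"Mean of empty slice", category=RuntimeWarning)
    print(np.nanmean(np.array([np.nan, np.nan])))  # nan, no warning printed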
def stadistics(self):
    headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%",
               "max", "nonzero", "nonan", "unique", "dtype"]
    self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)
    table = []
    for group, (dtype, _) in self.dtypes.fields.items():
        values = dict()
        values["dtype"] = dtype
        values["group"] = group
        darray = self.data[group].da
        if dtype == np.dtype(float) or dtype == np.dtype(int):
            da_mean = da.around(darray.mean(), decimals=3)
            da_std = da.around(darray.std(), decimals=3)
            da_min = da.around(darray.min(), decimals=3)
            da_max = da.around(darray.max(), decimals=3)
            result = dask.compute([da_mean, da_std, da_min, da_max])[0]
            # fall back to the nan-aware reductions when the plain ones hit NaNs
            values["mean"] = result[0] if not np.isnan(result[0]) else da.around(da.nanmean(darray), decimals=3).compute()
            values["std dev"] = result[1] if not np.isnan(result[0]) else da.around(da.nanstd(darray), decimals=3).compute()
            values["min"] = result[2] if not np.isnan(result[0]) else da.around(da.nanmin(darray), decimals=3).compute()
            values["max"] = result[3] if not np.isnan(result[0]) else da.around(da.nanmax(darray), decimals=3).compute()
            if len(self.shape[group]) == 1:
                da_percentile = da.around(da.percentile(darray, [25, 50, 75]), decimals=3)
                result = da_percentile.compute()
                values["25%"] = result[0]
                values["50%"] = result[1]
                values["75%"] = result[2]
            else:
                values["25%"] = "-"
                values["50%"] = "-"
                values["75%"] = "-"
            values["nonzero"] = da.count_nonzero(darray).compute()
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = "-"
        else:
            values["mean"] = "-"
            values["std dev"] = "-"
            values["min"] = "-"
            values["max"] = "-"
            values["25%"] = "-"
            values["50%"] = "-"
            values["75%"] = "-"
            values["nonzero"] = "-"
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            vunique = darray.to_dask_dataframe().fillna('').nunique().compute()
            values["unique"] = vunique

        row = [values[column] for column in headers]
        table.append(row)

    print("# rows {}".format(self.shape[0]))
    return tabulate(table, headers)
def _impute_dask_array(x):
    import dask.array as da

    m = da.nanmean(x, axis=0).compute()
    start = 0
    arrs = []
    for i in range(len(x.chunks[1])):
        end = start + x.chunks[1][i]
        impute = _get_imputer(m[start:end])
        arrs.append(x[:, start:end].map_blocks(impute, dtype=float))
        start = end
    return da.concatenate(arrs, axis=1)
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(np.nanprod(x), da.nanprod(d))
def combine_extension_to_new_hdu(
    inputs_collection: PipelineCollection,
    operation: CombineOperation,
    ext: typing.Union[str, int],
    plane_shape: tuple[int, int],
):
    image_cube = iofits.hdulists_to_dask_cube(
        inputs_collection.items, plane_shape, ext=ext)
    if operation is CombineOperation.MEAN:
        result = da.nanmean(image_cube, axis=0)
    if not isinstance(ext, int):
        hdr = {'EXTNAME': ext}
    else:
        hdr = None
    return dask.delayed(iofits.DaskHDU)(result, header=hdr, kind="image")
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, blockshape=(2, 2))

    assert eq(np.nansum(x), da.nansum(d))
    assert eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert eq(np.nanvar(x), da.nanvar(d))
    assert eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))

    with ignoring(AttributeError):
        assert eq(np.nanprod(x), da.nanprod(d))
def _calculate_summary_statistics(self, rechunk=True):
    if rechunk is True:
        # Use dask's auto rechunk instead of HyperSpy's own, which should be
        # better for these operations
        rechunk = "dask_auto"
    data = self._lazy_data(rechunk=rechunk)
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25, ]),
        da.percentile(_raveled, [50, ]),
        da.percentile(_raveled, [75, ]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max
def test_nan():
    x = np.array([[1, np.nan, 3, 4],
                  [5, 6, 7, np.nan],
                  [9, 10, 11, 12]])
    d = da.from_array(x, chunks=(2, 2))

    assert_eq(np.nansum(x), da.nansum(d))
    assert_eq(np.nansum(x, axis=0), da.nansum(d, axis=0))
    assert_eq(np.nanmean(x, axis=1), da.nanmean(d, axis=1))
    assert_eq(np.nanmin(x, axis=1), da.nanmin(d, axis=1))
    assert_eq(np.nanmax(x, axis=(0, 1)), da.nanmax(d, axis=(0, 1)))
    assert_eq(np.nanvar(x), da.nanvar(d))
    assert_eq(np.nanstd(x, axis=0), da.nanstd(d, axis=0))
    assert_eq(np.nanargmin(x, axis=0), da.nanargmin(d, axis=0))
    assert_eq(np.nanargmax(x, axis=0), da.nanargmax(d, axis=0))
    assert_eq(nanprod(x), da.nanprod(d))
def cov(*args, axis=None, **kwargs):
    """ covariance """
    if axis is None:
        args = [x.flatten() for x in args]
        axis = 0
    X = da.stack(args, axis=-1).rechunk(com.CHUNKSIZE)
    cond = da.any(da.isnan(X), axis=-1)
    X = da.where(cond[..., None], np.nan, X)
    X -= da.nanmean(X, axis=axis, keepdims=True)
    X = da.where(da.isnan(X), 0, X)
    return X.swapaxes(axis, -1) @ X.swapaxes(axis, -2).conj() / (X.shape[axis] - 1)
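# Checking the demean-and-matmul covariance idea above against np.cov on
# complete data, where the NaN-handling branches are no-ops; the data and
# chunking here are made up.
import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 3))
X = da.from_array(x, chunks=(50, 3))
Xc = X - da.nanmean(X, axis=0, keepdims=True)
C = (Xc.T @ Xc / (X.shape[0] - 1)).compute()
assert np.allclose(C, np.cov(x, rowvar=False))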
def fit(self, X, y=None):
    self._reset()
    attributes = OrderedDict()
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values

    if self.with_mean:
        mean_ = nanmean(X, 0)
        attributes["mean_"] = mean_
    if self.with_std:
        var_ = nanvar(X, 0)
        scale_ = var_.copy()
        scale_[scale_ == 0] = 1
        scale_ = da.sqrt(scale_)
        attributes["scale_"] = scale_
        attributes["var_"] = var_

    attributes["n_samples_seen_"] = np.nan
    values = compute(*attributes.values())
    for k, v in zip(attributes, values):
        setattr(self, k, v)
    return self
def azInt(img, poni, rot, wvl, plot=1, clim=(0, 300), corrImg=None):
    npt_az = 360
    npt_rad = 1000
    if len(img.shape) == 3:
        if isinstance(img, da.Array):
            img = da.nanmean(img, axis=0)
            img = img.compute()
        else:
            img = np.nanmean(img, axis=0)
    if corrImg is not None:
        img = corrImg(img)
    q2d, chi, I2d = azInt2d(img, npt_rad, npt_az, poni, rot, wvl)
    q, I = azInt1d(img, npt_rad, poni, rot, wvl)
    if plot:
        plotAz(img, q, I, chi, I2d, clim=clim)
    if expecting() == 1:
        return dict(q=q, I=I, chi=chi, I2d=I2d)
    else:
        return q, I, chi, I2d
def fit(
    self,
    X: Union[ArrayLike, DataFrameType],
    y: Optional[Union[ArrayLike, SeriesType]] = None,
) -> "StandardScaler":
    self._reset()
    X = self._validate_data(
        X,
        estimator=self,
        accept_dask_array=True,
        accept_dask_dataframe=True,
        accept_unknown_chunks=True,
        preserve_pandas_dataframe=True,
    )
    attributes = OrderedDict()
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values

    if self.with_mean:
        mean_ = nanmean(X, 0)
        attributes["mean_"] = mean_
    if self.with_std:
        var_ = nanvar(X, 0)
        scale_ = var_.copy()
        scale_[scale_ == 0] = 1
        scale_ = da.sqrt(scale_)
        attributes["scale_"] = scale_
        attributes["var_"] = var_

    attributes["n_samples_seen_"] = X.shape[0]
    values = compute(*attributes.values())
    for k, v in zip(attributes, values):
        setattr(self, k, v)
    self.n_features_in_: int = X.shape[1]
    return self
def composite(src_fps, save_loc, save_nam, method="mean", dt="default"): """Creates a composite from multiple rasters. Individual rasters have to be of the same size (extents, pixel size, data type). Multiple compositing are available, including mean, min, max, median etc. Parameters ---------- src_fps : list(str) List of paths to source files. save_loc : str Path to save folder. save_nam : str Name of the file to be saved. method : str Compositing method, either "mean", "min", "max" or "median". dt : str(optional) Orbit direction, either "DES" or "ASC" (required for generating previews). Returns ------- out_pth : str Absolute path to the product. """ # Make sure save location exists os.makedirs(save_loc, exist_ok=True) # Save TIFF metadata for output with rasterio.open(src_fps[0]) as rst: out_meta = rst.profile.copy() # Lazily load files into DASK ARRAYS print(f"#\n# Preparing Dask arrays...") chunks = {'band': 1, 'x': 1024, 'y': 1024} lazy_arrays = [xr.open_rasterio(fp, chunks=chunks) for fp in src_fps] stacked = da.concatenate(lazy_arrays, axis=0) stacked[stacked == 0] = np.nan # Calculate composite for selected method with dask print(f"# Compositing ({method}) using Dask...") if method == 'mean': comp_out = da.nanmean(stacked, axis=0, keepdims=True).compute() elif method == 'median': comp_out = da.nanmedian(stacked, axis=0, keepdims=True).compute() elif method == 'max': comp_out = da.nanmax(stacked, axis=0, keepdims=True).compute() elif method == 'min': comp_out = da.nanmin(stacked, axis=0, keepdims=True).compute() else: raise Exception('{} is not a valid compositing ' 'method!'.format(method)) # ---------------------------------------------------------------------------- # SAVE RESULTS TO FILES # ---------------------------------------------------------------------------- # Save composite to GeoTIFF tif_time = time.time() print("#\n# Saving composite image to TIFF...") out_nam = save_nam + ".tif" out_pth = os.path.join(save_loc, out_nam) out_meta.update(bigtiff="yes", compress='lzw') with rasterio.open(out_pth, "w", **out_meta) as dest: dest.write(comp_out) tif_time = time.time() - tif_time print(f"# Time (TIFF): {tif_time:.2f} seconds") # # Save preview file as JPEG # jpg_time = time.time() # print("#\n# Saving preview image to JPEG...") # # Pickle array for passing it to plot_preview() # spt = os.path.join(save_loc, "temp_array.p") # with open(spt, "wb") as pf: # pickle.dump(comp_out, pf) # comp_out = None # try: # plot_preview(spt, dt, out_pth[:-3] + "jpg") # except MemoryError as me: # print("# Memory error occurred, could not save to JPEG") # print(me) # finally: # # delete pickle # os.remove(spt) # jpg_time = time.time() - jpg_time # print(f"# Time (JPEG): {jpg_time:.2f} seconds") return out_pth
print(f"{n} clusters, score = {scores[-1]}") plt.plot(klabels, scores) plt.ylabel('Silhoutte score') plt.xlabel('$n_{clusters}$') print('Performing clustering via KMeans', rIVs[:, :kmeans_d].shape) kmeans = KMeans(n_clusters=8, random_state=10, n_jobs=-1).fit(rIVs[:, :kmeans_d]) clustering = kmeans.predict(rIVs[:, :kmeans_d]) # For visualization of curves corresponding to clusters, we grab the full spectra, filter out points where the spectrum was not measured due to drift and calculate the mean per image per spectrum. validIVs = da.where(fullIVs == 0, np.nan, fullIVs).reshape( (fullIVs.shape[0], -1)) meanIVs = [ da.nanmean(validIVs[:, clustering == index], axis=1) for index in range(kmeans.n_clusters) ] # + tstart = time.time() print('plotting clustering data') fig2, axs = plt.subplots(2, 4, figsize=[11, 5], dpi=600) #In case of 1 out of 1 columns figure axs = np.transpose(axs) coarse_2d = 3 * coarsen color = rIVs[::coarse_2d, :3] center_colors = kmeans.cluster_centers_[:, :3] - color.min(axis=0) color = color - color.min(axis=0, keepdims=True)
    get_tile('http://localhost/tiles/{x}/{y}', ['x', 'y', 'z'], [0, 1, 2])

    will fetch the data at the URL:

    http://localhost/tiles/0/1

    This assumes that the data is in CoverageJSON format, and does the work
    of fetching the data, parsing it, and extracting the actual data as a
    numpy array.
    """
    for axis, tile_index in zip(axis_names, tile_indices):
        url_template = url_template.replace('{' + axis + '}', str(tile_index))
    # Debug line: uncomment to see which tiles are fetched.
    # Note that the printed output may get interleaved due to multithreading.
    # print('fetching tile from', url_template)
    tile_data = json.loads(get_data(url_template))
    tile_values = np.array(tile_data['values'],
                           dtype=float).reshape(tile_data['shape'])
    return tile_values


if __name__ == '__main__':
    # Usage example.
    arrs = get_dask_arrays('http://godiva.rdg.ac.uk/coverage/sst-tiled.json')
    print("Created dask array")
    sst = arrs['analysed_sst-yx_tiling']
    print('Shape:', sst.shape)
    print("Got array, calculating means:")
    print('Northern Eighth', da.nanmean(sst[0, :450, :]).compute())
    print('Equatorial Quarter', da.nanmean(sst[0, 1350:2250, :]).compute())
    print('Southern Eighth', da.nanmean(sst[0, 3150:, :]).compute())
    # Note that even though we define c100, each tile is still fetched for
    # each calculation. That's because we've used a naive fetch method, with
    # no caching.
    c100 = sst[0, 1700:1900, 3500:3700]
    print('Central 100 points', da.nanmean(c100).compute())
    print('Central 100 points Min/Max',
          da.nanmin(c100).compute(), da.nanmax(c100).compute())
def _stage_2( YP: Array, X: Array, Y: Array, alphas: Optional[NDArray] = None, normalize: bool = True, _glow_adj_alpha: bool = False, _glow_adj_scaling: bool = False, ) -> Tuple[Array, Array]: """Stage 2 - WGR Meta Regression This stage will train separate ridge regression models for each outcome using the predictions from stage 1 for that same outcome as features. These predictions are then evaluated based on R2 score to determine an optimal "meta" estimator (see `_stage_1` for the "base" estimator description). Results then include only predictions and coefficients from this optimal model. For more details, see the level 1 regression model described in step 1 of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2). """ assert YP.ndim == 4 assert X.ndim == 2 assert Y.ndim == 2 # Check that chunking across samples is the same for all arrays assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0] assert YP.chunks[2] == X.chunks[0] == Y.chunks[0] # Assert single chunks for covariates and outcomes assert X.numblocks[1] == Y.numblocks[1] == 1 # Extract shape statistics n_variant_block, n_alpha_1 = YP.shape[:2] n_sample_block = Y.numblocks[0] n_sample, n_outcome = Y.shape n_covar = X.shape[1] n_indvar = n_covar + n_variant_block * n_alpha_1 sample_chunks = Y.chunks[0] if normalize: assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1) assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome) # See: https://github.com/projectglow/glow/issues/260 if _glow_adj_scaling: YP = da.map_blocks( lambda x: (x - x.mean(axis=2, keepdims=True)) / x.std(axis=2, keepdims=True), YP, ) else: YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True) # Tranpose for refit on level 1 predictions YP = YP.transpose((3, 2, 0, 1)) assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1) if alphas is None: # See: https://github.com/projectglow/glow/issues/255 if _glow_adj_alpha: alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome) else: alphas = get_alphas(n_variant_block * n_alpha_1) n_alpha_2 = alphas.size YR = [] BR = [] for i in range(n_outcome): # Slice and reshape to new 2D covariate matrix; # The order of raveling in trailing dimensions is important # and later reshapes will assume variants, alphas order XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1)) # Prepend covariates and chunk along first dim only XPB = da.concatenate((X, XPB), axis=1) XPB = XPB.rechunk(chunks=(None, -1)) assert_array_shape(XPB, n_sample, n_indvar) assert XPB.numblocks == (n_sample_block, 1) # Extract outcome vector YB = Y[:, [i]] assert XPB.ndim == YB.ndim == 2 # Fit and predict folds for each parameter BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:] assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1) assert_array_shape(YPB, n_alpha_2, n_sample, 1) BR.append(BB) YR.append(YPB) # Concatenate predictions along outcome dimension YR = da.concatenate(YR, axis=2) assert_block_shape(YR, 1, n_sample_block, n_outcome) assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1) assert_array_shape(YR, n_alpha_2, n_sample, n_outcome) # Move samples to last dim so all others are batch # dims for R2 calculations YR = da.transpose(YR, (0, 2, 1)) assert_array_shape(YR, n_alpha_2, n_outcome, n_sample) YR = YR.rechunk((-1, -1, None)) assert_block_shape(YR, 1, 1, n_sample_block) assert YR.shape[1:] == Y.T.shape # Concatenate betas along outcome dimension BR = da.concatenate(BR, axis=2) assert_block_shape(BR, 1, n_sample_block, 
n_outcome) assert_chunk_shape(BR, n_alpha_2, n_indvar, 1) assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome) # Compute R2 scores within each sample block for each outcome + alpha R2 = da.stack( [ r2_score(YR.blocks[..., i], Y.T.blocks[..., i]) # Avoid warnings on R2 calculations for blocks with single rows if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan) for i in range(n_sample_block) ] ) assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome) # Coerce to finite or nan before nan-aware mean R2 = da.where(da.isfinite(R2), R2, np.nan) # Find highest mean alpha score for each outcome across blocks R2M = da.nanmean(R2, axis=0) assert_array_shape(R2M, n_alpha_2, n_outcome) # Identify index for the alpha value with the highest mean score R2I = da.argmax(R2M, axis=0) assert_array_shape(R2I, n_outcome) # Choose the predictions corresponding to the model with best score YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1) YRM = YRM.rechunk((None, -1)) assert_block_shape(YRM, n_sample_block, 1) assert_chunk_shape(YRM, sample_chunks[0], n_outcome) assert_array_shape(YRM, n_sample, n_outcome) # Choose the betas corresponding to the model with the best score BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1) BRM = BRM.rechunk((None, -1)) assert_block_shape(BRM, n_sample_block, 1) assert_chunk_shape(BRM, n_indvar, n_outcome) assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome) return BRM, YRM
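# The model-selection step near the end of _stage_2 in isolation, on made-up
# scores: R2 values are averaged over sample blocks with da.nanmean (NaN
# marks blocks that could not be scored), then argmax picks the best alpha
# index per outcome.
import dask.array as da

R2 = da.random.random((4, 5, 3), chunks=(4, 5, 3))  # (block, alpha, outcome)
R2M = da.nanmean(R2, axis=0)                        # mean score per alpha/outcome
best_alpha_idx = da.argmax(R2M, axis=0).compute()   # one winner per outcome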
indexed = da.stack(
    [data[qmask][nn[:, index]] for index in range(0, nn.shape[1])],
    axis=1).rechunk({0: 1, 1: -1})

regions = da.map_blocks(find_overlap_regions,
                        indexed,
                        diffvecs.rechunk({0: 1})[..., np.newaxis],
                        mask=weight_mask,
                        dtype=np.float64,
                        chunks=(1, 4, 2, 2, fftsize, fftsize),
                        new_axis=(-1, -2))
region_intensities = regions[:, :, :, 0]
region_weights = regions[:, :, :, 1]
region_means = da.nanmean(region_intensities * region_weights,
                          axis=(-1, -2)) / da.nanmean(region_weights,
                                                      axis=(-1, -2))
region_means = region_means.compute()
region_ratios = region_means[..., 0] / region_means[..., 1]

Iopt_weight = np.where(w_calc[qmask, :4] > w_min - 0.001,
                       w_calc[qmask, :4], 0)**4
res_I = minimize(error_func,
                 np.zeros((qmask).sum()),
                 args=(nn[:, 1:], Iopt_weight, np.log(region_ratios)))
rel_intens[qmask] = np.exp(res_I.x)
rel = np.exp(res_I.x)

im = ax.scatter(*pc[:, qmask], c=rel, zorder=5)
ax.set_aspect('equal')
plt.colorbar(im, ax=axs)
print(res_I.message)
# -
def composite(src_fps, save_loc, save_nam, method="median", comp_mask="all_bad", bbox=None): # Prepare save location save_dir = os.path.join(save_loc, save_nam) if not os.path.exists(save_dir): os.mkdir(save_dir) # Get extents main_extents = output_image_extent(src_fps, bbox) # Obtain propertis of output array (same for all bands/images) out_extents = main_extents['bounds'] out_w = main_extents['width'] out_h = main_extents['height'] nr_bands = main_extents['bandsCount'] # Initiate arrays for storing noumber of available & good observations nobs = np.zeros((out_h, out_w), dtype=np.int8) nok = nobs.copy() # Create temp dir if it doesn't exist sav_dir = '.\\tmp' if not os.path.exists(sav_dir): os.mkdir(sav_dir) # MAIN LOOP FOR COMPOSITING tTim_A = time.time() tmp_sav_pth = [] for band in range(nr_bands): print("#\n# Creating composite for Band {}".format(band+1)) comp_stack = [] # Loop all images for i, fp in enumerate(src_fps): str_time = time.time() # Open data set src = rasterio.open(fp) # Save copy of profile for writing tiff at the end if band == 0 and i == 0: out_meta = src.profile.copy() print("# Processing Image {}.".format(i+1)) # Skip Reading the image if bbox is out of bounds xL, yD, xR, yU = [xy for xy in src.bounds] xL_out, yD_out, xR_out, yU_out = out_extents chk_bbox = (xL > xR_out or yD > yU_out or xR < xL_out or yU < yD_out) if chk_bbox: print('# Image {} not included (out of bounds).'.format(i)) break # Calculate offset for reading and slicing win, sl_x, sl_y = image_offset(out_extents, src) # ------------------------------ # Read image and store to pickle # ------------------------------ # Set offset Window for reading of TIF subset offset = win # Initiate array for output comp_band = np.full((out_h, out_w), np.nan, dtype=np.float32) # Read image and save to pickle print("# Reading the image.") if band == 0: tmp_read = src.read(window=offset) for nc in range(1, nr_bands): img_nam = ('img' + str(i+1).zfill(2) + "_b" + str(nc+1).zfill(2) + '.p') img_pth = os.path.join(sav_dir, img_nam) pickle.dump(tmp_read[nc], open(img_pth, "wb")) tmp_read = tmp_read[0] else: img_nam = ('img' + str(i+1).zfill(2) + "_b" + str(band+1).zfill(2) + '.p') img_pth = os.path.join(sav_dir, img_nam) tmp_read = pickle.load(open(img_pth, "rb")) # Read the image into the array comp_band[sl_y[0]:sl_y[1], sl_x[0]:sl_x[1]] = tmp_read tmp_read = None src.close() # ------------------------------ # determine bad pixels from mask # ------------------------------ print("# Determining bad pixels.") if band == 0: # Get index of mask idx_bad = get_mask_idx(fp, offset, comp_mask, dilate=-1) # Get index of background idx_bck = get_mask_idx(fp, offset, "background") # Update nok and nobs nobs[sl_y[0]:sl_y[1], sl_x[0]:sl_x[1]] += 1 nok[sl_y[0]:sl_y[1], sl_x[0]:sl_x[1]] += 1 nok[idx_bad[0][0]+sl_y[0], idx_bad[0][1]+sl_x[0]] += -1 nobs[idx_bck[0][0]+sl_y[0], idx_bck[0][1]+sl_x[0]] += -1 # Save index to pickle for later use idx_nam = 'idxBad_' + str(i+1).zfill(2) + '.p' idx_pth = os.path.join(sav_dir, idx_nam) pickle.dump(idx_bad, open(idx_pth, "wb")) idx_bck = None else: # Read from Pickle idx_nam = 'idxBad_' + str(i+1).zfill(2) + '.p' idx_pth = os.path.join(sav_dir, idx_nam) idx_bad = pickle.load(open(idx_pth, "rb")) # Apply mask to image if idx_bad[1] > 0: comp_band[idx_bad[0][0]+sl_y[0], idx_bad[0][1]+sl_x[0]] = np.nan idx_bad = None # Stack comp_band array into Dask Array comp_stack.append(da.from_array(comp_band, chunks=(1024, 1024))) # Close the array to save memory comp_band = None end_time = time.time() print('# 
--- Time: %s seconds ---' % (end_time-str_time)) # Stack all images into 1 array stacked = da.stack(comp_stack, axis=0) # Calculate composite for selected method with dask print("# Compositing Band {}".format(band+1)) str_time = time.time() if method == 'mean': comp_out = da.nanmean(stacked, axis=0, keepdims=True).compute() elif method == 'median': comp_out = da.nanmedian(stacked, axis=0, keepdims=True).compute() elif method == 'max': comp_out = da.nanmax(stacked, axis=0, keepdims=True).compute() elif method == 'min': comp_out = da.nanmin(stacked, axis=0, keepdims=True).compute() else: raise Exception('{} is not a valid compositing ' 'method!'.format(method)) end_time = time.time() print('# --- Time: %s seconds ---' % (end_time-str_time)) # After one band is resolved, save to temp file and release memory by # deleting the array if nr_bands > 1: print('# Saving temporary composite file for this band.') # Create file name and save using pickle sav_fil = 'b_' + str(band+1).zfill(2) + '.p' sav_pth = os.path.join(sav_dir, sav_fil) pickle.dump(comp_out, open(sav_pth, "wb")) # Add to savePth list with filenames tmp_sav_pth.append(sav_pth) # Clean up workspace comp_out = None tTim_B = time.time() print('--- Total time: %s seconds --- \n' % (tTim_B - tTim_A)) # ---------------------------------------------------------------------------- # OUT OF THE COMPOSITE LOOP RESTORE SAVED FILES AND BUIL TIF # ---------------------------------------------------------------------------- if nr_bands > 1: print("# Restoring saved bands.") str_time = time.time() # Initiate output array comp_out = np.full((nr_bands, out_h, out_w), np.nan, dtype=np.float32) for bnd, pth in enumerate(tmp_sav_pth): comp_out[bnd, :, :] = pickle.load(open(pth, "rb")) # Remove temporary folder rmtree(sav_dir, ignore_errors=True) end_time = time.time() print('--- Time: %s seconds ---' % (end_time-str_time)) # ---------------------------------------------------------------------------- # SAVE RESULTS TO TIF # ---------------------------------------------------------------------------- print("# Saving composite image to TIFF.") str_time = time.time() # Save composite out_nam = save_nam + "_composite.tif" out_pth = os.path.join(save_dir, out_nam) out_px = out_meta["transform"][0] out_py = out_meta["transform"][4] out_trans = Affine(out_px, 0.0, xL_out, 0.0, out_py, yU_out) out_meta.update( height=comp_out.shape[1], width=comp_out.shape[2], transform=out_trans, bigtiff="yes" ) with rasterio.open(out_pth, "w", **out_meta) as dest: dest.write(comp_out) # Save nok mask out_nam = save_nam + "_nok.tif" out_pth = os.path.join(save_dir, out_nam) nok_meta = out_meta.copy() nok_meta.update( count=1, dtype="int8" ) with rasterio.open(out_pth, "w", **nok_meta) as dest: dest.write(np.expand_dims(nok, axis=0)) # Save nobs mask out_nam = save_nam + "_nobs.tif" out_pth = os.path.join(save_dir, out_nam) with rasterio.open(out_pth, "w", **nok_meta) as dest: dest.write(np.expand_dims(nobs, axis=0)) end_time = time.time() print('--- Time: %s seconds ---' % (end_time-str_time)) tTim_B = time.time() print('\n--- Total time: %s seconds --- \n' % (tTim_B - tTim_A))
def ds(self): if self._ds is None: file_exists = os.path.exists(self._result_file) reprocess = not file_exists or self._reprocess if reprocess: if file_exists: print('Old file exists ' + self._result_file) #print('Removing old file ' + self._result_file) #shutil.rmtree(self._result_file) ds_data = OrderedDict() to_seconds = np.vectorize( lambda x: x.seconds + x.microseconds / 1E6) print('Processing binary data...') xx, yy, zz = self._loadgrid() if xx is None: if self._from_nc: print('Processing existing netcdf...') fn = self._result_file[:-5] + '_QC_raw.nc' if os.path.exists(fn): ds_temp = xr.open_dataset(self._result_file[:-5] + '_QC_raw.nc', chunks={'time': 50}) u = da.transpose(ds_temp['U'].data, axes=[3, 0, 1, 2]) v = da.transpose(ds_temp['V'].data, axes=[3, 0, 1, 2]) w = da.transpose(ds_temp['W'].data, axes=[3, 0, 1, 2]) tt = ds_temp['time'] te = (tt - tt[0]) / np.timedelta64(1, 's') xx = ds_temp['x'].values yy = ds_temp['y'].values zz = ds_temp['z'].values else: print('USING OLD ZARR DATA') ds_temp = xr.open_zarr(self._result_file) u = da.transpose(ds_temp['U'].data, axes=[3, 0, 1, 2]) v = da.transpose(ds_temp['V'].data, axes=[3, 0, 1, 2]) w = da.transpose(ds_temp['W'].data, axes=[3, 0, 1, 2]) tt = ds_temp['time'] te = (tt - tt[0]) / np.timedelta64(1, 's') xx = ds_temp['x'].values yy = ds_temp['y'].values zz = ds_temp['z'].values print('ERROR: No NetCDF data found for ' + self._xml_file) #return None # print(u.shape) else: tt, uvw = self._loaddata(xx, yy, zz) if tt is None: print('ERROR: No binary data found for ' + self._xml_file) return None # calculate the elapsed time from the Timestamp objects and then convert to datetime64 datatype te = to_seconds(tt - tt[0]) tt = pd.to_datetime(tt) uvw = uvw.persist() u = uvw[:, :, :, :, 0] v = uvw[:, :, :, :, 1] w = uvw[:, :, :, :, 2] # u = xr.DataArray(uvw[:,:,:,:,0], coords=[tt, xx, yy, zz], dims=['time','x', 'y', 'z'], # name='U', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'}) # v = xr.DataArray(uvw[:,:,:,:,1], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], # name='V', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'}) # w = xr.DataArray(uvw[:,:,:,:,2], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], # name='W', attrs={'standard_name': 'upward_sea_water_velocity', 'units': 'm s-1'}) if xx is None: print('No data found') return None u = u.persist() v = v.persist() w = w.persist() dx = float(xx[1] - xx[0]) dy = float(yy[1] - yy[0]) dz = float(zz[1] - zz[0]) if self._norm_dims: exp = self._result_root.split('/')[4] runSheet = pd.read_csv('~/RunSheet-%s.csv' % exp) runSheet = runSheet.set_index('RunID') runDetails = runSheet.ix[int(self.run_id[-2:])] T = runDetails['T (s)'] h = runDetails['h (m)'] D = runDetails['D (m)'] ww = te / T om = 2. * np.pi / T d_s = (2. * 1E-6 / om)**0.5 bl = 3. * np.pi / 4. * d_s if exp == 'Exp6': if D == 0.1: dy_c = (188. + 82.) / 2 dx_c = 39.25 cx = dx_c / 1000. cy = dy_c / 1000. else: dy_c = (806. + 287.) / 2. * 0.22 dx_c = 113 * 0.22 cx = dx_c / 1000. cy = dy_c / 1000. elif exp == 'Exp8': dy_c = 624 * 0.22 dx_c = 15 cx = dx_c / 1000. cy = dy_c / 1000. xn = (xx + (D / 2. - cx)) / D yn = (yy - cy) / D zn = zz / h xnm, ynm = np.meshgrid(xn, yn) rr = np.sqrt(xnm**2. + ynm**2) cylMask = rr < 0.5 nanPlane = np.ones(cylMask.shape) nanPlane[cylMask] = np.nan nanPlane = nanPlane.T nanPlane = nanPlane[np.newaxis, :, :, np.newaxis] u = u * nanPlane v = v * nanPlane w = w * nanPlane if D == 0.1: xInds = xn > 3. else: xInds = xn > 2. 
blInd = np.argmax(zn > bl / h) blPlane = int(round(blInd)) Ue = u[:, xInds, :, :] Ue_bar = da.nanmean(Ue, axis=(1, 2, 3)).compute() Ue_bl = da.nanmean(Ue[:, :, :, blPlane], axis=(1, 2)).compute() inds = ~np.isnan(Ue_bl) xv = ww[inds] % 1. xv = xv + np.random.normal(scale=1E-6, size=xv.shape) yv = Ue_bl[inds] xy = np.stack([ np.concatenate([xv - 1., xv, xv + 1.]), np.concatenate([yv, yv, yv]) ]).T xy = xy[xy[:, 0].argsort(), :] xi = np.linspace(-0.5, 1.5, len(xv) / 8) n = np.nanmax(xy[:, 1]) # print(n) # fig,ax = pl.subplots() # ax.scatter(xy[:,0],xy[:,1]/n) # print(xy) spl = si.LSQUnivariateSpline(xy[:, 0], xy[:, 1] / n, t=xi, k=3) roots = spl.roots() der = spl.derivative() slope = der(roots) inds = np.min(np.where(slope > 0)) dt = (roots[inds] % 1.).mean() - 0.5 tpx = np.arange(0, 0.5, 0.001) U0_bl = np.abs(spl(tpx + dt).min() * n) ws = ww - dt Ue_spl = spl((ws - 0.5) % 1.0 + dt) * n * -1.0 #maxima = spl.derivative().roots() #Umax = spl(maxima) #UminIdx = np.argmin(Umax) #U0_bl = np.abs(Umax[UminIdx]*n) #ww_at_min = maxima[UminIdx] #ws = ww - ww_at_min + 0.25 inds = ~np.isnan(Ue_bar) xv = ww[inds] % 1. xv = xv + np.random.normal(scale=1E-6, size=xv.shape) yv = Ue_bar[inds] xy = np.stack([ np.concatenate([xv - 1., xv, xv + 1.]), np.concatenate([yv, yv, yv]) ]).T xy = xy[xy[:, 0].argsort(), :] xi = np.linspace(-0.5, 1.5, len(xv) / 8) n = np.nanmax(xy[:, 1]) spl = si.LSQUnivariateSpline(xy[:, 0], xy[:, 1] / n, t=xi, k=4) maxima = spl.derivative().roots() Umax = spl(maxima) UminIdx = np.argmin(Umax) U0_bar = np.abs(Umax[UminIdx] * n) ww = xr.DataArray(ww, coords=[ tt, ], dims=[ 'time', ]) ws = xr.DataArray(ws - 0.5, coords=[ tt, ], dims=[ 'time', ]) xn = xr.DataArray(xn, coords=[ xx, ], dims=[ 'x', ]) yn = xr.DataArray(yn, coords=[ yy, ], dims=[ 'y', ]) zn = xr.DataArray(zn, coords=[ zz, ], dims=[ 'z', ]) Ue_bar = xr.DataArray(Ue_bar, coords=[ tt, ], dims=[ 'time', ]) Ue_bl = xr.DataArray(Ue_bl, coords=[ tt, ], dims=[ 'time', ]) Ue_spl = xr.DataArray(Ue_spl, coords=[ tt, ], dims=[ 'time', ]) ds_data['ww'] = ww ds_data['ws'] = ws ds_data['xn'] = xn ds_data['yn'] = yn ds_data['zn'] = zn ds_data['Ue_bar'] = Ue_bar ds_data['Ue_bl'] = Ue_bl ds_data['Ue_spl'] = Ue_spl te = xr.DataArray(te, coords=[ tt, ], dims=[ 'time', ]) dims = ['time', 'x', 'y', 'z'] coords = [tt, xx, yy, zz] ds_data['U'] = xr.DataArray(u, coords=coords, dims=dims, name='U', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['V'] = xr.DataArray(v, coords=coords, dims=dims, name='V', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['W'] = xr.DataArray(w, coords=coords, dims=dims, name='W', attrs={ 'standard_name': 'sea_water_x_velocity', 'units': 'm s-1' }) ds_data['te'] = te # stdV = da.nanstd(v) # stdW = da.nanstd(w) # thres=7. if 'U0_bl' in locals(): condition = (da.fabs(v) / U0_bl > 1.5) | (da.fabs(w) / U0_bl > 0.6) for var in ['U', 'V', 'W']: ds_data[var].data = da.where(condition, np.nan, ds_data[var].data) piv_step_frame = float( self._xml_root.findall('piv/stepFrame')[0].text) print('Calculating tensor') # j = jacobianConv(ds.U, ds.V, ds.W, dx, dy, dz, sigma=1.5) j = jacobianDask(u, v, w, piv_step_frame, dx, dy, dz) print('Done') #j = da.from_array(j,chunks=(20,-1,-1,-1,-1,-1)) # j = jacobianDask(uvw[:,:,:,:,0],uvw[:,:,:,:,1], uvw[:,:,:,:,2], piv_step_frame, dx, dy, dz) jT = da.transpose(j, axes=[0, 1, 2, 3, 5, 4]) # j = j.persist() # jT = jT.persist() jacobianNorm = da.sqrt( da.nansum(da.nansum(j**2., axis=-1), axis=-1)) strainTensor = (j + jT) / 2. 
vorticityTensor = (j - jT) / 2. strainTensorNorm = da.sqrt( da.nansum(da.nansum(strainTensor**2., axis=-1), axis=-1)) vorticityTensorNorm = da.sqrt( da.nansum(da.nansum(vorticityTensor**2., axis=-1), axis=-1)) divergence = j[:, :, :, :, 0, 0] + j[:, :, :, :, 1, 1] + j[:, :, :, :, 2, 2] # print(divergence) omx = vorticityTensor[:, :, :, :, 2, 1] * 2. omy = vorticityTensor[:, :, :, :, 0, 2] * 2. omz = vorticityTensor[:, :, :, :, 1, 0] * 2. divNorm = divergence / jacobianNorm # divNorm = divNorm.persist() # divNorm_mean = da.nanmean(divNorm) # divNorm_std = da.nanstd(divNorm) dims = ['x', 'y', 'z'] comp = ['u', 'v', 'w'] ds_data['jacobian'] = xr.DataArray( j, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='jacobian') ds_data['jacobianNorm'] = xr.DataArray( jacobianNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='jacobianNorm') ds_data['strainTensor'] = xr.DataArray( strainTensor, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='strainTensor') ds_data['vorticityTensor'] = xr.DataArray( vorticityTensor, coords=[tt, xx, yy, zz, comp, dims], dims=['time', 'x', 'y', 'z', 'comp', 'dims'], name='vorticityTensor') ds_data['vorticityNorm'] = xr.DataArray( vorticityTensorNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='vorticityNorm') ds_data['strainNorm'] = xr.DataArray( strainTensorNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='strainNorm') ds_data['divergence'] = xr.DataArray( divergence, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='divergence') ds_data['omx'] = xr.DataArray(omx, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omx') ds_data['omy'] = xr.DataArray(omy, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omy') ds_data['omz'] = xr.DataArray(omz, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='omz') ds_data['divNorm'] = xr.DataArray(divNorm, coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'], name='divNorm') # ds_data['divNorm_mean'] = xr.DataArray(divNorm_mean) # ds_data['divNorm_std'] = xr.DataArray(divNorm_std) ds = xr.Dataset(ds_data) # if self._from_nc: # for k,v in ds_temp.attrs.items(): # ds.attrs[k]=v #ds = ds.chunk({'time': 20}) self._append_CF_attrs(ds) self._append_attrs(ds) ds.attrs['filename'] = self._result_file if self._norm_dims: KC = U0_bl * T / D delta = (2. * np.pi * d_s) / h S = delta / KC ds.attrs['T'] = T ds.attrs['h'] = h ds.attrs['D'] = D ds.attrs['U0_bl'] = U0_bl ds.attrs['U0_bar'] = U0_bar ds.attrs['KC'] = KC ds.attrs['S'] = S ds.attrs['Delta+'] = ((1E-6 * T)**0.5) / h ds.attrs['Delta_l'] = 2 * np.pi * d_s ds.attrs['Delta_s'] = d_s ds.attrs['Re_D'] = U0_bl * D / 1E-6 ds.attrs['Beta'] = D**2. / (1E-6 * T) delta = (ds.attrs['dx'] * ds.attrs['dy'] * ds.attrs['dz'])**(1. / 3.) dpx = (ds.attrs['pdx'] * ds.attrs['pdy'] * ds.attrs['pdz'])**(1. / 3.) delta_px = delta / dpx dt = ds.attrs['piv_step_ensemble'] # divRMS = da.sqrt(da.nanmean((divergence * dt) ** 2.)) # divRMS = divRMS.persist() # vorticityTensorNorm.persist() # velocityError = divRMS/((3./(2.*delta_px**2.))**0.5) # print(da.percentile(ds_new['vorticityTensorNorm'].data.ravel(),99.)) # print(ds_new['divRMS']) # print(ds_new['divNorm_mean']) # vorticityError = divRMS/dt/da.percentile(vorticityTensorNorm.ravel(),99.) 
# divNorm_mean = da.nanmean(divNorm) # divNorm_std = da.nanstd(divNorm) # print("initial save") #ds.to_zarr(self._result_file,compute=False) #ds = xr.open_zarr(self._result_file) # xstart = np.argmax(xx > 0.05) # ystart = np.argmax(yy > 0.07) divRMS = da.sqrt(da.nanmean( (divergence * dt)**2.)) #.compute() #divNorm = divergence / jacobianNorm #divNorm = divNorm.compute() #divNorm_mean = da.nanmean(divNorm).compute() #divNorm_std = da.nanstd(divNorm).compute() velocityError = divRMS / ((3. / (2. * delta_px**2.))**0.5) vortNorm = vorticityTensorNorm #.compute() vorticityError = divRMS / dt / np.percentile( vortNorm.ravel(), 99.) velocityError, vorticityError = da.compute( velocityError, vorticityError) #ds.attrs['divNorm_mean'] = divNorm_mean #ds.attrs['divNorm_std'] = divNorm_std ds.attrs['velocityError'] = velocityError ds.attrs['vorticityError'] = vorticityError if self._norm_dims: xInds = (xn > 0.5) & (xn < 2.65) yInds = (yn > -0.75) & (yn < 0.75) else: xInds = range(len(ds['x'])) yInds = range(len(ds['y'])) vrms = (ds['V'][:, xInds, yInds, :]**2.).mean( dim=['time', 'x', 'y', 'z'])**0.5 wrms = (ds['W'][:, xInds, yInds, :]**2.).mean( dim=['time', 'x', 'y', 'z'])**0.5 ds.attrs['Vrms'] = float(vrms.compute()) ds.attrs['Wrms'] = float(wrms.compute()) #fig,ax = pl.subplots() #ax.plot(ds.ws,ds.Ue_spl/U0_bl,color='k') #ax.plot(ds.ws,ds.Ue_bl/U0_bl,color='g') #ax.set_xlabel(r'$t/T$') #ax.set_ylabel(r'$U_{bl}/U_0$') #fig.savefig(self._result_file[:-4] + 'png',dpi=125) #pl.close(fig) # print("second save") #ds.to_netcdf(self._result_file) ds.to_zarr(self._result_file, mode='w') print('Cached ' + self._result_file) #ds = xr.open_dataset(self._result_file,chunks={'time':20}) ds = xr.open_zarr(self._result_file) ds.attrs['filename'] = self._result_file else: #ds = xr.open_dataset(self._result_file,chunks={'time':20}) ds = xr.open_zarr(self._result_file) ds.attrs['filename'] = self._result_file self._ds = ds return self._ds
def tall_clutter(files, config, clutter_thresh_min=0.0002, clutter_thresh_max=0.25, radius=1, write_radar=True, out_file=None, use_dask=False): """ Wind Farm Clutter Calculation Parameters ---------- files : list List of radar files used for the clutter calculation. config : str String representing the configuration for the radar. Such possible configurations are listed in default_config.py Other Parameters ---------------- clutter_thresh_min : float Threshold value for which, any clutter values above the clutter_thres_min will be considered clutter, as long as they are also below the clutter_thres_max. clutter_thresh_max : float Threshold value for which, any clutter values below the clutter_thres_max will be considered clutter, as long as they are also above the clutter_thres_min. radius : int Radius of the area surrounding the clutter gate that will be also flagged as clutter. write_radar : bool Whether to or not, to write the clutter radar as a netCDF file. Default is True. out_file : string String of location and filename to write the radar object too, if write_radar is True. use_dask : bool Use dask instead of running stats for calculation. The will reduce run time. Returns ------- clutter_radar : Radar Radar object with the clutter field that was calculated. This radar only has the clutter field, but maintains all other radar specifications. """ field_names = get_field_names(config) refl_field = field_names["reflectivity"] vel_field = field_names["velocity"] ncp_field = field_names["normalized_coherent_power"] def get_reflect_array(file, first_shape): """ Retrieves a reflectivity array for a radar volume. """ try: radar = pyart.io.read( file, include_fields=[refl_field, ncp_field, vel_field]) reflect_array = deepcopy(radar.fields[refl_field]['data']) ncp = radar.fields[ncp_field]['data'] height = radar.gate_z["data"] up_in_the_air = height > 2000.0 the_mask = np.logical_or.reduce( (ncp < 0.8, reflect_array.mask, up_in_the_air)) reflect_array = np.ma.masked_where(the_mask, reflect_array) del radar if reflect_array.shape == first_shape: return reflect_array.filled(fill_value=np.nan) except (TypeError, OSError): print(file + ' is corrupt...skipping!') return np.nan * np.zeros(first_shape) if use_dask is False: run_stats = _RunningStats() first_shape = 0 for file in files: try: radar = pyart.io.read(file) reflect_array = radar.fields[refl_field]['data'] ncp = deepcopy(radar.fields[ncp_field]['data']) #reflect_array = np.ma.masked_where(ncp < 0.7, reflect_array) if first_shape == 0: first_shape = reflect_array.shape clutter_radar = radar run_stats.push(reflect_array) if reflect_array.shape == first_shape: run_stats.push(reflect_array) del radar except (TypeError, OSError): print(file + ' is corrupt...skipping!') continue mean = run_stats.mean() stdev = run_stats.standard_deviation() clutter_values = stdev / mean clutter_values = np.ma.masked_invalid(clutter_values) clutter_values_no_mask = clutter_values.filled(clutter_values_max + 1) else: cluster = LocalCluster(n_workers=20, processes=True) client = Client(cluster) first_shape = 0 i = 0 while first_shape == 0: try: radar = pyart.io.read(files[i]) reflect_array = radar.fields[refl_field]['data'] first_shape = reflect_array.shape clutter_radar = radar except (TypeError, OSError): i = i + 1 print(file + ' is corrupt...skipping!') continue arrays = [ delayed(get_reflect_array)(file, first_shape) for file in files ] array = [ da.from_delayed(a, shape=first_shape, dtype=float) for a in arrays ] array = da.stack(array, axis=0) print('## 
Calculating mean in parallel...') mean = np.array(da.nanmean(array, axis=0)) print('## Calculating standard deviation...') count = np.array(da.sum(da.isfinite(array), axis=0)) stdev = np.array(da.nanstd(array, axis=0)) clutter_values = stdev / mean clutter_values = np.ma.masked_invalid(clutter_values) clutter_values = np.ma.masked_where( np.logical_or(clutter_values.mask, count < 20), clutter_values) # Masked arrays can suck clutter_values_no_mask = clutter_values.filled( (clutter_thresh_max + 1)) shape = clutter_values.shape mask = np.ma.getmask(clutter_values) is_clutters = np.argwhere( np.logical_and.reduce(( clutter_values_no_mask > clutter_thresh_min, clutter_values_no_mask < clutter_thresh_max, ))) clutter_array = _clutter_marker(is_clutters, shape, mask, radius) clutter_radar.fields.clear() clutter_array = clutter_array.filled(0) clutter_dict = _clutter_to_dict(clutter_array) clutter_value_dict = _clutter_to_dict(clutter_values) clutter_value_dict["long_name"] = "Clutter value (std. dev/mean Z)" clutter_value_dict["standard_name"] = "clutter_value" clutter_radar.add_field('ground_clutter', clutter_dict, replace_existing=True) clutter_radar.add_field('clutter_value', clutter_value_dict, replace_existing=True) if write_radar is True: pyart.io.write_cfradial(out_file, clutter_radar) del clutter_radar return
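# The clutter statistic above is a coefficient of variation, std/mean, taken
# gate-by-gate over the stack of scans; a stable echo (clutter) yields a small
# value. A sketch on made-up data with the same reduction pattern:
import dask.array as da

scans = da.random.random((30, 360, 500), chunks=(5, 360, 500))
cv = da.nanstd(scans, axis=0) / da.nanmean(scans, axis=0)
print(cv.shape)  # (360, 500), one clutter value per gate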
def mask_imputation(
        array: da.core.Array,
        mask_values: Optional[da.core.Array] = None,
        fill_value: int = 0,
        mask_method: str = 'mean',
        mask_axis: int = 0) -> Tuple[da.core.Array, sparse._coo.core.COO]:
    """ Creates the mask that will fill "array" and the filled_array that
    has the missing values of "array" filled.

    If A is array and has missing values:

        A = [[1, 2, 3],
             [?, 4, 5],
             [3, 4, ?]]

    Then mask is a sparse COO array that has the following entries:

        mask = [[-, -, -],
                [a, -, -],
                [-, -, b]]

    Where "-" refers to 0, but is written as "-" to show that the value
    is not stored.

    Then the filled array is:

        A_filled = [[1, 2, 3],
                    [f, 4, 5],
                    [3, 4, f]]

    Where "f" refers to a common fill value specified as "fill_value".

    Parameters
    ----------
    array : array_like, shape (N, P)
        Array that a copy of will be filled and, if needed, mask values
        will be computed from
    mask_values : array_like, shape (P,), optional
        Values to fill mask with, if already computed
    fill_value : int
        Value that will be used to fill NaN values in array
    mask_method : str
        Method used to compute mask_values. Only used if mask_values is
        not specified
    mask_axis : int
        Axis from which values will be computed.
            axis = 0 ---> column summary of values
            axis = 1 ---> row summary of values

    Returns
    -------
    filled_array : dask array, shape (N, P)
        Copy of "array" with NaN values filled, if specified
    mask : dask array, shape (N, P)
        Sparse dask array with mask values where "array" is NaN.
    """
    if not isinstance(array._meta, np.ndarray):
        raise ValueError(
            f'expected meta, {type(np.ndarray)}, but got {type(array._meta)}')

    if mask_values is None:
        if mask_method == 'mean':
            mask_values = da.nanmean(array, axis=mask_axis).compute()
        else:
            raise NotImplementedError(
                f'mask_method, {mask_method}, is not implemented')
    else:
        try:
            mask_values = mask_values.compute()
        except AttributeError:
            pass

    coords = compute(*da.where(da.isnan(array)))
    if not len(coords[0]):
        raise ValueError(
            'expected array to have maskable values, but got none.')

    data = axis_wise_COO_data_by_axis(mask_values, coords, axis=1 - mask_axis)

    mask = sparse.COO(coords=np.vstack(coords),
                      data=data,
                      shape=array.shape,
                      has_duplicates=False,
                      cache=True)
    mask = da.from_array(mask, chunks=array.shape)

    filled_array = fill_array(array, fill_value=fill_value).persist()

    return filled_array, mask
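# Building the sparse COO mask by hand for a tiny NumPy array, assuming the
# 'mean' method with mask_axis=0 (column means); this bypasses the helper
# functions used above and is only a sketch of the same idea.
import numpy as np
import sparse

A = np.array([[1., 2., 3.],
              [np.nan, 4., 5.],
              [3., 4., np.nan]])
rows, cols = np.where(np.isnan(A))
col_means = np.nanmean(A, axis=0)
mask = sparse.COO(coords=np.vstack([rows, cols]),
                  data=col_means[cols],       # one column mean per missing entry
                  shape=A.shape)
A_filled = np.where(np.isnan(A), 0.0, A)      # fill_value = 0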
b = a.any()

# Requesting a NumPy-style operation on a dask array stores the operations
# to perform, but the operations have not been performed yet.
print('b:', b)

# So to see the values we need to perform the operations and convert the
# result to a NumPy value.
print('b.compute():', b.compute())

# Block 2
if False:
    # Most of the NumPy operations are available as Dask computations.
    a = da.random.random(1000, chunks=100)
    # x = da.arange(1000, chunks=100)
    b = da.ones(1000, chunks=100)
    c = b - a
    c = da.nanmean(c)
    print('c:', c)
    print('c.compute():', c.compute())

# Block 3
# How much faster is Dask than NumPy at some calculations?
if False:
    # Let's make a large array.
    num = 100000000
    # What if that array is smaller? It turns out the overhead of using Dask
    # can make it slower for smaller datasets.
    # num = 10000
    start_time = time.time()
    b = np.ones(num) - np.random.random(num)