def show_images():
    """Show the module-level ``sky``, ``psf`` and ``dirty`` images.

    Each image is drawn in its own numbered figure (1, 2 and 3) with the
    colour scale spanning the image's minimum and maximum. Afterwards the
    function keeps pausing for as long as all three figures exist; after
    every pause all figures are closed.
    """

    def _draw(number, image, title):
        # Select the numbered figure, wipe it and draw without blocking.
        plt.figure(number)
        plt.clf()
        plt.imshow(image, vmin=da.min(image), vmax=da.max(image))
        plt.title(title)
        plt.show(block=False)

    _draw(1, sky, 'sky')
    _draw(2, psf, 'psf')
    _draw(3, dirty, 'dirty')

    while all(plt.fignum_exists(number) for number in (1, 2, 3)):
        try:
            plt.pause(10000000)
            plt.close("all")
        except BaseException:
            # Catches everything (KeyboardInterrupt included) and simply
            # stops waiting.
            break
def max_and_argmax(data):
    """Compute the maximum and its flat position over the last two axes.

    The last two axes should correspond to the x and y dimensions. They are
    merged into a single axis so that one reduction covers the whole plane.

    Parameters
    ----------
    data : dask array
        data with at least 3 dimensions

    Returns
    -------
    weights : dask array
        max of `data` along the last two axes
    argmax : dask array
        argmax of `data` along the last two axes, given as an index into
        the flattened plane
    """
    leading_shape = data.shape[:-2]
    # Merge the two trailing axes so both reductions can use axis=-1.
    planes = data.reshape(leading_shape + (-1, ))
    # Both reductions are computed separately until da.take_along_axis()
    # is available: https://github.com/dask/dask/issues/3663
    peak_index = da.argmax(planes, axis=-1)
    peak_value = da.max(planes, axis=-1)
    return peak_value, peak_index
def load_data(statistic, axis):
    """Compute one summary statistic over a block assembled from a zarr store.

    Eight consecutive entries of the zarr array (starting at index 1500,
    wrapping at 3500) are concatenated into a 2 x 2 x 2 arrangement along
    axes 0, 1 and 2, and the requested reduction is computed eagerly.

    Parameters
    ----------
    statistic : str
        One of ``'minimum'``, ``'maximum'``, ``'mean'``, ``'median'``
        (computed as the mean), ``'percentile'`` or ``'sum'``.
    axis : int or tuple of int or None
        Axis argument forwarded to the dask reduction.

    Returns
    -------
    The computed statistic, or ``0`` for an unrecognised ``statistic``.
    """
    import dask.array as da

    x = da.from_zarr('/mnt/cephfs/zarr_data_full')

    start, wrap, scale = 1500, 3500, 2
    slabs = []
    for k in range(scale):
        columns = []
        for i in range(scale):
            # Entries are consumed consecutively across the nested loops.
            offset = start + (k * scale + i) * scale
            row = [x[(offset + j) % wrap] for j in range(scale)]
            columns.append(da.concatenate(row))
        slabs.append(da.concatenate(columns, 1))
    z = da.concatenate(slabs, 2)

    if statistic == 'minimum':
        return da.min(z, axis).compute()
    elif statistic == 'maximum':
        return da.max(z, axis).compute()
    elif statistic == 'mean' or statistic == 'median':
        # 'median' is answered with the mean.
        return da.mean(z, axis).compute()
    elif statistic == 'percentile':
        # NOTE(review): `percentile` is not defined in this function; this
        # branch only works if the name exists at module scope - confirm.
        return percentile / 100
    elif statistic == 'sum':
        # Fixed: the axis must be passed as an argument (was `z.axis`).
        return da.sum(z, axis).compute()
    return 0
def searchdask(a, v, how=None, atol=None):
    """Locate the values ``v`` in ``a`` and return their indices.

    The search itself is delegated to the function and arguments returned
    by ``presearch(a, v)`` (presumably a searchsorted-style routine that
    accepts ``side=`` - confirm against ``presearch``). The value
    ``a.shape[0]`` is used as the "not found" index.

    Parameters
    ----------
    a : array
        Values to search in.
    v : array
        Values to look up.
    how : {None, 'nearest', 'bfill', 'ffill'}
        ``None`` requires an exact match, ``'nearest'`` picks between the
        left and right neighbour, ``'bfill'`` uses the plain search result
        and ``'ffill'`` the entry before the right-sided search result.
    atol : number, optional
        If given, matches whose value differs from ``v`` by more than
        ``atol`` are replaced by the "not found" index.

    Raises
    ------
    NotImplementedError
        If ``how`` is not one of the supported modes.
    """
    # Fail fast (and actually raise - the class used to be *returned*).
    if how not in (None, 'nearest', 'bfill', 'ffill'):
        raise NotImplementedError(
            "unsupported search mode: {!r}".format(how))

    n_a = a.shape[0]
    searchfunc, args = presearch(a, v)

    if how == 'nearest':
        l_index = da.maximum(searchfunc(*args, side='right') - 1, 0)
        r_index = da.minimum(searchfunc(*args), n_a - 1)
        # Take the left neighbour when v is below the neighbours' midpoint.
        cond = 2 * v < (select(a, r_index) + select(a, l_index))
        indexer = da.maximum(da.where(cond, l_index, r_index), 0)
    elif how == 'bfill':
        indexer = searchfunc(*args)
    elif how == 'ffill':
        indexer = searchfunc(*args, side='right') - 1
        # Nothing at or before v: mark as not found.
        indexer = da.where(indexer == -1, n_a, indexer)
    else:  # how is None: exact match only
        l_index = searchfunc(*args)
        r_index = searchfunc(*args, side='right')
        # Equal left/right insertion points mean v does not occur in a.
        indexer = da.where(l_index == r_index, n_a, l_index)

    if atol is not None:
        # Append a sentinel farther than atol from every v so that the
        # "not found" index n_a can be looked up safely.
        a2 = da.concatenate([a, [atol + da.max(v) + 1]])
        indexer = da.where(
            da.absolute(select(a2, indexer) - v) > atol, n_a, indexer)
    return indexer
def extract(self):
    """Open the NetCDF files listed in a CSV index and print a QFLAG bit test.

    NOTE(review): this reads like an unfinished debugging session. The
    function always ends in ``sys.exit()``, so every statement after that
    call (including the ``return``) is unreachable. Those statements also use
    names that are not assigned anywhere in this function (``da`` used as a
    data array, ``da_mean``, ``DS``).
    """
    # The index file is ';'-separated; its unnamed first column becomes the
    # 'id' index.
    df_path = pd.read_csv('path_to_file.csv', sep=';')
    df_path = df_path.rename(columns={'Unnamed: 0': 'id'})
    df_path = df_path.set_index('id')
    print(df_path)
    ds_batch = xr.open_mfdataset(df_path['path'], parallel=True)  # loading ncdf files
    print(ds_batch)
    print("--- Total size (GB):")
    print(ds_batch.nbytes * (2**-30))  # get size of the dataset in GB (bytes * 2**-30)
    # getting average albedos over whole time period (used for maps and scatter plots)
    darr = ds_batch['QFLAG']  # getting data for specific band
    print(darr)
    # Earlier experiments, kept for reference:
    #res = darr.mean(['lon','lat'])
    #res = da.count_nonzero( da.bitwise_and(darr//2**5, 1), ['lon','lat'])
    #res = (darr==32).sum(['lon','lat'])
    #res = xr.ufunc.bitwise_and(darr, 0b100000).sum(['lon','lat'])
    # NOTE(review): the first lambda (bit 5 test) is immediately replaced by
    # the second one (bit 0 test); only the second is ever applied.
    func = lambda x: np.bitwise_and(np.right_shift(x, 5), np.uint64(1))
    func = lambda x: np.bitwise_and(x, np.uint64(1))
    res = xr.apply_ufunc(func,
                         darr,
                         input_core_dims=[['lon', 'lat']],
                         dask='parallelized',
                         vectorize=True)
    #res = itwise_and(np.right_shift(darr, 5), 1).sum(['lon','lat])
    #res = (darr==32).max(['lon','lat'])
    print(np.array(res))
    # Terminates the interpreter: nothing below this line is executed.
    sys.exit()
    da_count = ((da >> 5) & 1)  # calculate mean over time
    #da_mean_lowres = da_mean.sel(lat=slice(70, 30)).sel(lon=slice(-25, 70)) # this can be used to zoom in over Europe
    da_mean_lowres = da_mean.isel(lat=slice(None, None, 10)).isel(
        lon=slice(None, None, 10))  # downsampling for faster plotting
    # getting average, min and max albedos for each time step (used to plot timeline)
    da_timeline_mean = da.mean(['lon', 'lat'])
    da_timeline_max = da.max(['lon', 'lat'])
    da_timeline_min = da.min(['lon', 'lat'])
    # closing arrays to free memory
    DS.close()
    da.close()
    da_mean.close()
    return da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min
    # Placed after the return, so these would not run even without sys.exit().
    da_mean_lowres.close()
    da_timeline_mean.close()
    da_timeline_max.close()
    da_timeline_min.close()
def _subdivide(self, hdf5obj, imagepathin, imagepathout=None):
    """Write a smoothed, downsampled copy of an image and its histogram.

    The dataset at ``imagepathin`` is wrapped as a dask array, smoothed with
    ``self.mysmoother`` (with a 6-pixel overlap in the last two axes),
    resized block-wise with ``self.myresize`` and stored at
    ``imagepathout``. The image size, value range and a 256-bin histogram
    are written next to it in the parent group of ``imagepathout``.

    Parameters
    ----------
    hdf5obj : open HDF5 file or group
        Container that is both read from and written to.
    imagepathin : str
        Path of the source dataset inside ``hdf5obj``.
    imagepathout : str
        Path of the dataset to create. Although it defaults to ``None``,
        a path is required because its parent group is derived from it.
    """
    source = hdf5obj[imagepathin]
    # Use whatever chunk size that imaris has used
    # Not sure this is perfect - sometimes there are some redundant
    # slices to pad out the chunk
    chunkshape = source.chunks
    imshape = source.shape
    dtp = source.dtype
    # One chunk spanning the whole first axis; the other two axes are
    # split by self.chunkstuff.
    aa = ((imshape[0], ),
          self.chunkstuff(imshape[1], chunkshape[1]),
          self.chunkstuff(imshape[2], chunkshape[2]))
    subsamp = self._subdiv

    # imaris appears to do z,y,x - only subsample x and y...
    daskimg = da.from_array(source, chunks=aa)
    blurred = daskimg.map_overlap(self.mysmoother,
                                  depth=(0, 6, 6),
                                  boundary='reflect',
                                  dtype=dtp)

    # Chunk sizes after resizing: each chunk shrinks by the subsampling
    # factor of its axis, rounded up.
    dz = tuple(np.ceil(np.array(aa[0]) / float(subsamp[0])).astype(int))
    dy = tuple(np.ceil(np.array(aa[1]) / float(subsamp[1])).astype(int))
    dx = tuple(np.ceil(np.array(aa[2]) / float(subsamp[2])).astype(int))
    downsamp = blurred.map_blocks(self.myresize,
                                  dtype=dtp,
                                  chunks=(dz, dy, dx))

    # histograms
    # Fixed: the minimum used to be computed with da.max, and the histogram
    # range was (max, max) instead of (min, max).
    mx = da.max(downsamp).compute()
    mn = da.min(downsamp).compute()
    h, bins = da.histogram(downsamp, bins=256, range=(mn, mx))

    self.to_hdf5(hdf5obj, imagepathout, downsamp)

    # posixpath always works with '/' separators, whatever the host OS.
    grouppath = posixpath.dirname(imagepathout)

    def mkAttr(XX):
        # Attributes are stored as arrays of single characters.
        return np.frombuffer(str(XX).encode(), dtype='|S1')

    attrs = hdf5obj[grouppath].attrs
    attrs['ImageSizeX'] = mkAttr(downsamp.shape[2])
    attrs['ImageSizeY'] = mkAttr(downsamp.shape[1])
    attrs['ImageSizeZ'] = mkAttr(downsamp.shape[0])
    attrs['HistogramMin'] = mkAttr(mn)
    attrs['HistogramMax'] = mkAttr(mx)
    self.to_hdf5(hdf5obj, posixpath.join(grouppath, 'Histogram'), h)
def analyze(t, c, z):
    """Label the bright regions of one plane of the module-level ``data``.

    The plane at ``data[t, c, z]`` is Gaussian-smoothed (sigma 1 in both
    axes), thresholded at 75 % of the smoothed maximum and labelled.

    Returns
    -------
    tuple
        ``(label_image, ref)`` where ``ref`` is the string
        ``'t_<t>_c_<c>_z_<z>'`` identifying the plane.
    """
    smoothed = dask_image.ndfilters.gaussian_filter(data[t, c, z, :, :],
                                                    sigma=[1, 1])
    # The maximum is computed eagerly so the threshold is a plain number.
    cutoff = 0.75 * da.max(smoothed).compute()
    labelled, _ = dask_image.ndmeasure.label(smoothed > cutoff)

    description = 't:{}, c: {}, z:{}'.format(t, c, z)
    print("Plane coordinates: {}".format(description))
    return labelled, 't_{}_c_{}_z_{}'.format(t, c, z)
def plot_subfigure(X, Y, subplot, transform):
    """Plot a 2-D projection of ``X`` with one-vs-rest decision boundaries.

    ``X`` is reduced to two components with PCA (``transform == "pca"``) or
    CCA (``transform == "cca"``); any other value raises ``ValueError``. A
    one-vs-rest logistic regression is fitted on the projection, its
    accuracy on the training data is printed, and the points, the members of
    each class and one boundary per estimator are drawn into subplot
    ``subplot`` of a 1 x 2 grid.
    """
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError

    first, second = X[:, 0], X[:, 1]
    min_x, max_x = da.min(first), da.max(first)
    min_y, max_y = da.min(second), da.max(second)

    classif = OneVsRestClassifier(LogisticRegression())
    classif.fit(X, Y)
    accuracy = accuracy_score(Y, classif.predict(X))
    print('{} + OneVsRestClassifier + LogisticRegression accuracy_score {}'.
          format(transform, accuracy))

    plt.subplot(1, 2, subplot)
    # All samples in grey, then the members of each class on top.
    plt.scatter(X[:, 0], X[:, 1], s=15, c='gray', edgecolors=(0, 0, 0))
    for i in da.unique(Y.argmax(axis=1)):
        members = da.where(Y[:, i])
        plt.scatter(X[members, 0],
                    X[members, 1],
                    s=25,
                    linewidths=2,
                    label='Class {}'.format(str(i)))

    for i, estimator in enumerate(classif.estimators_):
        plot_hyperplane(estimator, min_x, max_x, 'k--',
                        'Boundary\nfor class {}'.format(str(i)))

    plt.xticks(())
    plt.yticks(())
    # Pad the axis limits by 10 % of the respective maximum.
    x_margin = .1 * max_x
    y_margin = .1 * max_y
    plt.xlim(min_x - x_margin, max_x + x_margin)
    plt.ylim(min_y - y_margin, max_y + y_margin)
def add_data(workspace: String, dataset: String):
    """Rescale a dataset to start at 0 and peak at 1, then add it to a workspace.

    The dataset is opened read-only, wrapped as a dask array chunked
    according to ``Config['computing.chunk_size']``, shifted by its minimum
    and divided by its (shifted) maximum before being handed to the
    workspace. The type of the created dataset is logged.

    Returns the object returned by ``ws.add_data``.
    """
    import dask.array as da
    from survos2.improc.utils import optimal_chunksize

    ws = get(workspace)
    with dataset_from_uri(dataset, mode='r') as source:
        chunking = optimal_chunksize(source, Config['computing.chunk_size'])
        volume = da.from_array(source, chunks=chunking)
        # Shift the minimum to 0, then scale by the remaining maximum.
        volume -= da.min(volume)
        volume /= da.max(volume)
        ds = ws.add_data(volume)
        logger.info(type(ds))
    return ds
def max_and_argmax(data):
    """Return the (uncomputed) max and argmax of ``data`` over the last two axes.

    The last two axes correspond to the x and y dimensions. They are
    flattened into one axis, so the returned argmax is an index into that
    flattened plane.
    """
    # Flatten the two trailing axes so a single axis can be reduced.
    flattened_shape = data.shape[:-2] + (-1, )
    flat = data.reshape(flattened_shape)

    # Max and argmax are both reduced explicitly; this can be simplified
    # once da.take_along_axis() exists
    # (https://github.com/dask/dask/issues/3663).
    positions = da.argmax(flat, axis=-1)
    maxima = da.max(flat, axis=-1)
    return maxima, positions
def show_images():
    """Show the module-level ``quad`` image in figure 1.

    The colour scale spans the image's minimum and maximum. The function
    then keeps pausing while figure 1 exists; after each pause all figures
    are closed.
    """
    plt.figure(1)
    plt.clf()
    lowest, highest = da.min(quad), da.max(quad)
    plt.imshow(quad, vmin=lowest, vmax=highest)
    plt.title('quad')
    plt.show(block=False)

    while True:
        if not plt.fignum_exists(1):
            break
        try:
            plt.pause(100000)
            plt.close("all")
        except BaseException:
            # Catches everything (KeyboardInterrupt included) and stops
            # waiting.
            break
def show_results():
    """Show the module-level ``hub`` image in figure 1, titled 'huber'.

    The colour scale spans the image's minimum and maximum. The function
    then keeps pausing while figure 1 exists; after each pause all figures
    are closed.
    """
    plt.figure(1)
    plt.clf()
    colour_range = {'vmin': da.min(hub), 'vmax': da.max(hub)}
    plt.imshow(hub, **colour_range)
    plt.title('huber')
    plt.show(block=False)

    still_open = plt.fignum_exists(1)
    while still_open:
        try:
            plt.pause(10000000)
            plt.close("all")
        except BaseException:
            # Catches everything (KeyboardInterrupt included) and stops
            # waiting.
            break
        still_open = plt.fignum_exists(1)
def show_images():
    """Show every entry of the module-level ``quad`` in its own figure.

    One figure is created per entry of ``dirty`` (figure numbers start at
    1); figure ``i + 1`` shows ``quad[i]`` scaled to its own minimum and
    maximum. The function then keeps pausing while figure 1 exists; after
    each pause all figures are closed.
    """
    for figure_number in range(1, len(dirty) + 1):
        index = figure_number - 1
        plt.figure(figure_number)
        plt.clf()
        frame = quad[index]
        plt.imshow(frame, vmin=da.min(frame), vmax=da.max(frame))
        plt.title('quad' + str(index))
        plt.show(block=False)

    # Only figure 1 is watched; closing it ends the wait.
    while plt.fignum_exists(1):
        try:
            plt.pause(100000)
            plt.close("all")
        except BaseException:
            # Catches everything (KeyboardInterrupt included) and stops
            # waiting.
            break
def maxproj2tiff(
    in_filepath: str,
    out_filepath: str,
    channel_names: typing.Any = None,
    flip: bool = False,
    overwrite: bool = False,
):
    """
    Maximum projection over channels of HDF5 and save to disk as TIFF.

    With a single channel the dataset is written as-is (no projection), so
    the same API also serves images that only need converting to TIFF.

    Args:
        in_filepath, out_filepath: str
            Paths of input HDF5 and output TIFF files.
        channel_names: list(str), str
            Names of the HDF5 datasets to use. If string, treated as path to
            a text file where each line is the name of a channel (blank
            lines are ignored).
        flip: bool [optional]
            Invert the intensities by subtracting them from the maximum
            value representable by the data type, default False.
        overwrite: bool [optional]
            Overwrite the output file if already exists, default False.
            NOTE(review): this flag is accepted but not consulted here; an
            existing output file is handed to tifffile regardless.

    Raises:
        ValueError: if no channel name is given.
    """
    # parse channel names
    if isinstance(channel_names, str):
        with open(channel_names, "r") as names_file:
            channel_names = [
                line.strip() for line in names_file if line.strip()
            ]
    if not channel_names:
        raise ValueError("channel_names must name at least one HDF5 dataset")

    # The file stays open until the data has been written, because the
    # dask arrays read from it lazily.
    with h5py.File(in_filepath, "r") as h5file:
        arr_list = [da.from_array(h5file[key]) for key in channel_names]
        if len(arr_list) > 1:
            # Fixed: several channels are projected (this branch used to
            # take only the first channel).
            arr = da.max(da.stack(arr_list, axis=-1), axis=-1)
        else:
            arr = arr_list[0]

        # in case flipping is needed
        if flip:
            try:
                dtype = np.iinfo(arr.dtype)
            except ValueError:
                # Not an integer type: use the floating-point limits.
                dtype = np.finfo(arr.dtype)
            arr = dtype.max - arr

        # save to disk as TIFF
        tifffile.imsave(out_filepath, np.asarray(arr))
def test_reductions():
    """Check that dask reductions agree with NumPy on a small chunked vector."""
    x = np.arange(5).astype('f4')
    # `chunks` is the keyword for the block layout; the former `blockshape`
    # spelling is not accepted by current dask releases.
    a = da.from_array(x, chunks=(2, ))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def fix_data(self, cube):
    """Fix data.

    The unit is %, but the values are <= 1: such data is multiplied by 100.
    Cubes with any other unit, or with a maximum above 1, are returned
    untouched.

    Parameters
    ----------
    cube: iris.cube.Cube
        Cube to fix

    Returns
    -------
    iris.cube.Cube
        Fixed cube. It can be a different instance.
    """
    labelled_percent = cube.units == "%"
    if not labelled_percent:
        return cube

    # The maximum is computed eagerly to decide whether rescaling is needed.
    largest = da.max(cube.core_data()).compute()
    if largest <= 1.:
        cube.data = cube.core_data() * 100.
    return cube
def test_reductions():
    """Compare each dask reduction with its NumPy counterpart.

    A five-element float32 vector split into chunks of two is reduced with
    dask and with NumPy; ``eq`` must accept every pair of results.
    """
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    along_axis = {'axis': 0}
    # (reduction name, keyword arguments), in the order they are checked.
    cases = (
        ('all', {}),
        ('any', {}),
        ('argmax', along_axis),
        ('argmin', along_axis),
        ('max', {}),
        ('mean', {}),
        ('min', {}),
        ('nanargmax', along_axis),
        ('nanargmin', along_axis),
        ('nanmax', {}),
        ('nanmin', {}),
        ('nansum', {}),
        ('nanvar', {}),
        ('nanstd', {}),
    )
    for name, kwargs in cases:
        dask_result = getattr(da, name)(a, **kwargs)
        numpy_result = getattr(np, name)(x, **kwargs)
        assert eq(dask_result, numpy_result)
def add_data(workspace: String, data_fname: String):
    """Rescale the data in ``data_fname`` and add it to a workspace.

    The file is opened read-only, wrapped as a dask array chunked according
    to ``Config["computing.chunk_size"]``, shifted by its minimum and
    divided by its (shifted) maximum before being handed to the workspace.

    Returns the object returned by ``ws.add_data``.
    """
    import dask.array as da
    from survos2.improc.utils import optimal_chunksize

    ws = get(workspace)
    logger.info(f"Adding data to workspace {ws}")

    with dataset_from_uri(data_fname, mode="r") as source:
        chunking = optimal_chunksize(source, Config["computing.chunk_size"])
        logger.debug(
            f'Calculating optimal chunk size using chunk_size {Config["computing.chunk_size"]}: {chunking}'
        )
        volume = da.from_array(source, chunks=chunking)
        # Shift the minimum to 0, then scale by the remaining maximum.
        volume -= da.min(volume)
        volume /= da.max(volume)
        ds = ws.add_data(volume)
    return ds
def statistics(self, data, pca_stats=None):
    """Fill the statistics widgets for ``data``.

    The header shows the eigenvalue of the current component when
    ``pca_stats`` is given, otherwise the number of pixels in the AOI. The
    minimum, maximum, standard deviation and the 25th/50th/75th percentiles
    are computed with dask and written, rounded to two decimals, into the
    dialog. When the selected statistics layer is the component itself,
    the values are cached in ``self.stats_pc`` and reused on the next call.
    """
    header = self.stats_header
    if pca_stats:
        # Header for a principal component.
        eigenvalues = pca_stats["eigenvals"]
        if eigenvalues is None:
            header.setText("Eigenvalue: --")
            header.setToolTip(
                "Is only available when the components are computed with the plugin"
            )
        else:
            position = self.pc_id - 1
            header.setText("Eigenvalue: {} ({}%)".format(
                round(eigenvalues[position], 2),
                round(pca_stats["eigenvals_%"][position], 2)))
            header.setToolTip(
                "It shows how are the dispersion of the data with respect to its component"
            )
    else:
        # Header for an area of interest.
        pixel_count = data.size if data.size > 1 else 0
        header.setText("Pixels in AOI: {}".format(round(pixel_count, 2)))
        header.setToolTip("")

    # Restore the cached values or compute them.
    if (self.QCBox_StatsLayer.currentText() == self.pc_name
            and self.stats_pc is not None):
        lowest, highest, spread, q25, q50, q75 = self.stats_pc
    else:
        chunked = da.from_array(data, chunks=(8000000, ))
        lowest = da.min(chunked).compute()
        highest = da.max(chunked).compute()
        spread = da.std(chunked).compute()
        q25 = da.percentile(chunked, 25).compute()[0]
        q50 = da.percentile(chunked, 50).compute()[0]
        q75 = da.percentile(chunked, 75).compute()[0]
        if self.QCBox_StatsLayer.currentText() == self.pc_name:
            self.stats_pc = (lowest, highest, spread, q25, q50, q75)

    # Show the values in the dialog.
    for widget, value in (
        (self.stats_min, lowest),
        (self.stats_max, highest),
        (self.stats_std, spread),
        (self.stats_p25, q25),
        (self.stats_p50, q50),
        (self.stats_p75, q75),
    ):
        widget.setText(str(round(value, 2)))
def test_workspace():
    """Create a workspace, add data, a dataset and a session, then delete it.

    The volume in ``./tmp/testvol_4x4x4b.h5`` is rescaled (minimum shifted
    to 0, then divided by the maximum) before being added as the workspace
    data.
    """
    workspace_fpath = "./newws1"
    ws = Workspace(".").create(workspace_fpath)

    with dataset_from_uri("./tmp/testvol_4x4x4b.h5", mode="r") as source:
        chunking = optimal_chunksize(source, Config["computing.chunk_size"])
        volume = da.from_array(source, chunks=chunking)
        volume -= da.min(volume)
        volume /= da.max(volume)
        ws.add_data(volume)

    ws.add_dataset("testds", "float32")
    assert ws.exists(workspace_fpath)
    assert ws.has_data()
    assert ws.available_datasets() == ['testds']

    ws.add_session('newsesh')
    assert ws.has_session('newsesh')

    ws.delete()
def cluster_centroids(data, clusters, k=None):
    """Return centroids of clusters & clusters in data.

    data is a dask array of observations with shape (A, B, ...).

    clusters is a dask array of integers of shape (A,) giving the index
    (from 0 to k-1) of the cluster to which each observation belongs.
    The clusters must all be non-empty. It is computed eagerly, once.

    k is the number of clusters. If omitted, it is deduced from the
    values in the clusters array.

    The result is a (lazy) dask array of shape (k, B, ...) containing the
    centroid of each cluster.

    >>> data = da.from_array(np.array([[12, 10, 87],
    ...                                [ 2, 12, 33],
    ...                                [68, 31, 32],
    ...                                [88, 13, 66],
    ...                                [79, 40, 89],
    ...                                [ 1, 77, 12]]), chunks=3)
    >>> clusters = da.from_array(np.array([1, 1, 2, 2, 0, 1]), chunks=3)
    >>> cluster_centroids(data, clusters).compute()
    array([[79., 40., 89.],
           [ 5., 33., 44.],
           [78., 22., 49.]])
    """
    # Materialise the labels once instead of once per cluster; they are
    # needed as concrete boolean masks below.
    labels = clusters.compute()
    if k is None:
        k = int(labels.max()) + 1

    # `range` replaces the Python-2-only `xrange`.
    centroids = [da.mean(data[labels == i], axis=0) for i in range(k)]
    return da.reshape(da.concatenate(centroids, axis=0),
                      shape=(k, ) + data.shape[1:])
def density_flux(population, total_population, carrying_capacity, distance,
                 csx, csy, **kwargs):
    """
    Redistribute a sub-population using 'density-based dispersion'

    Dispersal is calculated using the following sequence of methods:

    Portions of populations at each element (node, or grid cell) in the study
    area array (raster) are moved to surrounding elements (a neighbourhood)
    within a radius that is defined by the input distance (:math:`d`), as
    presented in the conceptual figure below.

    .. image:: images/density_flux_neighbourhood.png
        :align: center

    .. attention:: No dispersal will occur if the provided distance is less
        than the distance between elements (grid cells) in the model domain,
        as none will be included in the neighbourhood

    The mean density (:math:`\\rho`) of all elements in the neighbourhood is
    calculated as:

    .. math::

       \\rho=\\frac{\\sum_{i=1}^{n} \\frac{pop_T(i)}{k_T(i)}}{n}

    where,
    :math:`pop_T` is the total population (of the entire species) at each
    element (:math:`i`); and\n
    :math:`k_T` is the total carrying capacity for the species

    The density gradient at each element (:math:`\\Delta`) with respect to
    the mean is calculated as:

    .. math::

       \\Delta(i)=\\frac{pop_T(i)}{k_T(i)}-\\rho

    If the centroid element is above the mean
    :math:`[\\Delta(i_0) > 0]`, it is able to release a portion of its
    population to elements in the neighbourhood. The eligible population to
    be received by surrounding elements is equal to the sum of populations at
    elements with negative density gradients, the :math:`candidates`:

    .. math::

       candidates=\\sum_{i=1}^{n} \\Delta(i)[\\Delta(i) < 0]k_T(i)

    The minimum of either the population above the mean at the centroid
    element - :math:`source=\\Delta(i_0)*k_T(i_0)`, or the
    :math:`candidates` are used to determine the total population that is
    dispersed from the centroid element to the other elements in the
    neighbourhood:

    .. math::

       dispersal=min\\{source, candidates\\}

    The population at the centroid element becomes:

    .. math::

       pop_a(i_0)=pop_a(i_0)-\\frac{pop_a(i_0)}{pop_T(i_0)}dispersal

    where, :math:`pop_a` is the age (stage) group population, which is a
    sub-population of the total.

    The populations of the candidate elements in the neighbourhood become (a
    net gain due to negative gradients):

    .. math::

       pop_a(i)=pop_a(i)-\\frac{\\Delta(i)[\\Delta(i) < 0]k_T(i)}{candidates}dispersal\\frac{pop_a(i)}{pop_T(i)}

    :param da.Array population: Sub-population to redistribute (subset of the ``total_population``)
    :param da.Array total_population: Total population
    :param da.Array carrying_capacity: Total Carrying Capacity (k)
    :param float distance: Maximum dispersal distance
    :param float csx: Cell size of the domain in the x-direction
    :param float csy: Cell size of the domain in the y-direction

    .. Attention:: Ensure the cell sizes are in the same units as the specified direction

    :Keyword Arguments:
        **mask** (*array*) --
            A weighting mask that scales dispersal based on the normalized mask value (default: None)
    :return: Redistributed population
    """
    required_arrays = (population, total_population, carrying_capacity)
    if not all(isinstance(arr, da.Array) for arr in required_arrays):
        raise DispersalError('Inputs must be a dask arrays')

    if distance == 0:
        # Nothing can move: hand the input back untouched.
        return population

    # First chunk size of each of the first two axes; reused for the output.
    chunks = tuple(c[0] if c else 0 for c in population.chunks)[:2]

    mask = kwargs.get('mask')
    if mask is None:
        mask = da.ones(shape=population.shape, dtype='float32', chunks=chunks)

    # Rescale the mask to 0..1; a constant mask becomes all ones.
    mask_min = da.min(mask)
    _range = da.max(mask) - mask_min
    mask = da.where(_range > 0, (mask - mask_min) / _range, 1.)

    # Kernel indices and its half-sizes in rows (m) and columns (n).
    kernel_spec = calculate_kernel(distance, csx, csy)
    if kernel_spec is None:
        # Not enough distance to cover a grid cell
        return population
    kernel, m, n = kernel_spec
    m, n = int(m), int(n)

    # Stack the four layers and zero-pad rows/columns by the kernel size.
    layers = da.dstack([population, total_population, carrying_capacity, mask])
    a = da.pad(layers, ((m, m), (n, n), (0, 0)),
               'constant',
               constant_values=0)

    # A stop of -0 would yield an empty slice, so use None for zero padding.
    row_stop = None if m == 0 else -m
    col_stop = None if n == 0 else -n

    # Run the task lazily and keep the unpadded part of its first layer.
    task_result = delayed(density_flux_task)(a, kernel, m, n)
    trimmed = task_result[m:row_stop, n:col_stop, 0]
    output = da.from_delayed(trimmed, population.shape, np.float32)
    return output.rechunk(chunks)
def new_grid_mapping_from_coords(
    x_coords: xr.DataArray,
    y_coords: xr.DataArray,
    crs: Union[str, pyproj.crs.CRS],
    *,
    tile_size: Union[int, Tuple[int, int]] = None,
    tolerance: float = DEFAULT_TOLERANCE,
) -> GridMapping:
    """Create a grid mapping from 1D or 2D x/y coordinate arrays.

    The spatial resolution, regularity, bounding box, tile size, longitude
    convention and j-axis direction are derived from the coordinates and
    passed to ``Coords1DGridMapping`` (1D coordinates) or
    ``Coords2DGridMapping`` (2D coordinates).

    :param x_coords: 1D or 2D array of x coordinates.
    :param y_coords: array of y coordinates; for 2D input it must have the
        same shape and dimensions as ``x_coords``.
    :param crs: coordinate reference system; normalised with
        ``_normalize_crs``.
    :param tile_size: optional tile size; normalised with
        ``_normalize_int_pair``. If not given, it is taken from the chunks
        of the coordinate arrays when they are chunked.
    :param tolerance: positive absolute tolerance used when testing whether
        the coordinate spacing is constant.
    :return: a new instance of the selected grid mapping class.
    :raise RuntimeError: if no positive resolution could be determined.
    """
    crs = _normalize_crs(crs)
    assert_instance(x_coords, xr.DataArray, name='x_coords')
    assert_instance(y_coords, xr.DataArray, name='y_coords')
    assert_true(x_coords.ndim in (1, 2),
                'x_coords and y_coords must be either 1D or 2D arrays')
    assert_instance(tolerance, float, name='tolerance')
    assert_true(tolerance > 0.0, 'tolerance must be greater zero')

    # Variable names come from the coordinate arrays if both are named,
    # otherwise from CRS-dependent defaults.
    if x_coords.name and y_coords.name:
        xy_var_names = str(x_coords.name), str(y_coords.name)
    else:
        xy_var_names = _default_xy_var_names(crs)

    tile_size = _normalize_int_pair(tile_size, default=None)
    is_lon_360 = None  # None means "not yet known"
    if crs.is_geographic:
        # Any x value above 180 is taken as a 0..360 longitude convention.
        is_lon_360 = bool(np.any(x_coords > 180))

    x_res = 0
    y_res = 0

    if x_coords.ndim == 1:
        # We have 1D x,y coordinates
        cls = Coords1DGridMapping

        assert_true(x_coords.size >= 2 and y_coords.size >= 2,
                    'sizes of x_coords and y_coords 1D arrays must be >= 2')

        size = x_coords.size, y_coords.size

        x_dim, y_dim = x_coords.dims[0], y_coords.dims[0]

        x_diff = _abs_no_zero(x_coords.diff(dim=x_dim).values)
        y_diff = _abs_no_zero(y_coords.diff(dim=y_dim).values)

        if not is_lon_360 and crs.is_geographic:
            # A step larger than 180 between neighbours is treated as a
            # crossing of the anti-meridian.
            is_anti_meridian_crossed = np.any(np.nanmax(x_diff) > 180)
            if is_anti_meridian_crossed:
                x_coords = to_lon_360(x_coords)
                # NOTE(review): unlike the first computation above, this
                # passes the DataArray itself (no `.values`) - confirm that
                # `_abs_no_zero` accepts both.
                x_diff = _abs_no_zero(x_coords.diff(dim=x_dim))
                is_lon_360 = True

        # The grid is regular if all spacings match the first one within
        # the tolerance.
        x_res, y_res = x_diff[0], y_diff[0]
        x_diff_equal = np.allclose(x_diff, x_res, atol=tolerance)
        y_diff_equal = np.allclose(y_diff, y_res, atol=tolerance)
        is_regular = x_diff_equal and y_diff_equal
        if is_regular:
            x_res = round_to_fraction(x_res, 5, 0.25)
            y_res = round_to_fraction(y_res, 5, 0.25)
        else:
            # Irregular spacing: use the median spacing instead.
            x_res = round_to_fraction(float(np.nanmedian(x_diff)), 2, 0.5)
            y_res = round_to_fraction(float(np.nanmedian(y_diff)), 2, 0.5)

        if tile_size is None \
                and x_coords.chunks is not None \
                and y_coords.chunks is not None:
            # Default tile size: the largest chunk of each coordinate array.
            tile_size = (max(0, *x_coords.chunks[0]),
                         max(0, *y_coords.chunks[0]))

        # Guess j axis direction
        is_j_axis_up = bool(y_coords[0] < y_coords[-1])

    else:
        # We have 2D x,y coordinates
        cls = Coords2DGridMapping

        assert_true(
            x_coords.shape == y_coords.shape,
            'shapes of x_coords and y_coords'
            ' 2D arrays must be equal')
        assert_true(
            x_coords.dims == y_coords.dims,
            'dimensions of x_coords and y_coords'
            ' 2D arrays must be equal')

        y_dim, x_dim = x_coords.dims

        height, width = x_coords.shape
        size = width, height

        x = da.asarray(x_coords)
        y = da.asarray(y_coords)

        # Differences of each coordinate along both array axes
        # (axis 1 is the x dimension, axis 0 the y dimension).
        x_x_diff = _abs_no_nan(da.diff(x, axis=1))
        x_y_diff = _abs_no_nan(da.diff(x, axis=0))
        y_x_diff = _abs_no_nan(da.diff(y, axis=1))
        y_y_diff = _abs_no_nan(da.diff(y, axis=0))

        if not is_lon_360 and crs.is_geographic:
            is_anti_meridian_crossed = da.any(da.max(x_x_diff) > 180) \
                                       or da.any(da.max(x_y_diff) > 180)
            if is_anti_meridian_crossed:
                x_coords = to_lon_360(x_coords)
                x = da.asarray(x_coords)
                x_x_diff = _abs_no_nan(da.diff(x, axis=1))
                x_y_diff = _abs_no_nan(da.diff(x, axis=0))
                is_lon_360 = True

        is_regular = False

        # x constant along the y dimension and y constant along the x
        # dimension: the 2D arrays may describe a regular grid.
        if da.all(x_y_diff == 0) and da.all(y_x_diff == 0):
            x_res = x_x_diff[0, 0]
            y_res = y_y_diff[0, 0]
            # Only the first and last rows/columns are compared.
            is_regular = \
                da.allclose(x_x_diff[0, :], x_res, atol=tolerance) \
                and da.allclose(x_x_diff[-1, :], x_res, atol=tolerance) \
                and da.allclose(y_y_diff[:, 0], y_res, atol=tolerance) \
                and da.allclose(y_y_diff[:, -1], y_res, atol=tolerance)

        if not is_regular:
            # Let diff arrays have same shape as original by
            # doubling last rows and columns.
            x_x_diff_c = da.concatenate([x_x_diff, x_x_diff[:, -1:]], axis=1)
            y_x_diff_c = da.concatenate([y_x_diff, y_x_diff[:, -1:]], axis=1)
            x_y_diff_c = da.concatenate([x_y_diff, x_y_diff[-1:, :]], axis=0)
            y_y_diff_c = da.concatenate([y_y_diff, y_y_diff[-1:, :]], axis=0)
            # Find resolution via area
            x_abs_diff = da.sqrt(da.square(x_x_diff_c) + da.square(x_y_diff_c))
            y_abs_diff = da.sqrt(da.square(y_x_diff_c) + da.square(y_y_diff_c))
            if crs.is_geographic:
                # Convert degrees into meters
                # (_ER is presumably the Earth radius - confirm).
                x_abs_diff_r = da.radians(x_abs_diff)
                y_abs_diff_r = da.radians(y_abs_diff)
                # NOTE(review): the x extent is built from the cosine of the
                # x difference times the y difference - confirm this is the
                # intended formula.
                x_abs_diff = _ER * da.cos(x_abs_diff_r) * y_abs_diff_r
                y_abs_diff = _ER * y_abs_diff_r
            xy_areas = (x_abs_diff * y_abs_diff).flatten()
            # Non-positive areas are ignored by the nan-aware reductions.
            xy_areas = da.where(xy_areas > 0, xy_areas, np.nan)
            # Get indices of min and max area
            xy_area_index_min = da.nanargmin(xy_areas)
            xy_area_index_max = da.nanargmax(xy_areas)
            # Convert area to edge length
            xy_res_min = math.sqrt(xy_areas[xy_area_index_min])
            xy_res_max = math.sqrt(xy_areas[xy_area_index_max])
            # Empirically weight min more than max
            xy_res = 0.7 * xy_res_min + 0.3 * xy_res_max
            if crs.is_geographic:
                # Convert meters back into degrees
                # print(f'xy_res in meters: {xy_res}')
                xy_res = math.degrees(xy_res / _ER)
                # print(f'xy_res in degrees: {xy_res}')
            # Because this is an estimation, we can round to a nice number
            xy_res = round_to_fraction(xy_res, digits=1, resolution=0.5)
            x_res, y_res = float(xy_res), float(xy_res)

        if tile_size is None and x_coords.chunks is not None:
            # Chunks are ordered (y, x); the tile size is (width, height).
            j_chunks, i_chunks = x_coords.chunks
            tile_size = max(0, *i_chunks), max(0, *j_chunks)

        if tile_size is not None:
            # Re-chunk both coordinate arrays to the tile size.
            tile_width, tile_height = tile_size
            x_coords = x_coords.chunk((tile_height, tile_width))
            y_coords = y_coords.chunk((tile_height, tile_width))

        # Guess j axis direction
        # (a falsy comparison result is turned into None here).
        is_j_axis_up = np.all(y_coords[0, :] < y_coords[-1, :]) or None

    assert_true(x_res > 0 and y_res > 0,
                'internal error: x_res and y_res could not be determined',
                exception_type=RuntimeError)

    x_res, y_res = _to_int_or_float(x_res), _to_int_or_float(y_res)
    # The bounding box extends half a cell beyond the outermost coordinates.
    x_res_05, y_res_05 = x_res / 2, y_res / 2
    x_min = _to_int_or_float(x_coords.min() - x_res_05)
    y_min = _to_int_or_float(y_coords.min() - y_res_05)
    x_max = _to_int_or_float(x_coords.max() + x_res_05)
    y_max = _to_int_or_float(y_coords.max() + y_res_05)

    return cls(x_coords=x_coords,
               y_coords=y_coords,
               crs=crs,
               size=size,
               tile_size=tile_size,
               xy_bbox=(x_min, y_min, x_max, y_max),
               xy_res=(x_res, y_res),
               xy_var_names=xy_var_names,
               xy_dim_names=(str(x_dim), str(y_dim)),
               is_regular=is_regular,
               is_lon_360=is_lon_360,
               is_j_axis_up=is_j_axis_up)
def plot_dataset(X,
                 y,
                 images=None,
                 labels=None,
                 gray=False,
                 save=None,
                 y_original=None):
    """Scatter a 2-D embedding as class-coloured text, optionally with thumbnails.

    ``X`` is rescaled per column to the 0..1 range and every sample is drawn
    as its label ``y[i]``. If ``images`` is given, a thumbnail is placed at
    each sample that is not too close to an already shown thumbnail, with an
    optional text from ``labels`` (indexed by ``y_original[i]`` when
    available, otherwise by ``y[i]``). The figure is written to ``save`` if
    given, otherwise it is shown.
    """
    print('data size {}'.format(X.shape))
    uni_y = len(da.unique(y).compute())

    # Rescale every column to the 0..1 range.
    x_min = da.min(X, 0)
    x_max = da.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    fig = plt.figure(figsize=(27, 18), dpi=100)
    ax = plt.subplot(111)
    n_samples = X.shape[0]

    for i in tqdm(range(n_samples)):
        plt.text(X[i, 0],
                 X[i, 1],
                 str(y[i]),
                 color=plt.cm.Set1(y[i] / uni_y),
                 fontdict={
                     'weight': 'bold',
                     'size': 9
                 })

    # only print thumbnails with matplotlib > 1.0
    if images is not None and hasattr(offsetbox, 'AnnotationBbox'):
        shown_images = da.array([[1., 1.]])  # just something big
        for i in range(n_samples):
            dist = da.sum((X[i] - shown_images)**2, 1)
            if da.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            if labels is not None:
                # Prefer the original label of the sample when available.
                key = y_original[i] if y_original is not None else y[i]
                plt.text(X[i, 0] - 0.01,
                         X[i, 1] - 0.033,
                         labels[key],
                         fontdict={
                             'weight': 'bold',
                             'size': 15
                         })
            shown_images = da.r_[shown_images, [X[i]]]
            if gray:
                thumbnail = offsetbox.OffsetImage(
                    da.expand_dims(util.invert(images[i]), axis=0))
            else:
                thumbnail = offsetbox.OffsetImage(images[i],
                                                  cmap=plt.cm.gray_r)
            ax.add_artist(offsetbox.AnnotationBbox(thumbnail, X[i]))

    plt.xticks([])
    plt.yticks([])
    for item in (fig, ax):
        item.patch.set_visible(False)
    ax.axis('off')

    if save is None:
        plt.show()
    else:
        print('Saving Image {} ...'.format(save))
        plt.title('epoch ' + save.split('.')[0].split()[-1],
                  fontdict={'fontsize': 20},
                  loc='left')
        plt.savefig(save)
        plt.close()

    # Drop the local references and collect garbage.
    del X, y, fig, ax
    gc.collect()
def two_point_stats(arr1,
                    arr2,
                    mask=None,
                    periodic_boundary=True,
                    cutoff=None):
    """Calculate the 2-points stats for two arrays

    The two arrays are (optionally) masked and zero-padded,
    cross-correlated, normalized and finally cropped around the centre.

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      mask: array specifying confidence in the measurement at a pixel
        (n_samples,n_x,n_y). In range [0,1].
      periodic_boundary: whether to assume a periodic boundary (default is
        true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False),
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask = da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """
    # The cutoff can never exceed half of the smallest spatial extent.
    largest_cutoff = int((np.min(arr1.shape[1:]) - 1) / 2)
    cutoff = largest_cutoff if cutoff is None else min(cutoff, largest_cutoff)

    def single_spatial_chunk(x):
        # Keep the sample chunking, one chunk across the other axes.
        return da.rechunk(x, (x.chunks[0], ) + x.shape[1:])

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        single_spatial_chunk,
    )
    padder = nonperiodic_padder if not periodic_boundary else identity

    if mask is None:
        mask_array = identity
        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            def normalize(x):
                return x / arr1[0].size
        else:

            def normalize(x):
                return x / auto_correlation(padder(np.ones_like(arr1)))
    else:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        def mask_array(arr):
            return arr * mask

        def normalize(x):
            return x / auto_correlation(padder(mask))

    pipeline = sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )
    return pipeline([arr1, arr2])
def two_point_stats(arr1, arr2, periodic_boundary=True, cutoff=None, mask=None):
    r"""Calculate the 2-points stats for two arrays

    The discretized two point statistics are given by

    .. math::

       f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l']

    where :math:`f[r \; \vert \; l, l']` is the conditional
    probability of finding the local states :math:`l` and :math:`l'`
    at a distance and orientation away from each other defined by the
    vector :math:`r`. `See this paper for more details on the
    notation. <https://doi.org/10.1007/s40192-017-0089-0>`_

    The array ``arr1[i]`` (state :math:`l`) is correlated with
    ``arr2[i]`` (state :math:`l'`) for each sample ``i``. Both arrays
    must have the same number of samples and nominal states (integer
    value) or continuous variables.

    To calculate multiple different correlations for each sample, see
    :func:`~pymks.correlations_multiple`.

    To use ``two_point_stats`` as part of a Scikit-learn pipeline, see
    :class:`~pymks.TwoPointCorrelation`.

    Args:
      arr1: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      arr2: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      periodic_boundary: whether to assume a periodic boundary
        (default is ``True``)
      cutoff: the subarray of the 2 point stats to keep
      mask: array specifying confidence in the measurement at a pixel,
        shape ``(n_samples,n_x,n_y)``. In range [0,1].

    Returns:
      the snipped 2-points stats

    Raises:
      RuntimeError: if any value of ``mask`` lies outside ``[0, 1]``

    If both arrays are Dask arrays then a Dask array is returned.

    >>> out = two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... )
    >>> out.chunks
    ((2,), (5,))
    >>> out.shape
    (2, 5)

    If either of the arrays are Numpy then a Numpy array is returned.

    >>> two_point_stats(
    ...     np.arange(10).reshape(2, 5),
    ...     np.arange(10).reshape(2, 5),
    ... )
    array([[ 3.,  4.,  6.,  4.,  3.],
           [48., 49., 51., 49., 48.]])

    Test masking

    >>> array = da.array([[[1, 0, 0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False)[:, 1:-1, 1:-1],
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask = da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask=mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """  # noqa: E501
    # Per non-sample axis: 1 where the length is even, 0 where it is odd,
    # and the half-length used as the base padding amount.
    spatial_shape = np.array(arr1.shape[1:])
    n_is_even = 1 - spatial_shape % 2
    padding = spatial_shape // 2

    nonperiodic_padder = sequence(
        dapad(
            # Even-length axes get one extra trailing element of padding.
            pad_width=[(0, 0)] + list(zip(padding, padding + n_is_even)),
            mode="constant",
            constant_values=0,
        ),
        # Keep the sample-axis chunking and make every other axis one chunk.
        # NOTE(review): presumably needed by the correlation step - confirm.
        lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        # Evaluate both bounds in a single pass over the mask graph instead
        # of triggering two independent computations.
        mask_max, mask_min = da.compute(da.max(mask), da.min(mask))
        if mask_max > 1.0 or mask_min < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        def mask_array(arr):
            return arr * mask

        def normalize(x):
            return x / auto_correlation(padder(mask))

    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = sequence(
                lambda x: x / arr1[0].size,
                dapad(
                    # Wrap-pad one trailing element on even-length axes only.
                    pad_width=[(0, 0)] + list(zip(0 * n_is_even, n_is_even)),
                    mode="wrap",
                ),
                lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
            )
        else:

            def normalize(x):
                return x / auto_correlation(padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
def predict_xr(
    model,
    input_xr,
    chunk_size=None,
    persist=True,
    proba=False,
    clean=False,
    return_input=False,
):
    """
    Run an estimator's ``predict`` (and optionally ``predict_proba``) over
    an xarray dataset in parallel by wrapping it in dask-ml's
    ``ParallelPostFit``.

    Last modified: September 2020

    Parameters
    ----------
    model : scikit-learn model or compatible object
        Wrapped in ``ParallelPostFit``; its ``predict`` method is always
        called and ``predict_proba`` is called when ``proba=True``.
    input_xr : xarray.Dataset
        Dask-backed dataset with 'x' and 'y' dimensions and a ``geobox``
        attribute. Every data variable is flattened into one feature
        column.
        NOTE(review): the reshape back to ``(len(y), len(x))`` assumes
        each variable holds exactly ``y * x`` values in (y, x) order -
        confirm against callers.
    chunk_size : int, optional
        Dask chunk size applied to each flattened variable. When None,
        the product of the first 'x' chunk and the first 'y' chunk of
        ``input_xr`` is used.
    persist : bool
        Only has an effect together with ``proba=True``: the flattened
        features are persisted before predicting so they are not loaded
        once for the classes and again for the probabilities.
    proba : bool
        If True, also add a 'Probabilities' variable holding the largest
        per-sample value returned by ``predict_proba``, times 100.
    clean : bool
        If True, replace non-finite values with 0 in the features, the
        predictions and the probabilities.
    return_input : bool
        If True, the flattened features are reshaped back onto the
        spatial grid and merged into the output under the names of the
        data variables of ``input_xr``.

    Returns
    ----------
    output_xr : xarray.Dataset
        Dataset with a 'Predictions' variable on dims ('y', 'x'), plus
        'Probabilities' and the input features when requested, passed
        through ``assign_crs`` with the CRS of ``input_xr.geobox``.

    """
    if chunk_size is None:
        chunking = input_xr.chunks
        chunk_size = int(chunking["x"][0]) * int(chunking["y"][0])

    def _finite_or_zero(values):
        # Swap NaN / +-Inf entries for 0 and keep everything else.
        return da.where(da.isfinite(values), values, 0)

    def _to_grid(flat_values):
        # Fold a flat per-pixel result back onto the (y, x) grid.
        grid = flat_values.reshape(len(y), len(x))
        return xr.DataArray(grid, coords={"x": x, "y": y}, dims=["y", "x"])

    # convert model to dask predict
    model = ParallelPostFit(model)

    x, y, crs = input_xr.x, input_xr.y, input_xr.geobox.crs

    # One flattened, rechunked 1-D array per data variable.
    per_variable = [
        input_xr[var_name].data.flatten().rechunk(chunk_size)
        for var_name in input_xr.data_vars
    ]

    # Stack to (n_variables, n_pixels), then transpose so that rows are
    # pixels and columns are features.
    input_data_flattened = da.array(per_variable).transpose()

    if clean:
        input_data_flattened = _finite_or_zero(input_data_flattened)

    if proba and persist:
        # persisting data so we don't require loading all the data twice
        input_data_flattened = input_data_flattened.persist()

    # apply the classification
    print(" predicting...")
    out_class = model.predict(input_data_flattened)

    # Mask out NaN or Inf values in results
    if clean:
        out_class = _finite_or_zero(out_class)

    output_xr = _to_grid(out_class).to_dataset(name="Predictions")

    if proba:
        print(" probabilities...")
        class_proba = model.predict_proba(input_data_flattened)

        # keep the highest class probability per pixel, as a percentage
        out_proba = da.max(class_proba, axis=1) * 100.0

        if clean:
            out_proba = _finite_or_zero(out_proba)

        output_xr["Probabilities"] = _to_grid(out_proba)

    if return_input:
        print(" input features...")
        # unflatten the input_data_flattened array and append
        # to the output_xr containing the predictions
        stacked = input_xr.to_array().stack(z=["y", "x"])

        # trailing (per-pixel) shape; empty tuple when there is none
        output_px_shape = input_data_flattened.shape[1:]
        feature_dims = [
            "output_dim_" + str(idx) for idx in range(len(output_px_shape))
        ]

        reshaped = input_data_flattened.reshape(
            (len(stacked.z), *output_px_shape))

        # set the stacked coordinate to match the input
        output_features = xr.DataArray(
            reshaped,
            coords={"z": stacked["z"]},
            dims=["z"] + feature_dims,
        ).unstack()

        # convert to dataset and rename arrays
        output_features = output_features.to_dataset(dim="output_dim_0")
        original_names = list(input_xr.data_vars)
        output_features = output_features.rename(
            dict(zip(output_features.data_vars, original_names)))

        # merge with predictions
        output_xr = xr.merge([output_xr, output_features], compat="override")

    return assign_crs(output_xr, str(crs))
xds_from_ms( args.ms, # We only need the antenna and uvw columns columns=("UVW", "ANTENNA1", "ANTENNA2"), group_cols=[], index_cols=[], chunks={"row": 1e6})) # Should only have one dataset assert len(xds) == 1 # The unique baseline for one scan is same for every scan in the Measurement Set ds = xds[0] # Calculate Maximum baseline uvw = ds.UVW.data bl_max_dist = da.sqrt(da.max(da.sum(uvw**2, axis=1))) # bl_max_dist = da.stack(ds.UVW.data, my_ds.UVW.data for my_ds in xds, axis=1) # Need ant1 and ant2 to be int32 for the compound int64 below # to work assert ds.ANTENNA1.dtype == ds.ANTENNA2.dtype == np.int32 bl = da.stack([ds.ANTENNA1.data, ds.ANTENNA2.data], axis=1) # convert array to dtype int64 from int32 bl = bl.rechunk(-1, 2).view(np.int64) # get the unique values ubl = da.unique(bl) # dask compute, convert back to int32 and reshape ubl = da.compute(ubl)[0].view(np.int32).reshape(-1, 2)
# Should only be one correlation assert psf.shape[2] == 1, psf.shape # FFT the PSF psf_fft = da.fft.fftshift(da.fft.ifft2(da.fft.ifftshift(psf[:, :, 0]))) # Dirty image composed of the diagonal correlations if ncorr == 1: dirty = dirty_fft[0].real else: dirty = (dirty_fft[0].real + dirty_fft[ncorr - 1].real) * 0.5 # Normalised Amplitude psf = da.absolute(psf_fft.real) psf = (psf / da.max(psf)) # Scale the dirty image by the psf # x4 because the N**2 FFT normalization factor # on a square image double the size dirty = dirty / (da.max(psf) * 4.) # Visualise profiling if we have bokeh try: import bokeh # noqa except ImportError: from dask.diagnostics import ProgressBar with ProgressBar(): dirty = dirty.compute() else: