def coarsen(f): ''' Create data pyramid. ''' grid = f['resolutions']['1']['values'] top_n = grid.shape[0] tile_size = 256 max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) max_width = tile_size * 2**max_zoom chunk_size = tile_size * 16 curr_size = grid.shape dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size)) r = f['resolutions'] curr_resolution = 1 while curr_resolution < 2**max_zoom: curr_size = tuple(np.array(curr_size) / 2) print('coarsening') curr_resolution *= 2 print("curr_size:", curr_size) g = r.create_group(str(curr_resolution)) values = g.require_dataset('values', curr_size, dtype='f4', compression='lzf', fillvalue=np.nan) dask_dset = dask_dset.rechunk((chunk_size, chunk_size)) dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2}) da.store(dask_dset, values)
def _vis_xformer(index):
    """ Transform katdal visibilities indexed by ``index`` into AIPS visibilities. """
    if isinstance(self._katds.vis, DaskLazyIndexer):
        arrays = [self._katds.vis, self._katds.weights, self._katds.flags]
        vis, weights, flags = [dask_getitem(array.dataset, np.s_[index, :, :])
                               for array in arrays]
    else:
        vis = da.from_array(self._katds.vis[index])
        weights = da.from_array(self._katds.weights[index])
        flags = da.from_array(self._katds.flags[index])

    # Apply flags by negating weights
    weights = da.where(flags, -32767.0, weights)

    # Split complex vis dtype into real and imaginary parts
    vis_dtype = vis.dtype.type(0).real.dtype
    vis = vis.view(vis_dtype).reshape(vis.shape + (2,))

    out_array = np.empty(weights.shape + (3,), dtype=vis_dtype)
    da.store([vis, weights],
             [out_array[..., 0:2], out_array[..., 2]],
             lock=False)
    return out_array
def store(self, dask, raster_path, **kwargs): """ Store a computed dask array into a new raster path. The format of the output raster will be interpreted by GDAL, although additional parameters will be implicitly added for a GeoTiff output to make it cloud optimized. .. Note:: The dask should have been created using the same Mosaic instance that is used to store it. Also, if the dask graph includes subsequent reductions or slicing, the dask may not fit the mosaic spatial definition. :param dask.Array dask: Input dask array :param str raster_path: Input path to a raster source. :param kwargs: Used for additional creation options, ex. { 'BIGTIFF': 'YES' } """ raster = create_raster_source(raster_path, self.top, self.left, self.shape, self.csx, self.csy, self.sr, self.dtype, self.nodata, self.chunks, **kwargs) da.store([dask.reshape(self.shape).rechunk(self.chunk_tuple)], [raster]) # Create internal overviews overview_resampling_method = kwargs.get('overview_resampling_method', 'nearest') cmd = f'gdaladdo -r {overview_resampling_method} "{raster_path}"' subprocess.call(cmd, shell=True)
def coarsen(f, type, tile_size=256): ''' Create data pyramid. ''' grid = f['resolutions']['1'][type] top_n = grid.shape[0] max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) chunk_size = tile_size * 16 curr_size = grid.shape dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size)) r = f['resolutions'] curr_resolution = 1 while curr_resolution < 2 ** max_zoom: curr_size = tuple(np.array(curr_size) / 2) print('coarsening') curr_resolution *= 2 print("curr_size:", curr_size) group_name = '{}{}'.format( curr_resolution, '' if type == 'values' else '-' + type ) g = r.create_group(group_name) values = g.require_dataset(type, curr_size, dtype='f4', compression='lzf', fillvalue=np.nan) dask_dset = dask_dset.rechunk((chunk_size, chunk_size)) dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2}) da.store(dask_dset, values)
def _save_in_hdf5_object(self, f, tag="tomo"): if "class_name" not in f.attrs.keys(): f.attrs["class_name"] = self.__class__.__name__ f.attrs["module_name"] = self.__module__ if "params" not in f.keys() and self.params is not None: self.params._save_as_hdf5(hdf5_parent=f) if tag in f: grp = f[tag] else: grp = f.create_group(tag) grp.attrs["class_name"] = self.__class__.__name__ grp.attrs["module_name"] = self.__module__ for attr in self._attrs_to_save: grp.attrs[attr] = getattr(self, attr) for k in self._keys_to_save: data = getattr(self, k) print(f"Saving {type(data)} {k}...") if isinstance(data, da.core.Array): dataset = grp.require_dataset(f"/{tag}/{k}", shape=data.shape, dtype=data.dtype) da.store(data, dataset) else: grp.create_dataset(k, data=data)
def copy(source: GreyOrdinates, target: GreyOrdinates): """ Copies information from source to target. """ if source.data.shape != target.data.shape: raise ValueError("Source and target shape do not match") if isinstance(target.data, zarr.Array): target.data[:] = source.data else: chunks = getattr(target.data, 'chunks', getattr(source.data, 'chunks', 'auto')) for dataset in target.data, source.data: if not hasattr(dataset, 'chunks'): if chunks == 'auto': chunks = [1, 1] else: chunks = list(chunks) chunks[np.argmin(dataset.strides)] = None logger.info( f"Adopted chunk size: {tuple(chunks)} for CIFTI with shape {tuple(source.data.shape)}" ) data = source.as_dask(tuple(chunks)) da.store(data, target.data)
def write_raster(path, array, **kwargs): """Write a dask array to a raster file If array is 2d, write array on band 1. If array is 3d, write data on each band Arguments: path {string} -- path of raster to write array {dask.array.Array} -- band array kwargs {dict} -- keyword arguments to delegate to rasterio.open Examples: # Write a single band raster >> red_band = read_raster_band("test.tif", band=1) >> write_raster("new.tif", red_band) # Write a multiband raster >> img = read_raster("test.tif") >> new_img = process(img) >> write_raster("new.tif", new_img) """ if len(array.shape) != 2 and len(array.shape) != 3: raise TypeError('invalid shape (must be either 2d or 3d)') if is_dask_collection(array): with RasterioDataset(path, 'w', **kwargs) as dst: da.store(array, dst, lock=True) else: with rasterio.open(path, 'w', **kwargs) as dst: if len(array.shape) == 2: dst.write(array, 1) else: dst.write(array)
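# Illustrative sketch, not the original module's code: `RasterioDataset` above is
# assumed to be a thin adapter that gives a rasterio dataset the __setitem__
# interface da.store needs. A minimal 2D, single-band version might look like
# this; the class name and behaviour here are assumptions, not rasterio's API.
import rasterio
from rasterio.windows import Window

class RasterioDataset:
    """Adapter so da.store can write chunk-by-chunk into a rasterio dataset."""

    def __init__(self, path, mode, **kwargs):
        self.dataset = rasterio.open(path, mode, **kwargs)

    def __setitem__(self, key, item):
        # key is the tuple of slices describing the chunk being stored
        rows, cols = key[-2], key[-1]
        self.dataset.write(item, 1, window=Window.from_slices(rows, cols))

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.dataset.close()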
def overwrite_dataset(group, data, key, signal_axes=None, **kwds): if signal_axes is None: chunks = True else: chunks = get_signal_chunks(data.shape, data.dtype, signal_axes) maxshape = tuple(None for _ in data.shape) got_data = False while not got_data: try: these_kwds = kwds.copy() these_kwds.update(dict(shape=data.shape, dtype=data.dtype, exact=True, maxshape=maxshape, chunks=chunks, shuffle=True,)) dset = group.require_dataset(key, **these_kwds) got_data = True except TypeError: # if the shape or dtype/etc do not match, # we delete the old one and create new in the next loop run del group[key] if dset == data: # just a reference to already created thing pass else: if isinstance(data, da.Array): da.store(data.rechunk(dset.chunks), dset) else: da.store(da.from_array(data, chunks=dset.chunks), dset)
def get(cls, arrays, keep, out=None): """Extract several arrays from the underlying dataset. This is a variant of :meth:`__getitem__` that pulls from several arrays jointly. This can be significantly more efficient if intermediate dask nodes can be shared. Parameters ---------- arrays : list of :class:`DaskLazyIndexer` Arrays to index keep : NumPy index expression Second-stage index as a valid index or slice specification (supports arbitrary slicing or advanced indexing on any dimension) out : list of :class:`np.ndarray` If specified, output arrays in which to store results. It must be the same length as `arrays` and each array must have the appropriate shape and dtype. Returns ------- out : sequence of :class:`numpy.ndarray` Extracted output array (computed from the final dask version) """ kept = [dask_getitem(array.dataset, keep) for array in arrays] # Workaround for https://github.com/dask/dask/issues/3595 # This is equivalent to da.compute(kept), but does not allocate # excessive memory. if out is None: out = [np.empty(array.shape, array.dtype) for array in kept] da.store(kept, out, lock=False) return out
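# Illustrative, self-contained sketch of the same workaround pattern used above:
# materialise several dask arrays that share intermediate graph nodes into
# pre-allocated NumPy buffers with a single store() pass (example data assumed).
import numpy as np
import dask.array as da

x = da.random.random((1000, 64), chunks=(100, 64))
a, b = x * 2, x + 1                      # both share the graph of x
outs = [np.empty(arr.shape, arr.dtype) for arr in (a, b)]
da.store([a, b], outs, lock=False)       # single evaluation of the shared graph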
def coarsen(f, tile_size=256): """ Create data pyramid. """ grid = f["resolutions"]["1"]["values"] top_n = grid.shape[0] max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) max_width = tile_size * 2**max_zoom chunk_size = tile_size * 16 curr_size = grid.shape dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size)) r = f["resolutions"] curr_resolution = 1 while curr_resolution < 2**max_zoom: curr_size = tuple(np.array(curr_size) / 2) print("coarsening") curr_resolution *= 2 print("curr_size:", curr_size) g = r.create_group(str(curr_resolution)) values = g.require_dataset("values", curr_size, dtype="f4", compression="lzf", fillvalue=np.nan) dask_dset = dask_dset.rechunk((chunk_size, chunk_size)) dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2}) da.store(dask_dset, values)
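# Illustrative usage sketch for the pyramid builder above; the file name and the
# /resolutions/1/values layout are assumptions taken from the function body.
import h5py

with h5py.File("data.multires.h5", "a") as f:
    coarsen(f, tile_size=256)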
def load(dataset, indices, vis, weights, flags): """Load data from lazy indexers into existing storage. This is optimised for the MVF v4 case where we can use dask directly to eliminate one copy, and also load vis, flags and weights in parallel. In older formats it causes an extra copy. Parameters ---------- dataset : :class:`katdal.DataSet` Input dataset, possibly with an existing selection indices : tuple Index expression for subsetting the dataset vis, weights, flags : array-like Outputs, which must have the correct shape and type """ if isinstance(dataset.vis, DaskLazyIndexer): da.store([ dataset.vis.dask_getitem(indices), dataset.weights.dask_getitem(indices), dataset.flags.dask_getitem(indices) ], [vis, weights, flags], lock=False) else: vis[:] = dataset.vis[indices] weights[:] = dataset.weights[indices] flags[:] = dataset.flags[indices]
def dataarray_to_gridded_product(ds, grid_def, overwrite_existing=False): info = ds.attrs.copy() info.pop("area", None) if ds.ndim == 3: # RGB composite if ds.shape[0] in [3, 4]: channels = ds.shape[0] else: # unpreferred array orientation channels = ds.shape[-1] ds = np.rollaxis(ds, 2) else: channels = 1 if np.issubdtype(np.dtype(ds.dtype), np.floating): dtype = np.float32 else: dtype = ds.dtype p2g_metadata = { "product_name": info["name"], "satellite": info["platform_name"].lower(), "instrument": info["sensor"].lower() if isinstance(info["sensor"], str) else list( info["sensor"])[0].lower(), "data_kind": info["standard_name"], "begin_time": info["start_time"], "end_time": info["end_time"], "fill_value": np.nan, # "swath_columns": cols, # "swath_rows": rows, "rows_per_scan": info["rows_per_scan"], "data_type": dtype, "channels": channels, "grid_definition": grid_def, } info.update(p2g_metadata) filename = info["name"] + ".dat" info["grid_data"] = filename if os.path.isfile(filename): if not overwrite_existing: LOG.error("Binary file already exists: %s" % (filename, )) raise RuntimeError("Binary file already exists: %s" % (filename, )) else: LOG.warning("Binary file already exists, will overwrite: %s", filename) p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape) da.store(ds.data.astype(dtype), p2g_arr) return containers.GriddedProduct(**info)
def __getitem__(self, keep):
    kept = self.dask_getitem(keep)
    # Workaround for https://github.com/dask/dask/issues/3595
    # This is equivalent to kept.compute(), but does not
    # allocate excessive memory.
    out = np.empty(kept.shape, kept.dtype)
    da.store(kept, out, lock=False)
    return out
def write_data_variables(data_vars, nco):
    for name, variable in data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
    nco.sync()
def _store_data(data, dset, group, key, chunks):
    if isinstance(data, da.Array):
        if data.chunks != dset.chunks:
            data = data.rechunk(dset.chunks)
        da.store(data, dset)
    elif data.flags.c_contiguous:
        dset.write_direct(data)
    else:
        dset[:] = data
def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds): if chunks is None: if isinstance(data, da.Array): # For lazy dataset, by default, we use the current dask chunking chunks = tuple([c[0] for c in data.chunks]) else: # If signal_axes=None, use automatic h5py chunking, otherwise # optimise the chunking to contain at least one signal per chunk chunks = get_signal_chunks(data.shape, data.dtype, signal_axes) if np.issubdtype(data.dtype, np.dtype('U')): # Saving numpy unicode type is not supported in h5py data = data.astype(np.dtype('S')) if data.dtype == np.dtype('O'): # For saving ragged array # http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data group.require_dataset(key, chunks, dtype=h5py.special_dtype(vlen=data[0].dtype), **kwds) group[key][:] = data[:] maxshape = tuple(None for _ in data.shape) got_data = False while not got_data: try: these_kwds = kwds.copy() these_kwds.update(dict(shape=data.shape, dtype=data.dtype, exact=True, maxshape=maxshape, chunks=chunks, shuffle=True,)) # If chunks is True, the `chunks` attribute of `dset` below # contains the chunk shape guessed by h5py dset = group.require_dataset(key, **these_kwds) got_data = True except TypeError: # if the shape or dtype/etc do not match, # we delete the old one and create new in the next loop run del group[key] if dset == data: # just a reference to already created thing pass else: _logger.info(f"Chunks used for saving: {dset.chunks}") if isinstance(data, da.Array): if data.chunks != dset.chunks: data = data.rechunk(dset.chunks) da.store(data, dset) elif data.flags.c_contiguous: dset.write_direct(data) else: dset[:] = data
def sync(self):
    if self.sources:
        import dask.array as da
        import dask
        if StrictVersion(dask.__version__) > StrictVersion('0.8.1'):
            da.store(self.sources, self.targets, lock=threading.Lock())
        else:
            da.store(self.sources, self.targets)
        self.sources = []
        self.targets = []
def sync(self):
    if self.sources:
        import dask.array as da
        import dask
        if LooseVersion(dask.__version__) > LooseVersion('0.8.1'):
            da.store(self.sources, self.targets, lock=GLOBAL_LOCK)
        else:
            da.store(self.sources, self.targets)
        self.sources = []
        self.targets = []
def sync(self): if self.sources: import dask.array as da import dask if StrictVersion(dask.__version__) > StrictVersion("0.8.1"): da.store(self.sources, self.targets, lock=GLOBAL_LOCK) else: da.store(self.sources, self.targets) self.sources = [] self.targets = []
def area_to_swath_def(area, overwrite_existing=False): lons = area.lons lats = area.lats name = area.name name = name.replace(":", "") if lons.ndim == 1: rows, cols = lons.shape[0], 1 else: rows, cols = lons.shape info = { "swath_name": name, "longitude": name + "_lon.dat", "latitude": name + "_lat.dat", "swath_rows": rows, "swath_columns": cols, "data_type": lons.dtype, "fill_value": np.nan, } if hasattr(area, "attrs"): info.update(area.attrs) # Write lons to disk filename = info["longitude"] if os.path.isfile(filename): if not overwrite_existing: LOG.error("Binary file already exists: %s" % (filename, )) raise RuntimeError("Binary file already exists: %s" % (filename, )) else: LOG.warning("Binary file already exists, will overwrite: %s", filename) LOG.info("Writing longitude data to disk cache...") lon_arr = np.memmap(filename, mode="w+", dtype=lons.dtype, shape=lons.shape) da.store(lons.data, lon_arr) # Write lats to disk filename = info["latitude"] if os.path.isfile(filename): if not overwrite_existing: LOG.error("Binary file already exists: %s" % (filename, )) raise RuntimeError("Binary file already exists: %s" % (filename, )) else: LOG.warning("Binary file already exists, will overwrite: %s", filename) LOG.info("Writing latitude data to disk cache...") lat_arr = np.memmap(filename, mode="w+", dtype=lats.dtype, shape=lats.shape) da.store(lats.data, lat_arr) return containers.SwathDefinition(**info)
def get_scalar_outputs(dobj, nelem_in_yr, var_fcast, verif_data_attr, out_types, use_dask=False, ): if use_dask: truth_data = dobj.reset_data(verif_data_attr) else: truth_data = getattr(dobj, verif_data_attr) truth_1yr = truth_data[nelem_in_yr:] truth_init = truth_data[:-nelem_in_yr] curr_var_output = {} for out_type in out_types: fcast_factor, verif_factor = get_scalar_factor(dobj, out_type, verif_data_attr) var_out = var_fcast @ fcast_factor truth_init_out = truth_init @ verif_factor truth_1yr_out = truth_1yr @ verif_factor # Standardize PDO Index relative to truth output if out_type == 'pdo': truth_1yr_out, std_dev = _standardize_series(truth_1yr_out) var_out, _ = _standardize_series(var_out, std_dev=std_dev) truth_init_out, _ = _standardize_series(truth_init_out, std_dev=std_dev) if use_dask: t_truth_1yr_out = np.empty(truth_1yr_out.shape) t_truth_init_out = np.empty(truth_init_out.shape) dask_vars = [truth_1yr_out, truth_init_out] dask_outs = [t_truth_1yr_out, t_truth_init_out] if ST.is_dask_array(var_out): t_var_out = np.empty(var_out.shape) dask_vars.append(var_out) dask_outs.append(t_var_out) da.store(dask_vars, dask_outs) truth_1yr_out = t_truth_1yr_out truth_init_out = t_truth_init_out if ST.is_dask_array(var_out): var_out = t_var_out curr_var_output[out_type] = {'fcast': var_out, 't0': truth_init_out, '1yr': truth_1yr_out} return curr_var_output
def save_image(self, img, filename=None, compute=True, dtype=None,
               fill_value=None, **kwargs):
    filename = filename or self.get_filename(data_type=dtype_to_str(dtype),
                                             rows=img.data.shape[0],
                                             columns=img.data.shape[1],
                                             **img.data.attrs)
    data = self._prep_data(img.data, dtype, fill_value)
    logger.info("Saving product %s to binary file %s",
                img.data.attrs["p2g_name"], filename)
    dst = np.memmap(filename, shape=img.data.shape, dtype=dtype, mode="w+")
    if compute:
        da.store(data, dst)
        return
    return [[data], [dst]]
def do_stack_task(config, task):
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)
    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    tile = task['tile']

    data = datacube.api.GridWorkflow.load(tile,
                                          dask_chunks=config['storage']['chunking'])

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    nco = create_netcdf_storage_unit(output_filename, data.crs, data.coords,
                                     data.data_vars, variable_params,
                                     global_attributes)

    for name, variable in data.data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)

    nco.sync()
    nco.close()

    def update_dataset_location(labels, dataset):
        new_dataset = copy.copy(dataset)
        new_dataset.local_uri = output_filename.absolute().as_uri()
        return [new_dataset]

    updated_datasets = xr_apply(unwrapped_datasets, update_dataset_location, dtype='O')
    new_tile = datacube.api.Tile(sources=updated_datasets, geobox=tile.geobox)
    new_data = datacube.api.GridWorkflow.load(new_tile,
                                              dask_chunks=config['storage']['chunking'])

    if not data.identical(new_data):
        _LOG.error("Mismatch found for %s, not indexing", output_filename)
        raise ValueError("Mismatch found for %s, not indexing" % output_filename)

    return unwrapped_datasets, output_filename.absolute().as_uri()
def store(sources, targets):
    """
    Adapted from dask.array.store

    :param sources: sources dask arrays
    :param targets: target data store locations
    :return: None
    """
    # For debugging
    # -------------
    # for source, target in zip(sources, targets):
    #     da.store(source, target, compute=True)
    # return
    # -------------
    da.store(sources, targets, compute=True)
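# Illustrative usage sketch for the wrapper above: write two dask arrays into
# two pre-created HDF5 datasets in one pass (file and dataset names assumed).
import h5py
import numpy as np
import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 250))
y = (x * 2).astype("f4")
with h5py.File("out.h5", "w") as f:
    dx = f.create_dataset("x", shape=x.shape, dtype=x.dtype)
    dy = f.create_dataset("y", shape=y.shape, dtype=y.dtype)
    store([x, y], [dx, dy])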
def da_yxbt_sink( bands: Tuple[da.Array, ...], chunks: Tuple[int, ...], name="yxbt" ) -> da.Array: """ each band is in <t,y,x> output is <y,x,b,t> eval(bands) |> transpose(YXBT) |> Store(RAM) |> DaskArray(RAM, chunks) """ tk = tokenize(*bands) b = bands[0] dtype = b.dtype nt, ny, nx = b.shape nb = len(bands) shape = (ny, nx, nb, nt) token = Cache.dask_new(shape, dtype, f"{name}_alloc") sinks = [dask.delayed(_YXBTSink)(token, idx) for idx in range(nb)] fut = da.store(bands, sinks, lock=False, compute=False) sink_name = f"{name}_collect-{tk}" dsk = dict(fut.dask) dsk[sink_name] = (lambda *x: x[0], token.key, *fut.dask[fut.key]) dsk = HighLevelGraph.from_collections(sink_name, dsk, dependencies=sinks) token_done = Delayed(sink_name, dsk) return _da_from_mem(token_done, shape=shape, dtype=dtype, chunks=chunks, name=name)
def split_hdf5_multiple(arr, out_dirpath, nb_blocks, file_list): """ Arguments: ---------- arr: Array to split file_list: Empty list to store output files' objects. nb_blocks: Nb blocks we want to extract. None = all blocks. """ arr_dict = get_arr_chunks( arr, nb_blocks, as_dict=True) # get array blocks as dask array objects datasets = list() arr_list = list() for key, arr_block in arr_dict.items(): i, j, k = key filename = f'{i}_{j}_{k}.hdf5' filepath = os.path.join(out_dirpath, filename) if os.path.isfile(filepath): os.remove(filepath) file_list.append(h5py.File(filepath, 'w')) datasets.append(file_list[-1].create_dataset('/data', shape=arr_block.shape)) arr_list.append(arr_block) return da.store(arr_list, datasets, compute=False)
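# Illustrative usage sketch for the splitter above: the returned delayed store
# still has to be computed, and the files collected in `file_list` closed
# afterwards (the input dask array `arr` and output directory are assumptions).
file_list = []
task = split_hdf5_multiple(arr, "/tmp/blocks", nb_blocks=None, file_list=file_list)
task.compute()
for f in file_list:
    f.close()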
def da_yxbt_sink(bands: Tuple[da.Array, ...], chunks: Tuple[int, ...], name="yxbt") -> da.Array: """ each band is in <t,y,x> output is <y,x,b,t> eval(bands) |> transpose(YXBT) |> Store(RAM) |> DaskArray(RAM, chunks) """ b = bands[0] dtype = b.dtype nt, ny, nx = b.shape nb = len(bands) shape = (ny, nx, nb, nt) token = dask.delayed(Cache.new)(shape, dtype) sinks = [dask.delayed(_YXBTSink)(token, idx) for idx in range(nb)] fut = da.store(bands, sinks, lock=False, compute=False) return _da_from_mem(with_deps(token, fut), shape=shape, dtype=dtype, chunks=chunks, name=name)
def da_mem_sink(xx: da.Array, chunks: Tuple[int, ...], name="memsink") -> da.Array: """ It's a kind of fancy rechunk for special needs. Assumptions - Single worker only - ``xx`` can fit in RAM of the worker Note that every output chunk depends on ALL of input chunks. On some Dask worker: - Fully evaluate ``xx`` and serialize to RAM - Present in RAM view of the result with a different chunking regime A common use case would be to load a large collection (>50% of RAM) that needs to be processed by some non-Dask code as a whole. A simple ``do_stuff(xx.compute())`` would not work as duplicating RAM is not an option in that scenario. Normal rechunk might also run out of RAM and introduces large memory copy overhead as all input chunks need to be cached then re-assembled into a different chunking structure. """ token = dask.delayed(Cache.new)(xx.shape, xx.dtype) sink = dask.delayed(CachedArray)(token) fut = da.store(xx, sink, lock=False, compute=False) return _da_from_mem(with_deps(token, fut), shape=xx.shape, dtype=xx.dtype, chunks=chunks, name=name)
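# Illustrative, self-contained sketch of the same idea without the odc-specific
# Cache/CachedArray helpers: evaluate once into a single RAM buffer, then
# re-expose it to dask with a different chunking (assumes the result fits in RAM
# of the local process; this is not the library's implementation).
import numpy as np
import dask.array as da

def rechunk_via_ram(xx, chunks):
    sink = np.empty(xx.shape, dtype=xx.dtype)   # single shared RAM sink
    da.store(xx, sink, lock=False)              # fully evaluate xx into the sink
    return da.from_array(sink, chunks=chunks)   # new chunking over the same buffer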
def apply_store(B, O, R, volumestokeep, reconstructed_array, outputimgdir, case_index): # creations of data for dask store function d_arrays, d_regions = compute_zones(B, O, R, volumestokeep) out_files = list() # to keep outfiles open during processing sources = list() targets = list() regions = list() for outfile_index in range(9): sliceslistoflist = d_arrays[outfile_index] # create file out_file = h5py.File('./' + str(outfile_index) + '.hdf5', 'w') out_files.append(out_file) # create dset dset = out_file.create_dataset('/data', shape=O) for i, st in enumerate(sliceslistoflist): tmp_array = reconstructed_array[st[0], st[1], st[2]] print("shape:", tmp_array.shape) reg = d_regions[outfile_index][i] tmp_array = tmp_array.rechunk(tmp_array.shape) sources.append(tmp_array) targets.append(dset) regions.append(reg) # storage: creation of task graph task = da.store(sources, targets, regions=regions, compute=False) filename = os.path.join(outputimgdir, 'after_store' + str(case_index) + '.png') task.visualize(optimize_graph=False, filename=filename) return task
def save_datasets(self, datasets, compute=True, **kwargs): """Save all datasets to one or more files. Subclasses can use this method to save all datasets to one single file or optimize the writing of individual datasets. By default this simply calls `save_dataset` for each dataset provided. Args: datasets (iterable): Iterable of `xarray.DataArray` objects to save using this writer. compute (bool): If `True` (default), compute all of the saves to disk. If `False` then the return value is either a `dask.delayed.Delayed` object or two lists to be passed to a `dask.array.store` call. See return values below for more details. **kwargs: Keyword arguments to pass to `save_dataset`. See that documentation for more details. Returns: Value returned depends on `compute` keyword argument. If `compute` is `True` the value is the result of a either a `dask.array.store` operation or a `dask.delayed.Delayed` compute, typically this is `None`. If `compute` is `False` then the result is either a `dask.delayed.Delayed` object that can be computed with `delayed.compute()` or a two element tuple of sources and targets to be passed to `dask.array.store`. If `targets` is provided then it is the caller's responsibility to close any objects that have a "close" method. """ sources = [] targets = [] for ds in datasets: res = self.save_dataset(ds, compute=False, **kwargs) if isinstance(res, tuple): # source, target to be passed to da.store sources.append(res[0]) targets.append(res[1]) else: # delayed object sources.append(res) # we have targets, we should save sources to targets if targets and compute: LOG.info("Computing and writing results...") res = da.store(sources, targets) for target in targets: if hasattr(target, 'close'): target.close() return res elif targets: return sources, targets delayed = dask.delayed(sources) if compute: LOG.info("Computing and writing results...") return delayed.compute() return delayed
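# Illustrative usage sketch of the compute=False contract described in the
# docstring above; `writer` and `datasets` are assumed to already exist.
sources, targets = writer.save_datasets(datasets, compute=False)
da.store(sources, targets)
for t in targets:
    if hasattr(t, "close"):
        t.close()  # caller is responsible for closing targets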
def work(self): import dask.array as da import numpy as np import h5py from luigi.file import atomic_file fs = [h5py.File(f.path, mode='r') for f in self.input()] # Verify all H5s have the same structure datasets, groups, samples = [[] for x in fs], [[] for x in fs ], [[] for x in fs] for i, f in enumerate(fs): f.visititems(lambda n, o: datasets[i].append(n) if isinstance( o, h5py.Dataset) else groups[i].append(n)) samples[i] = f['samples'][:] if not all([set(datasets[0]) == set(x) for x in datasets]) and np.all( samples == samples[0], axis=0): raise Exception( "All HDF5 files must have the same groups/datasets/samples!") datasets, groups, samples = datasets[0], groups[0], samples[0] # Drop Samples dataset and handle separately datasets = [x for x in datasets if x != 'samples'] combined = { d: da.concatenate([da.from_array(f[d], chunks=100000) for f in fs]) for d in datasets } shapes = [(np.sum([f.get(d).shape for f in fs], axis=0)[0], *fs[0].get(d).shape[1:]) for d in datasets] dtypes = [fs[0].get(d).dtype for d in datasets] # Handles Samples dataset datasets.append('samples') combined.update({'samples': da.from_array(fs[0]['samples'], chunks=1)}) shapes.append(samples.shape) dtypes.append(samples.dtype) af = atomic_file(self.output().path) fout = h5py.File(af.tmp_path, 'w') # Set up group structure for g in groups: fout.create_group(g) # Create the datasets out_datasets = {} for p, dtype, shape in zip(datasets, dtypes, shapes): g, d = os.path.split(p) out_datasets[p] = (fout[g] if g else fout).create_dataset( d, shape=shape, dtype=dtype, chunks=True, compression='gzip') for k in combined.keys(): s = da.store(combined[k], out_datasets[k], compute=False) s.compute(num_workers=self.n_cpu) print("Done " + k) af.move_to_final_destination()
def sync(self, compute=True):
    if self.sources:
        import dask.array as da
        delayed_store = da.store(self.sources, self.targets,
                                 lock=self.lock, compute=compute,
                                 flush=True)
        self.sources = []
        self.targets = []
        return delayed_store
def save_datasets(self, datasets, compute=True, **kwargs): """Save all datasets to one or more files. Subclasses can use this method to save all datasets to one single file or optimize the writing of individual datasets. By default this simply calls `save_dataset` for each dataset provided. Args: datasets (iterable): Iterable of `xarray.DataArray` objects to save using this writer. compute (bool): If `True` (default), compute all of the saves to disk. If `False` then the return value is either a `dask.delayed.Delayed` object or two lists to be passed to a `dask.array.store` call. See return values below for more details. **kwargs: Keyword arguments to pass to `save_dataset`. See that documentation for more details. Returns: Value returned depends on `compute` keyword argument. If `compute` is `True` the value is the result of a either a `dask.array.store` operation or a `dask.delayed.Delayed` compute, typically this is `None`. If `compute` is `False` then the result is either a `dask.delayed.Delayed` object that can be computed with `delayed.compute()` or a two element tuple of sources and targets to be passed to `dask.array.store`. If `targets` is provided then it is the caller's responsibility to close any objects that have a "close" method. """ sources = [] targets = [] for ds in datasets: res = self.save_dataset(ds, compute=False, **kwargs) if isinstance(res, tuple): # source, target to be passed to da.store sources.append(res[0]) targets.append(res[1]) else: # delayed object sources.append(res) # we have targets, we should save sources to targets if targets and compute: res = da.store(sources, targets) for target in targets: if hasattr(target, 'close'): target.close() return res elif targets: return sources, targets delayed = dask.delayed(sources) if compute: return delayed.compute() return delayed
def do(f):
    dataset = f[path]
    gpath = os.path.dirname('/' + path)
    g = f[gpath]
    d = da.from_array(dataset, chunks=_good_chunk(dataset))
    name = os.path.basename(dataset.name)
    tmp_name = name + '_tmp_cfm92askj3'
    if tmp_name in g:
        del g[tmp_name]
    tmp_d = g.create_dataset(tmp_name, shape=dataset.shape, dtype=dataset.dtype,
                             chunks=chunks, compression=compression)
    da.store(d, tmp_d)
    del g[name]
    g[name] = g[tmp_name]
    del g[tmp_name]
def test_simple_delayed_write(self): """Test writing can be delayed.""" import dask.array as da from satpy.writers.geotiff import GeoTIFFWriter datasets = self._get_test_datasets() w = GeoTIFFWriter(base_dir=self.base_dir) # when we switch to rio_save on XRImage then this will be sources # and targets res = w.save_datasets(datasets, compute=False) # this will fail if rasterio isn't installed self.assertIsInstance(res, tuple) # two lists, sources and destinations self.assertEqual(len(res), 2) self.assertIsInstance(res[0], list) self.assertIsInstance(res[1], list) self.assertIsInstance(res[0][0], da.Array) da.store(res[0], res[1]) for target in res[1]: if hasattr(target, 'close'): target.close()
def overwrite_dataset(group, data, key, signal_axes=None, chunks=None, **kwds): if chunks is None: if signal_axes is None: # Use automatic h5py chunking chunks = True else: # Optimise the chunking to contain at least one signal per chunk chunks = get_signal_chunks(data.shape, data.dtype, signal_axes) maxshape = tuple(None for _ in data.shape) got_data = False while not got_data: try: these_kwds = kwds.copy() these_kwds.update(dict(shape=data.shape, dtype=data.dtype, exact=True, maxshape=maxshape, chunks=chunks, shuffle=True,)) # If chunks is True, the `chunks` attribute of `dset` below # contains the chunk shape guessed by h5py dset = group.require_dataset(key, **these_kwds) got_data = True except TypeError: # if the shape or dtype/etc do not match, # we delete the old one and create new in the next loop run del group[key] if dset == data: # just a reference to already created thing pass else: _logger.info("Chunks used for saving: %s" % str(dset.chunks)) if isinstance(data, da.Array): da.store(data.rechunk(dset.chunks), dset) elif data.flags.c_contiguous: dset.write_direct(data) else: dset[:] = data
def sync(self, compute=True):
    if self.sources:
        import dask.array as da
        # TODO: consider wrapping targets with dask.delayed, if this makes
        # for any discernible difference in performance, e.g.,
        # targets = [dask.delayed(t) for t in self.targets]
        delayed_store = da.store(self.sources, self.targets,
                                 lock=self.lock, compute=compute,
                                 flush=True)
        self.sources = []
        self.targets = []
        return delayed_store
def load(s, measure, dset_name, transpose_lst, df_attr='demog_df'): ''' given measure, h5 dataset name, transpose list: load data ''' df = getattr(s, df_attr) if measure in dir(s): print(measure, 'already loaded') if df.shape[0] != getattr(s, measure).shape[0]: print('shape of loaded data does not match demogs, reloading') else: return np.array([]) dsets = [h5py.File(fn, 'r')[dset_name] for fn in df['path'].values] arrays = [da.from_array(dset, chunks=dset.shape) for dset in dsets] stack = da.stack(arrays, axis=-1) # concatenate along last axis stack = stack.transpose(transpose_lst) # do transposition data = np.empty(stack.shape) da.store(stack, data) print(data.shape) return data
def fft_to_hdf5(x, filename, axis=-1, chunksize=2**26, available_memory=(4 * 1024**3), cache=None): """Simple wrapper for DAFT FFT function that writes to HDF5 This function calls the DAFT function, but also performs the computation of the FFT, and outputs the result into the requested HDF5 file Parameters ---------- x : array_like Input array, can be complex. filename : string Relative or absolute path to HDF5 file. If this string contains a colon, the preceding part is taken as the filename, while the following part is taken as the dataset group name. The default group name is 'X'. axis : int, optional Axis over which to compute the FFT. If not given, the last axis is used. chunksize : int, optional Chunksize to use when splitting up the input array. Default is 2**24, which is about 64MB -- a reasonable target that reduces memory usage. available_memory : int, optional Maximum amount of RAM to use for caching during computation. Defaults to 4*1024**3, which is 4GB. """ from h5py import File from dask import set_options from dask.array import store if cache is None: from chest import Chest # For more flexible caching cache = Chest(available_memory=available_memory) if ':' in filename: filename, groupname = filename.split(':') else: groupname = 'X' X_dask = DAFT(x, axis=axis, chunksize=chunksize) with set_options(cache=cache): with File(filename, 'w') as f: output = f.create_dataset(groupname, shape=X_dask.shape, dtype=X_dask.dtype) store(X_dask, output) return
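# Illustrative usage sketch for the helper above: the part of the filename after
# the colon selects the HDF5 dataset name, as described in the docstring
# (the input data and output path are assumptions).
import numpy as np

x = np.random.randn(2**20)
fft_to_hdf5(x, "spectrum.h5:X_fft", axis=-1)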
def dask_detrend_data(data, output_arr): """ Detrend data using a linear fit. Parameters ---------- data: dask.array Input dataset to detrend. Assumes leading axis is sampling dimension. output_arr: ndarray-like Output array with same shape as data to store detrended data. Notes ----- This is a very expensive operation if using a large dataset. May slow down if forced to spill onto the disk cache It does not currently take into account X data. Instead, it creates a dummy array (using arange) for sampling points. """ dummy_time = np.arange(data.shape[0])[:, None] dummy_time = da.from_array(dummy_time, chunks=dummy_time.shape) # intercept handling x_offset = dummy_time.mean(axis=0) x_centered = dummy_time - x_offset y_offset = data.mean(axis=0) y_centered = data - y_offset coefs, resid, rank, s = da.linalg.lstsq(x_centered, y_centered) intercepts = y_offset - x_offset*coefs predict = da.dot(dummy_time, coefs) + intercepts detrended = data - predict da.store(detrended, output_arr) return output_arr
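# Illustrative usage sketch for the detrender above: the output buffer must be
# pre-allocated by the caller (the sizes and chunking here are assumptions).
data = da.random.random((1200, 5000), chunks=(1200, 500))
detrended = dask_detrend_data(data, np.empty(data.shape))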
def _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, blen, **kwargs): """Perform the evaluation in blocks.""" if not blen: # Compute the optimal block size (in elements) # The next is based on experiments with bench/ctable-query.py # and the 'movielens-bench' repository if vm == "numexpr": bsize = 2**23 elif vm == "dask": bsize = 2**25 else: # python bsize = 2**21 blen = int(bsize / typesize) # Protection against too large atomsizes if blen == 0: blen = 1 if vm == "dask": if 'da' in vars: raise NameError( "'da' is reserved as a prefix for dask.array. " "Please use another prefix") for name in vars: var = vars[name] if is_sequence_like(var): vars[name] = da.from_array(var, chunks=(blen,) + var.shape[1:]) # Build the expression graph vars['da'] = da da_expr = _eval(expression, vars) if out_flavor in ("bcolz", "carray") and da_expr.shape: result = bcolz.zeros(da_expr.shape, da_expr.dtype, **kwargs) # Store while compute expression graph da.store(da_expr, result) return result else: # Store while compute return np.array(da_expr) # Check whether we have a re_evaluate() function in numexpr re_evaluate = bcolz.numexpr_here and hasattr(bcolz.numexpr, "re_evaluate") vars_ = {} # Get containers for vars maxndims = 0 for name in vars: var = vars[name] if is_sequence_like(var): ndims = len(var.shape) + len(var.dtype.shape) if ndims > maxndims: maxndims = ndims if len(var) > blen and hasattr(var, "_getrange"): shape = (blen, ) + var.shape[1:] vars_[name] = np.empty(shape, dtype=var.dtype) for i in xrange(0, vlen, blen): # Fill buffers for vars for name in vars: var = vars[name] if is_sequence_like(var) and len(var) > blen: if hasattr(var, "_getrange"): if i+blen < vlen: var._getrange(i, blen, vars_[name]) else: vars_[name] = var[i:] else: vars_[name] = var[i:i+blen] else: if hasattr(var, "__getitem__"): vars_[name] = var[:] else: vars_[name] = var # Perform the evaluation for this block if vm == "python": res_block = _eval(expression, vars_) else: if i == 0 or not re_evaluate: try: res_block = bcolz.numexpr.evaluate(expression, local_dict=vars_) except ValueError: # numexpr cannot handle this, so fall back to "python" vm warnings.warn( "numexpr cannot handle this expression: falling back " "to the 'python' virtual machine. You can choose " "another virtual machine by using the `vm` parameter.") return _eval_blocks( expression, vars, vlen, typesize, "python", out_flavor, blen, **kwargs) else: res_block = bcolz.numexpr.re_evaluate(local_dict=vars_) if i == 0: # Detection of reduction operations scalar = False dim_reduction = False if len(res_block.shape) == 0: scalar = True result = res_block continue elif len(res_block.shape) < maxndims: dim_reduction = True result = res_block continue # Get a decent default for expectedlen if out_flavor in ("bcolz", "carray"): nrows = kwargs.pop('expectedlen', vlen) result = bcolz.carray(res_block, expectedlen=nrows, **kwargs) else: out_shape = list(res_block.shape) out_shape[0] = vlen result = np.empty(out_shape, dtype=res_block.dtype) result[:blen] = res_block else: if scalar or dim_reduction: result += res_block elif out_flavor in ("bcolz", "carray"): result.append(res_block) else: result[i:i+blen] = res_block if isinstance(result, bcolz.carray): result.flush() if scalar: return result[()] return result
def calc_anomaly(data, yrsize, climo=None, output_arr=None):
    """
    Calculate the anomaly for the given data.  Right now it assumes sub-annual
    data input so that the climatology subtracts means for each month instead
    of the mean of the entire series.

    Note: May take the yrsize argument out and leave it to the user to format
    the data so as to take the desired anomaly.

    Parameters
    ----------
    data: ndarray
        Input data to calculate the anomaly from.  Leading dimension should be
        the temporal axis.
    yrsize: int
        Number of elements that compose a full year.  Used to reshape the data
        time axis to num years x size year for climatology purposes.
    climo: ndarray, optional
        User-provided climatology to subtract from the data.  Must be
        broadcastable over the time-dimension of data.
    output_arr: ndarray-like, optional
        Array to place output of anomaly calculation that supports
        ndarray-like slicing.  This is required for dask array input.

    Returns
    -------
    anomaly: ndarray-like
        Data converted to its anomaly form.
    climo: ndarray
        The calculated climatology that was subtracted from the data.
    """
    yrsize = int(yrsize)
    if not yrsize >= 1:
        raise ValueError('yrsize must be an integer >= 1')

    # Reshape to take monthly mean
    old_shp = data.shape
    new_shp = (old_shp[0]//yrsize, yrsize, old_shp[1])
    data = data.reshape(new_shp)

    # Use of data[:] should work for ndarray or ndarray-like
    if climo is None:
        climo = data.mean(axis=0, keepdims=True)

    if is_dask_array(data):
        if output_arr is None:
            raise ValueError('calc_anomaly requires an output array keyword '
                             'argument when operating on a Dask array.')

        anomaly = data - climo
        old_shp_anom = anomaly.reshape(old_shp)
        da.store(old_shp_anom, output_arr)
        out_climo = climo.compute()
    else:
        if output_arr is not None:
            output_arr[:] = np.squeeze(ne.evaluate('data - climo'))
        else:
            output_arr = np.squeeze(ne.evaluate('data - climo'))
        output_arr = output_arr.reshape(old_shp)
        out_climo = climo

    return output_arr, out_climo
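# Illustrative usage sketch for the dask path above: monthly data with the
# required pre-allocated output buffer (sizes and chunking are assumptions).
data = da.random.random((1200, 5000), chunks=(120, 5000))  # 100 years, monthly
out = np.empty(data.shape)
anomalies, climo = calc_anomaly(data, yrsize=12, output_arr=out)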
def run_mean(data, window_size, trim_edge=None, output_arr=None):
    """
    A function for calculating the running mean on data.

    Parameters
    ----------
    data: ndarray
        Data matrix to perform the running mean over.  Expected to be in
        time(row) x space(column) format, with samples spanning full years.
    window_size: int
        Size of the window to compute the running mean over.
    trim_edge: int, optional
        Remove specified items from the start and end of the sampling
        dimension of the running mean.  Otherwise the window_size/2 items at
        the start and the end will have reflected padding effects.
    output_arr: ndarray-like, optional
        Array to place output of the running mean that supports ndarray-like
        slicing.  This is required for dask array input.

    Returns
    -------
    result: ndarray
        Running mean result of given data.
    """
    sample_len = data.shape[0]

    if sample_len < window_size:
        raise ValueError("Window size must be smaller than or equal to the "
                         "length of the time dimension of the data.")

    if trim_edge is not None:
        sample_len -= trim_edge*2
        if sample_len < 1:
            raise ValueError('Not enough data to trim edges. Please try with '
                             'trim_edge=None')

    weights = [1.0/float(window_size) for _ in range(window_size)]

    if is_dask_array(data):
        if output_arr is None:
            raise ValueError('run_mean requires an output array keyword '
                             'argument when operating on a Dask array.')

        def _run_mean_block(block):
            return convolve1d(block, weights, axis=0)

        pad = window_size // 2
        ghost = da.ghost.ghost(data, depth={0: pad}, boundary={0: 'reflect'})
        filt = ghost.map_blocks(_run_mean_block)
        unpadded = da.ghost.trim_internal(filt, {0: pad})
        if trim_edge is not None:
            unpadded = unpadded[trim_edge:-trim_edge]

        da.store(unpadded, output_arr)
    else:
        res = convolve1d(data, weights, axis=0)
        if trim_edge:
            res = res[trim_edge:-trim_edge]

        if output_arr is not None:
            output_arr[:] = res
        else:
            output_arr = res

    return output_arr
def dataarray_to_swath_product(ds, swath_def, overwrite_existing=False): info = ds.attrs.copy() info.pop("area") if ds.ndim == 3: # RGB composite if ds.shape[0] in [3, 4]: channels = ds.shape[0] else: # unpreferred array orientation channels = ds.shape[-1] ds = np.rollaxis(ds, 2) else: channels = 1 if ds.ndim == 1: rows, cols = ds.shape[0], 1 else: rows, cols = ds.shape[-2:] if np.issubdtype(np.dtype(ds.dtype), np.floating): dtype = np.float32 else: dtype = ds.dtype if isinstance(info["sensor"], bytes): info["sensor"] = info["sensor"].decode("utf-8") p2g_metadata = { "product_name": info["name"], "satellite": info["platform_name"].lower(), "instrument": info["sensor"].lower() if isinstance(info["sensor"], str) else list(info["sensor"])[0].lower(), "data_kind": info["standard_name"], "begin_time": info["start_time"], "end_time": info["end_time"], "fill_value": np.nan, "swath_columns": cols, "swath_rows": rows, "rows_per_scan": info.get("rows_per_scan", rows), "data_type": dtype, "swath_definition": swath_def, "channels": channels, } info.update(p2g_metadata) if channels == 1: filename = info["name"] + ".dat" info["swath_data"] = filename if os.path.isfile(filename): if not overwrite_existing: LOG.error("Binary file already exists: %s" % (filename,)) raise RuntimeError("Binary file already exists: %s" % (filename,)) else: LOG.warning("Binary file already exists, will overwrite: %s", filename) LOG.info("Writing band data to disk cache...") p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape) ds = ds.where(ds.notnull(), np.nan) da.store(ds.data.astype(dtype), p2g_arr) yield containers.SwathProduct(**info) else: for chn_idx in range(channels): tmp_info = info.copy() tmp_info["product_name"] = info["product_name"] + "_rgb_{:d}".format(chn_idx) filename = tmp_info["product_name"] + ".dat" tmp_info["swath_data"] = filename if os.path.isfile(filename): if not overwrite_existing: LOG.error("Binary file already exists: %s" % (filename,)) raise RuntimeError("Binary file already exists: %s" % (filename,)) else: LOG.warning("Binary file already exists, will overwrite: %s", filename) LOG.info("Writing band data to disk cache...") p2g_arr = np.memmap(filename, mode="w+", dtype=dtype, shape=ds.shape[-2:]) da.store(ds.data[chn_idx].astype(dtype), p2g_arr) yield containers.SwathProduct(**tmp_info)
def rio_save(self, filename, fformat=None, fill_value=None, dtype=np.uint8, compute=True, tags=None, keep_palette=False, cmap=None, **format_kwargs): """Save the image using rasterio. Overviews can be added to the file using the `overviews` kwarg, eg:: img.rio_save('myfile.tif', overviews=[2, 4, 8, 16]) """ fformat = fformat or os.path.splitext(filename)[1][1:4] drivers = {'jpg': 'JPEG', 'png': 'PNG', 'tif': 'GTiff', 'jp2': 'JP2OpenJPEG'} driver = drivers.get(fformat, fformat) if tags is None: tags = {} data, mode = self.finalize(fill_value, dtype=dtype, keep_palette=keep_palette, cmap=cmap) data = data.transpose('bands', 'y', 'x') data.attrs = self.data.attrs crs = None gcps = None transform = None if driver in ['GTiff', 'JP2OpenJPEG']: if not np.issubdtype(data.dtype, np.floating): format_kwargs.setdefault('compress', 'DEFLATE') photometric_map = { 'RGB': 'RGB', 'RGBA': 'RGB', 'CMYK': 'CMYK', 'CMYKA': 'CMYK', 'YCBCR': 'YCBCR', 'YCBCRA': 'YCBCR', } if mode.upper() in photometric_map: format_kwargs.setdefault('photometric', photometric_map[mode.upper()]) try: crs = rasterio.crs.CRS(data.attrs['area'].proj_dict) west, south, east, north = data.attrs['area'].area_extent height, width = data.sizes['y'], data.sizes['x'] transform = rasterio.transform.from_bounds(west, south, east, north, width, height) except KeyError: # No area logger.info("Couldn't create geotransform") except AttributeError: try: gcps = data.attrs['area'].lons.attrs['gcps'] crs = data.attrs['area'].lons.attrs['crs'] except KeyError: logger.info("Couldn't create geotransform") if "start_time" in data.attrs: stime = data.attrs['start_time'] stime_str = stime.strftime("%Y:%m:%d %H:%M:%S") tags.setdefault('TIFFTAG_DATETIME', stime_str) elif driver == 'JPEG' and 'A' in mode: raise ValueError('JPEG does not support alpha') # FIXME add metadata r_file = RIOFile(filename, 'w', driver=driver, width=data.sizes['x'], height=data.sizes['y'], count=data.sizes['bands'], dtype=dtype, nodata=fill_value, crs=crs, transform=transform, gcps=gcps, **format_kwargs) r_file.open() if not keep_palette: r_file.colorinterp = color_interp(data) r_file.rfile.update_tags(**tags) if keep_palette and cmap is not None: if data.dtype != 'uint8': raise ValueError('Rasterio only supports 8-bit colormaps') try: from trollimage.colormap import Colormap cmap = cmap.to_rio() if isinstance(cmap, Colormap) else cmap r_file.rfile.write_colormap(1, cmap) except AttributeError: raise ValueError("Colormap is not formatted correctly") if compute: # write data to the file now res = da.store(data.data, r_file) r_file.close() return res # provide the data object and the opened file so the caller can # store them when they would like. Caller is responsible for # closing the file return data.data, r_file
def calc_eofs(data, num_eigs, ret_pcs=False, var_stats_dict=None):
    """
    Method to calculate the EOFs of a given dataset.  This assumes data comes
    in as an m x n matrix where m is the temporal dimension and n is the
    spatial dimension.

    Parameters
    ----------
    data: ndarray
        Dataset to calculate EOFs from
    num_eigs: int
        Number of eigenvalues/vectors to return.  Must be less than min(m, n).
    ret_pcs: bool, optional
        Return principal component matrix along with EOFs
    var_stats_dict: dict, optional
        Dictionary target to store some simple statistics about the EOF
        calculation.  Note: if this is provided for a dask array it prompts
        two SVD calculations for both the compressed and full singular values.

    Returns
    -------
    eofs: ndarray
        The eofs (as column vectors) of the data with dimensions n x k where
        k is the num_eigs.
    svals: ndarray
        Singular values from the svd decomposition.  Returned as a row vector
        in order from largest to smallest.
    """
    if is_dask_array(data):
        pcs, full_svals, eofs = da.linalg.svd_compressed(data, num_eigs)

        out_svals = np.zeros(num_eigs)
        out_eofs = np.zeros((num_eigs, data.shape[1]))
        out_pcs = np.zeros((data.shape[0], num_eigs))
        da.store([eofs, full_svals, pcs], [out_eofs, out_svals, out_pcs])
        out_eofs = out_eofs.T
        out_pcs = out_pcs.T

        if var_stats_dict is not None:
            logger.warning('Cannot currently provide variance statistics for '
                           'EOFs computed on a dask array.')
    else:
        eofs, full_svals, pcs = svd(data[:].T, full_matrices=False)
        out_eofs = eofs[:, :num_eigs]
        out_svals = full_svals[:num_eigs]
        out_pcs = pcs[:num_eigs]

        # variance stats
        if var_stats_dict is not None:
            try:
                nt = data.shape[0]
                ns = data.shape[1]
                eig_vals = (full_svals ** 2) / (nt * ns)
                total_var = eig_vals.sum()
                var_expl_by_mode = eig_vals / total_var
                var_expl_by_retained = var_expl_by_mode[0:num_eigs].sum()

                var_stats_dict['nt'] = nt
                var_stats_dict['ns'] = ns
                var_stats_dict['eigvals'] = eig_vals
                var_stats_dict['num_ret_modes'] = num_eigs
                var_stats_dict['total_var'] = total_var
                var_stats_dict['var_expl_by_mode'] = var_expl_by_mode
                var_stats_dict['var_expl_by_ret'] = var_expl_by_retained
            except TypeError as e:
                print('Must pass dictionary type to var_stats_dict in order '
                      'to output variance statistics.')
                print(e)

    if ret_pcs:
        return out_eofs, out_svals, out_pcs
    else:
        return out_eofs, out_svals
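# Illustrative usage sketch for the dask path above (the anomaly matrix and its
# chunking are assumptions):
anoms = da.random.random((1200, 5000), chunks=(1200, 500))
eofs, svals, pcs = calc_eofs(anoms, num_eigs=10, ret_pcs=True)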
def valid_images_to_hdf5(directory, width=224, height=224, channels=3): ''' Function to build needed arrays for training or validating the neural network using out of core processing. If labels are passed, get a list of training image files, their labels ''' validationList, _ = get_list_of_validation_files(directory) # Pass directory containing validation images print('Creating the hdf5 file...') len_array = len(validationList) with h5py.File('validation_files.h5', 'w') as hf: dset = hf.create_dataset('validation_array', (len_array, channels, width, height), chunks=True) img_names = hf.create_dataset('image_names', (len_array,), chunks=True, dtype='S40') with h5py.File('validation_files.h5', 'r+') as hf: x = hf['validation_array'] X = da.from_array(x, chunks=1000) image_names = list(hf['image_names']) print('There are ', len(validationList), ' files in the validation list.') print('Breaking the validation list into chunks of 10,000...') chunkedList = get_chunks(validationList, 10000) # Break the list of files in to chunks of 10000 if channels == 3: for i, chunk in enumerate(chunkedList): # print(chunk) count = i + len(chunk[i][:])*i # Set counter for empty array # valid_sublist = chunk[i][:] print('Create empty list to store image names..') filenames = [] print('Creating an empty array to store images...') X = create_holding_array(chunk, width = width, height=height, channels=channels) # Create empty array for j, validFile in enumerate(chunk): print('Reading file #: ', j) filenames.append(os.path.basename(validFile)) # print(chunk) # input('') img = misc.imread(validFile) # Read the image img = misc.imresize(img, size = (width, height, channels)) # Resize image with color channel = 3 # img = np.transpose(img, (2,0,1)) # Store resized image in empty array X[j] = img asciiList = [] asciiList = [n.encode("ascii", "ignore") for n in filenames] X1 = np.transpose(X, (0, 3, 1, 2)) del X, filenames print(X1.shape) X_da = da.from_array(X1, chunks=1000) print('Opening validation_files.h5...') with h5py.File('validation_files.h5', 'r+') as hf: print('Putting validation_array in x...') x = hf['validation_array'] print('Putting validation_array in dask array...') dset = da.from_array(x, chunks=1000) print('Concatenating the two dask arrays...') X2 = da.concatenate([dset, X_da], axis=0) print('Storing the dask array in the hdf5 file...') da.store(X2, x) print('Put image_names dset into a list...') image_names = list(hf['image_names']) print('Extend the list with additional image names...') image_names.extend(asciiList) print('Done.') return filenames else: # If number of channels != 1 or != 3 print('Could not create dataset and resize training images...')
def to_geotiff(arr, path='./output.tif', proj=None, spec=None, bands=None, **kwargs): ''' Write out a geotiff file of the image Args: path (str): path to write the geotiff file to, default is ./output.tif proj (str): EPSG string of projection to reproject to spec (str): if set to 'rgb', write out color-balanced 8-bit RGB tif bands (list): list of bands to export. If spec='rgb' will default to RGB bands Returns: str: path the geotiff was written to''' assert has_rasterio, "To create geotiff images please install rasterio" try: img_md = arr.rda.metadata["image"] x_size = img_md["tileXSize"] y_size = img_md["tileYSize"] except (AttributeError, KeyError): x_size = kwargs.get("chunk_size", 256) y_size = kwargs.get("chunk_size", 256) try: tfm = kwargs['transform'] if 'transform' in kwargs else arr.affine except: tfm = None dtype = arr.dtype.name if arr.dtype.name != 'int8' else 'uint8' if spec is not None and spec.lower() == 'rgb': if bands is None: bands = arr._rgb_bands # skip if already DRA'ed if not arr.options.get('dra'): # add the RDA HistogramDRA op to get a RGB 8-bit image from gbdxtools.rda.interface import RDA rda = RDA() dra = rda.HistogramDRA(arr) # Reset the bounds and select the bands on the new Dask arr = dra.aoi(bbox=arr.bounds) arr = arr[bands,...].astype(np.uint8) dtype = 'uint8' else: if bands is not None: arr = arr[bands,...] meta = { 'width': arr.shape[2], 'height': arr.shape[1], 'count': arr.shape[0], 'dtype': dtype, 'driver': 'GTiff', 'transform': tfm } if proj is not None: meta["crs"] = {'init': proj} if "tiled" in kwargs and kwargs["tiled"]: meta.update(blockxsize=x_size, blockysize=y_size, tiled="yes") with rasterio.open(path, "w", **meta) as dst: writer = rio_writer(dst) result = store(arr, writer, compute=False) result.compute(scheduler=threaded_get) return path
def sync(self):
    if self.sources:
        import dask.array as da
        da.store(self.sources, self.targets)
        self.sources = []
        self.targets = []