def rechunk_zarr(arr, target_chunks, target_store,
                 intermediate_store='temp/intermediate_ld_rechunk.zarr', **kwargs):

    if os.path.isdir(target_store):
        try:
            z = zarr.open(target_store)
            z.store.rmdir()
        except Exception as e:
            raise e

    rechunked = rechunk(arr,
                        target_chunks=target_chunks,
                        target_store=target_store,
                        temp_store=intermediate_store,
                        max_mem=psutil.virtual_memory().available / psutil.cpu_count(),
                        **kwargs)

    try:
        rechunked.execute()
        # Delete the older stores:
        zarr.open(intermediate_store).store.rmdir()
        arr.store.rmdir()
    except Exception as e:
        raise e

    return zarr.open(target_store)
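# Minimal usage sketch for rechunk_zarr above. The source array, target chunk
# shape, and store paths are hypothetical; it assumes os, zarr, psutil and
# rechunker.rechunk are imported in this module. Note that rechunk_zarr deletes
# the source and intermediate stores once the rechunk succeeds.
import zarr

src = zarr.open("data/ld_matrix.zarr")                # assumed existing source array
rechunked = rechunk_zarr(
    src,
    target_chunks=(1000, 1000),                       # illustrative target chunking
    target_store="data/ld_matrix_rechunked.zarr",     # illustrative output store
)
print(rechunked.chunks)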
            fs.rm(url, recursive=True)

elif command == "create":
    # create a random Zarr array for the source
    # dask_chunks determine the unit of work - they are bigger than the chunks
    # of the zarr file we are writing to
    dask_chunks = (800, 4000)
    zarr_chunks = source_chunks
    executor = PywrenExecutor(pywren.local_executor())
    arr = da.random.random(size=shape, chunks=dask_chunks)
    itemsize = arr.dtype.itemsize
    max_mem = str(itemsize * prod(dask_chunks))
    plan = rechunk(arr, zarr_chunks, max_mem, source_store)
    plan.execute(scheduler=executor_scheduler, executor=executor, batch_size=100)

elif command == "rechunk":
    # rechunk the source to the target
    max_mem = 25_600_000
    executor = PywrenExecutor(pywren.local_executor())
    source_array = zarr.open_array(source_store, mode="r")
    plan = rechunk(source_array,
def test_rechunker_local():
    source_url = "data/source.zarr"
    target_url = "data/target.zarr"
    temp_url = "data/temp.zarr"

    # delete
    for url in (source_url, target_url, temp_url):
        delete(url)

    source_store = fsspec.get_mapper(source_url)
    target_store = fsspec.get_mapper(target_url)
    temp_store = fsspec.get_mapper(temp_url)

    shape = (4000, 4000)
    source_chunks = (400, 4000)
    target_chunks = (4000, 400)

    executor = PywrenExecutor(pywren.local_executor())

    # create
    # dask_chunks determine the unit of work - they are bigger than the chunks
    # of the zarr file we are writing to
    dask_chunks = (800, 4000)
    zarr_chunks = source_chunks
    arr = da.random.random(size=shape, chunks=dask_chunks)
    itemsize = arr.dtype.itemsize
    max_mem = str(itemsize * prod(dask_chunks))
    plan = rechunk(arr, zarr_chunks, max_mem, source_store)
    plan.execute(scheduler=executor_scheduler, executor=executor, batch_size=100)

    z = zarr.open_array(source_store, mode="r")
    assert z.shape == shape
    assert z.chunks == source_chunks
    assert z.nchunks == 10
    assert z.dtype == float

    # rechunk
    max_mem = 25_600_000
    source_array = zarr.open_array(source_store, mode="r")
    plan = rechunk(source_array, target_chunks, max_mem, target_store, temp_store=temp_store)
    plan.execute(scheduler=executor_scheduler, executor=executor, batch_size=100)

    z = zarr.open_array(target_store, mode="r")
    assert z.shape == shape
    assert z.chunks == target_chunks
    assert z.nchunks == 10
    assert z.dtype == float
def transpose_rechunk(
    ds,
    chunks,
    iters=None,
    face=None,
    subsampling=None,
    name=None,
    out_dir=work_data_dir + "rechunked/",
    overwrite=True,
    max_mem="25GB",
    verbose=0,
    debug=False,
):
    """Rechunk a data array

    cluster = PBSCluster(cores=2, processes=1, walltime='03:00:00')

    Parameters:
        ds: xarray.DataArray
            input data array
        chunks: tuple
            (Nt, Nj, Ni)
        iters: xarray.DataArray
            mitgcm iterations to consider
        face: int, optional
            face to consider
        out_dir: str, optional
            output path
        max_mem: str, optional
            rechunker parameter
        verbose:
            turn on/off verbose
    """
    if name is None:
        vnames = list(ds)
        assert len(vnames) == 1, "You should have only one variable in the xr dataset"
        v = vnames[0]
    else:
        v = name

    if face is not None:
        ds = ds.sel(face=face)
        suff = "_f{:02d}.zarr".format(int(face))
        print(" face={}".format(int(face)))
    else:
        suff = ".zarr"

    # rechunker outputs
    target_store = out_dir + v + suff
    temp_store = out_dir + "tmp.zarr"

    # clean archives if necessary
    rmtree(temp_store, ignore_errors=True)
    if os.path.isdir(target_store):
        if overwrite:
            rmtree(target_store)
        else:
            print("Do not overwrite")
            # assert False, 'Archive exists and you do not want to overwrite'
            return

    # select common time line
    if iters is not None:
        t0 = ds["time"].where(ds.iters == iters[0], drop=True).values[0]
        t1 = ds["time"].where(ds.iters == iters[-1], drop=True).values[0]
        ds = ds.sel(time=slice(t0, t1))
        ds["dtime"] = ds["dtime"].compute()
        ds["iters"] = ds["iters"].compute()

    if subsampling is not None:
        i_dim, j_dim = get_ij_dims(ds[v])
        ds = ds.isel(
            **{
                i_dim: slice(0, None, subsampling),
                j_dim: slice(0, None, subsampling),
            }
        )

    # deal with the time dimension
    Nt = len(ds.time) - 1 if chunks[0] == 0 else chunks[0]
    chunks = (Nt, chunks[1], chunks[2])
    # -1 is to obtain 8784 which you can divide by 4**2
    # necessary ? yes
    ds = ds.isel(time=slice(len(ds.time) // Nt * Nt))

    # init rechunker
    target_chunks = get_chunks(chunks, v, 1, verbose=verbose)
    r = rechunk(ds, target_chunks, max_mem, target_store, temp_store=temp_store)
    if verbose > 0:
        print_rechunk(r, v)

    # exec
    if debug:
        return r
    result = r.execute()

    # clean up intermediate file
    rmtree(temp_store, ignore_errors=True)
    print(" rechunking over")
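# Hypothetical usage sketch for transpose_rechunk above: rechunk one face of a
# single-variable MITgcm dataset so that time becomes the contiguous dimension.
# The dataset path, chunk sizes, and face number are assumptions.
import xarray as xr

ds = xr.open_zarr("llc4320/SSU.zarr")      # assumed dataset with one variable and a 'face' dim
transpose_rechunk(
    ds,
    chunks=(0, 480, 480),                  # 0 -> use (nearly) the full time dimension per chunk
    face=2,                                # process a single face
    max_mem="25GB",
    verbose=1,
)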
def save_minian(
    var: xr.DataArray,
    dpath: str,
    meta_dict: Optional[dict] = None,
    overwrite=False,
    chunks: Optional[dict] = None,
    compute=True,
    mem_limit="500MB",
) -> xr.DataArray:
    """
    Save a `xr.DataArray` with `zarr` storage backend following minian
    conventions.

    This function will store arbitrary `xr.DataArray` into `dpath` with `zarr`
    backend. A separate folder will be created under `dpath`, with folder name
    `var.name + ".zarr"`. Optionally metadata can be retrieved from directory
    hierarchy and added as coordinates of the `xr.DataArray`. In addition, an
    on-disk rechunking of the result can be performed using
    :func:`rechunker.rechunk` if `chunks` are given.

    Parameters
    ----------
    var : xr.DataArray
        The array to be saved.
    dpath : str
        The path to the minian dataset directory.
    meta_dict : dict, optional
        How metadata should be retrieved from directory hierarchy. The keys
        should be negative integers representing directory level relative to
        `dpath` (so `-1` means the immediate parent directory of `dpath`), and
        values should be the name of dimensions represented by the
        corresponding level of directory. The actual coordinate value of the
        dimensions will be the directory name of corresponding level. By
        default `None`.
    overwrite : bool, optional
        Whether to overwrite the result on disk. By default `False`.
    chunks : dict, optional
        A dictionary specifying the desired chunk size. The chunk size should
        be specified using :doc:`dask:array-chunks` convention, except the
        "auto" specification is not supported. The rechunking operation will
        be carried out with on-disk algorithms using
        :func:`rechunker.rechunk`. By default `None`.
    compute : bool, optional
        Whether to compute `var` and save it immediately. By default `True`.
    mem_limit : str, optional
        The memory limit for the on-disk rechunking algorithm, passed to
        :func:`rechunker.rechunk`. Only used if `chunks` is not `None`. By
        default `"500MB"`.

    Returns
    -------
    var : xr.DataArray
        The array representation of saving result. If `compute` is `True`,
        then the returned array will only contain delayed task of loading the
        on-disk `zarr` arrays. Otherwise all computation leading to the input
        `var` will be preserved in the result.

    Examples
    --------
    The following will save the variable `var` to directory
    `/spatial_memory/alpha/learning1/minian/important_array.zarr`, with the
    additional coordinates: `{"session": "learning1", "animal": "alpha",
    "experiment": "spatial_memory"}`.

    >>> save_minian(
    ...     var.rename("important_array"),
    ...     "/spatial_memory/alpha/learning1/minian",
    ...     {-1: "session", -2: "animal", -3: "experiment"},
    ... )  # doctest: +SKIP
    """
    dpath = os.path.normpath(dpath)
    Path(dpath).mkdir(parents=True, exist_ok=True)
    ds = var.to_dataset()
    if meta_dict is not None:
        pathlist = os.path.split(os.path.abspath(dpath))[0].split(os.sep)
        ds = ds.assign_coords(
            **dict([(dn, pathlist[di]) for dn, di in meta_dict.items()])
        )
    md = {True: "a", False: "w-"}[overwrite]
    fp = os.path.join(dpath, var.name + ".zarr")
    if overwrite:
        try:
            shutil.rmtree(fp)
        except FileNotFoundError:
            pass
    arr = ds.to_zarr(fp, compute=compute, mode=md)
    if (chunks is not None) and compute:
        chunks = {d: var.sizes[d] if v <= 0 else v for d, v in chunks.items()}
        dst_path = os.path.join(dpath, str(uuid4()))
        temp_path = os.path.join(dpath, str(uuid4()))
        with da.config.set(
            array_optimize=darr.optimization.optimize,
            delayed_optimize=default_delay_optimize,
        ):
            zstore = zr.open(fp)
            rechk = rechunker.rechunk(
                zstore[var.name], chunks, mem_limit, dst_path, temp_store=temp_path
            )
            rechk.execute()
        try:
            shutil.rmtree(temp_path)
        except FileNotFoundError:
            pass
        arr_path = os.path.join(fp, var.name)
        for f in os.listdir(arr_path):
            os.remove(os.path.join(arr_path, f))
        for f in os.listdir(dst_path):
            os.rename(os.path.join(dst_path, f), os.path.join(arr_path, f))
        os.rmdir(dst_path)
    if compute:
        arr = xr.open_zarr(fp)[var.name]
        arr.data = darr.from_zarr(os.path.join(fp, var.name), inline_array=True)
    return arr
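# Hedged sketch of the rechunking path in save_minian above: passing `chunks`
# triggers the on-disk rechunker.rechunk step. The array name, path, and chunk
# sizes are illustrative assumptions, not minian defaults.
Y_saved = save_minian(
    Y.rename("Y_fm_chk"),                  # `Y` is an assumed xr.DataArray with frame/height/width dims
    dpath="./minian_intermediate",
    overwrite=True,
    chunks={"frame": 1000, "height": -1, "width": -1},  # values <= 0 expand to the full dimension size
    compute=True,
    mem_limit="2GB",
)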
def rechunk_zarr_array_with_caching(
    zarr_array: xr.Dataset,
    chunking_approach: Optional[str] = None,
    template_chunk_array: Optional[xr.Dataset] = None,
    output_path: Optional[str] = None,
    max_mem: str = "200MB",
    overwrite: bool = False,
) -> xr.Dataset:
    """Use `rechunker` package to adjust chunks of dataset to a form
    conducive for your processing.

    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    output_path : str
        Path to where the output data is saved. If the output path is not
        empty, the content will be loaded and the schema checked. If the
        schema check passes, the content will be returned without rechunking
        again (i.e. caching); otherwise, the content can be overwritten (see
        the overwrite option).
    chunking_approach : str
        Has to be one of `full_space` or `full_time`. If `full_space`, the
        data will be rechunked such that the space dimensions are contiguous
        (i.e. each chunk will contain full maps). If `full_time`, the data
        will be rechunked such that the time dimension is contiguous (i.e.
        each chunk will contain full time series).
    max_mem : str
        The max memory you want to allow for a chunk. Probably want it to be
        around 100 MB, but that is also controlled by the
        `calc_auspicious_chunk_sizes` calls.
    overwrite : bool
        Whether to overwrite the content saved at output_path if the content
        did not pass the schema check.

    Returns
    -------
    rechunked_ds : xr.Dataset
        Rechunked dataset
    """
    # determine the chunking schema
    if template_chunk_array is None:
        if chunking_approach == 'full_space':
            # if we need full maps, chunk along the time dimension
            chunk_dims = ('time',)
        elif chunking_approach == 'full_time':
            # if we need full time series, chunk along the lat/lon dimensions
            chunk_dims = ('lat', 'lon')
        else:
            raise NotImplementedError(
                "chunking_approach must be in ['full_space', 'full_time']"
            )
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = calc_auspicious_chunks_dict(zarr_array[example_var], chunk_dims=chunk_dims)
    else:
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = {
            'time': min(template_chunk_array.chunks['time'][0], len(zarr_array.time)),
            'lat': min(template_chunk_array.chunks['lat'][0], len(zarr_array.lat)),
            'lon': min(template_chunk_array.chunks['lon'][0], len(zarr_array.lon)),
        }
    chunks_dict = {
        'time': None,  # write None here because you don't want to rechunk this array
        'lon': None,
        'lat': None,
    }
    for var in zarr_array.data_vars:
        chunks_dict[var] = chunk_def

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    schema_dict = {}
    for var in zarr_array.data_vars:
        schema_dict[var] = DataArraySchema(chunks=chunk_def)
    target_schema = DatasetSchema(schema_dict)

    # make storage patterns
    if output_path is not None:
        output_path = config.get('storage.intermediate.uri') + '/' + output_path
    temp_store, target_store, target_path = make_rechunker_stores(output_path)
    print(f'target path is {target_path}')

    # check and see if the output is empty; if there is content, check that it's chunked correctly
    if len(target_store) > 0:
        print('checking the cache')
        output = xr.open_zarr(target_store)
        try:
            # if the content in target path is correctly chunked, return
            target_schema.validate(output)
            return output
        except SchemaError:
            if overwrite:
                target_store.clear()
            else:
                raise NotImplementedError(
                    'The content in the output path is incorrectly chunked, but overwrite is disabled. '
                    'Either clear the output or enable overwrite by setting overwrite=True'
                )

    # process the input zarr array
    delete_chunks_encoding(zarr_array)
    try:
        print('checking the chunk')
        # now check if the input is already correctly chunked. If so, save to the output location and return
        target_schema.validate(zarr_array)
        zarr_array.to_zarr(target_store, mode='w', consolidated=True)
        return zarr_array
    except SchemaError:
        print('rechunking')
        try:
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                'WARNING: Failed to write zarr store, perhaps because of variable chunk sizes, trying to rechunk it'
            )
            # clearing the store because the target store has already been created in the try statement above
            # and rechunker fails if there's already content at the target
            target_store.clear()
            zarr_array = zarr_array.chunk(chunks_dict[example_var])
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        rechunked_ds = xr.open_zarr(target_store)
        # ideally we want consolidated=True but it seems that functionality isn't offered in rechunker right now
        # we can just add a consolidate_metadata step here to do it after the fact (once rechunker is done) but only
        # necessary if we'll reopen this rechunked_ds multiple times
        return rechunked_ds
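# Illustrative call of rechunk_zarr_array_with_caching above: rechunk a dataset
# so each chunk holds a full time series, caching the result at output_path.
# The input store and output path are assumptions.
import xarray as xr

ds = xr.open_zarr("obs/era5_tasmax.zarr")
rechunked = rechunk_zarr_array_with_caching(
    ds,
    chunking_approach="full_time",             # contiguous in time, chunked in lat/lon
    output_path="era5_tasmax_full_time.zarr",  # resolved under storage.intermediate.uri
    max_mem="200MB",
    overwrite=False,
)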
def rechunk_zarr_array(
    zarr_array: xr.Dataset,
    zarr_array_location: str,
    variable: str,
    chunk_dims: Union[Tuple, dict] = ("time",),
    max_mem: str = "200MB",
):
    """Use `rechunker` package to adjust chunks of dataset to a form
    conducive for your processing.

    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    zarr_array_location : str
        Path to where the input data is sitting. Only returned/used if
        zarr_array does not need to be rechunked.
    chunk_dims : Union[Tuple, dict]
        Information for chunking the ds. If a dict is passed, it will rechunk
        following the sizes specified. The dict should look like:
        {variable: {'lat': chunk_size_lat, 'lon': chunk_size_lon, 'time': chunk_size_time},
        'lon': None, 'lat': None, 'time': None}.
        If a tuple is passed, it is the dimension(s) along which you want to
        chunk ds, and the optimal chunk sizes will get calculated internally.
    max_mem : str
        The max memory you want to allow for a chunk. Probably want it to be
        around 100 MB, but that is also controlled by the
        `calc_auspicious_chunk_sizes` calls.

    Returns
    -------
    rechunked_ds, path_tgt : Tuple[xr.Dataset, str]
        Rechunked dataset as well as string of location where it's stored.
    """
    if type(chunk_dims) == tuple:
        chunks_dict = {
            variable: calc_auspicious_chunks_dict(zarr_array, chunk_dims=chunk_dims),
            "time": None,  # write None here because you don't want to rechunk this array
            "lon": None,
            "lat": None,
        }
    elif type(chunk_dims) == dict:
        chunks_dict = chunk_dims
        # ensure that the chunks_dict looks the way you want it to as
        # {variable: {'lat': chunk_size_lat, 'lon': chunk_size_lon, 'time': chunk_size_time},
        #  'lon': None, 'lat': None, 'time': None}
        assert variable in chunks_dict
        for dim in ["lat", "lon", "time"]:
            chunks_dict[dim] = None
            assert dim in chunks_dict[variable]

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    target_schema = DataArraySchema(chunks=chunks_dict[variable])
    try:
        # first confirm that you have a zarr_array_location
        assert zarr_array_location is not None
        target_schema.validate(zarr_array[variable])
        # return back the dataset you introduced, along with its existing location,
        # since you haven't created a new dataset
        return zarr_array, zarr_array_location
    except (SchemaError, AssertionError):
        delete_chunks_encoding(zarr_array)
        temp_store, target_store, path_tgt = make_rechunker_stores()
        # delete_chunks_encoding(ds)  # need to do this before since it won't work on zarr array
        # for some reason doing this on zarr arrays is faster than on xr.open_zarr - it calls `copy_chunk` less.
        # TODO: could switch this to a validation with xarray schema - confirm that the chunks are all uniform and
        # if not, chunk them according to the spec provided by `calc_auspicious_chunks_dict`
        try:
            rechunk_plan = rechunk(
                zarr_array, chunks_dict, max_mem, target_store, temp_store=temp_store
            )
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                "WARNING: Failed to write zarr store, perhaps because of variable chunk sizes, trying to rechunk it"
            )
            # make new stores in case it failed mid-write; alternatively could clean up that store but
            # we don't have delete permission currently
            temp_store, target_store, path_tgt = make_rechunker_stores()
            delete_chunks_encoding(zarr_array)
            # TODO: will always work but need to double check the result and if it's taking a long time
            rechunk_plan = rechunk(
                zarr_array.chunk(chunks_dict[variable]),
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        rechunked_ds = xr.open_zarr(target_store)
        # ideally we want consolidated=True but it seems that functionality isn't offered in rechunker right now
        # we can just add a consolidate_metadata step here to do it after the fact (once rechunker is done) but only
        # necessary if we'll reopen this rechunked_ds multiple times
        return rechunked_ds, path_tgt
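# Illustrative call of rechunk_zarr_array above, passing a tuple so that
# calc_auspicious_chunks_dict picks the chunk sizes along 'time'. The dataset,
# its location, and the variable name are assumptions.
ds = xr.open_zarr("obs/era5_tasmax.zarr")
rechunked_ds, path_tgt = rechunk_zarr_array(
    ds,
    zarr_array_location="obs/era5_tasmax.zarr",
    variable="tasmax",
    chunk_dims=("time",),       # chunk along time; sizes computed internally
    max_mem="200MB",
)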