def test_intermediate_to_target_memory():
    """Re-planning from the intermediate chunks must need no further staging.

    A plan is first computed from the source chunks to the target chunks.
    A second plan is then computed starting from the first plan's
    intermediate chunks; its read, intermediate and write chunks are
    asserted to be all equal.
    """
    itemsize = 4
    max_mem = 12_000_000_000  # 12 GB
    shape = (175320, 721, 1440)
    source_chunks = (24, 721, 1440)
    target_chunks = (21915, 103, 10)

    # Only the intermediate chunks of the first plan are needed below.
    _, int_chunks, _ = rechunking_plan(
        shape,
        source_chunks,
        target_chunks,
        itemsize,
        max_mem,
        consolidate_reads=True,
    )

    second_plan = rechunking_plan(
        shape,
        int_chunks,
        target_chunks,
        itemsize,
        max_mem,
        consolidate_reads=True,
    )
    read_chunks2, int_chunks2, write_chunks2 = second_plan

    assert read_chunks2 == int_chunks2 == write_chunks2
def test_rechunking_plan_hypothesis(inputs):
    """Check a generated rechunking scenario end to end.

    ``inputs`` unpacks to ``(shape, source_chunks, target_chunks, max_mem,
    itemsize)``. The test first confirms the scenario itself is valid (one
    source chunk and one target chunk each fit in ``max_mem``), then checks
    that every chunk tuple in the plan has one entry per dimension, and
    finally delegates to ``_verify_plan_correctness``.
    """
    shape, source_chunks, target_chunks, max_mem, itemsize = inputs

    plan = rechunking_plan(shape, source_chunks, target_chunks, itemsize, max_mem)
    read_chunks, int_chunks, write_chunks = plan

    # Preconditions that the input generator is expected to guarantee.
    source_chunk_mem = itemsize * prod(source_chunks)
    target_chunk_mem = itemsize * prod(target_chunks)
    for chunk_mem in (source_chunk_mem, target_chunk_mem):
        assert chunk_mem <= max_mem

    ndim = len(shape)
    for chunks in (read_chunks, int_chunks, write_chunks):
        assert len(chunks) == ndim

    _verify_plan_correctness(
        source_chunks,
        read_chunks,
        int_chunks,
        write_chunks,
        target_chunks,
        itemsize,
        max_mem,
    )
def test_rechunking_plan_2d(
    shape,
    source_chunks,
    target_chunks,
    itemsize,
    max_mem,
    read_chunks_expected,
    intermediate_chunks_expected,
    write_chunks_expected,
):
    """Compare a computed plan against the expected chunks for one scenario.

    The read, intermediate and write chunks returned by ``rechunking_plan``
    must equal the supplied expected values, after which the plan is handed
    to ``_verify_plan_correctness``.
    """
    plan = rechunking_plan(shape, source_chunks, target_chunks, itemsize, max_mem)
    read_chunks, int_chunks, write_chunks = plan

    assert read_chunks == read_chunks_expected
    assert int_chunks == intermediate_chunks_expected
    assert write_chunks == write_chunks_expected

    # NOTE(review): ``shape`` is passed as the first argument here, while
    # test_rechunking_plan_hypothesis calls this helper without it -- confirm
    # against the helper's signature.
    _verify_plan_correctness(
        shape,
        source_chunks,
        read_chunks,
        int_chunks,
        write_chunks,
        target_chunks,
        itemsize,
        max_mem,
    )
def rechunking_plan(
    dim_sizes: Mapping[str, int],
    source_chunks: Mapping[str, int],
    target_chunks: Mapping[str, int],
    itemsize: int,
    max_mem: int,
) -> List[Dict[str, int]]:
    """Make a rechunking plan keyed by dimension name.

    The mappings are flattened to tuples in the iteration order of
    ``dim_sizes``, passed to ``algorithm.rechunking_plan``, and each chunk
    tuple it returns is turned back into a ``{dimension: size}`` dict.

    Raises:
        KeyError: if ``source_chunks`` or ``target_chunks`` lacks a dimension
            that is present in ``dim_sizes``.
    """
    dims = list(dim_sizes)
    shape = tuple(dim_sizes[dim] for dim in dims)
    source = tuple(source_chunks[dim] for dim in dims)
    target = tuple(target_chunks[dim] for dim in dims)

    plan_shapes = algorithm.rechunking_plan(
        shape=shape,
        source_chunks=source,
        target_chunks=target,
        itemsize=itemsize,
        max_mem=max_mem,
    )

    plan = []
    for shapes in plan_shapes:
        plan.append(dict(zip(dims, shapes)))
    return plan
def rechunk_zarr2zarr_w_dask(
    source_array,
    target_chunks,
    max_mem,
    target_store,
    temp_store=None,
    source_storage_options=None,
    temp_storage_options=None,
    target_storage_options=None,
):
    """Build a lazy dask task that copies ``source_array`` into new chunks.

    The chunk plan comes from ``rechunking_plan``. If the intermediate chunks
    equal ``target_chunks`` the source is stored straight into the target.
    Otherwise the data is first stored into an intermediate array in
    ``temp_store`` and then into the target, with both store graphs merged
    into a single graph.

    The target array (and the intermediate array, when needed) is created
    immediately with ``zarr.empty``; the data copy itself is not computed
    here. One line describing the plan is printed.

    Parameters
    ----------
    source_array
        Array to copy; its ``shape``, ``chunks``, ``dtype`` and ``attrs`` are
        read and it is opened with ``dsa.from_zarr``.
    target_chunks
        Chunk shape of the target array.
    max_mem
        Memory limit forwarded to ``rechunking_plan``.
    target_store
        Store passed to ``zarr.empty`` for the target array.
    temp_store
        Store for the intermediate array. Required when the plan needs an
        intermediate step.
    source_storage_options, temp_storage_options
        Forwarded to ``dsa.from_zarr`` for the source and the intermediate
        array respectively. ``None`` is treated as an empty dict.
    target_storage_options
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    The delayed object from ``dsa.store(..., compute=False)`` for a one-step
    plan, or a ``Delayed`` over the fused two-step graph.

    Raises
    ------
    ValueError
        If an intermediate step is needed and ``temp_store`` is ``None``.
    """
    # ``None`` defaults are normalised here so that no dict object is shared
    # between calls through the function's default arguments.
    if source_storage_options is None:
        source_storage_options = {}
    if temp_storage_options is None:
        temp_storage_options = {}

    shape = source_array.shape
    source_chunks = source_array.chunks
    dtype = source_array.dtype
    itemsize = dtype.itemsize

    read_chunks, int_chunks, write_chunks = rechunking_plan(
        shape, source_chunks, target_chunks, itemsize, max_mem
    )

    source_read = dsa.from_zarr(
        source_array, chunks=read_chunks, storage_options=source_storage_options
    )

    # create target
    target_array = zarr.empty(
        shape, chunks=target_chunks, dtype=dtype, store=target_store
    )
    target_array.attrs.update(source_array.attrs)

    if int_chunks == target_chunks:
        target_store_delayed = dsa.store(
            source_read, target_array, lock=False, compute=False
        )
        print("One step rechunking plan")
        return target_store_delayed

    # Two-step plan: an intermediate store is mandatory. This is an explicit
    # check rather than an ``assert`` so it still runs under ``python -O``.
    if temp_store is None:
        raise ValueError(
            "temp_store must be provided when the rechunking plan requires "
            "an intermediate array."
        )

    int_array = zarr.empty(shape, chunks=int_chunks, dtype=dtype, store=temp_store)
    intermediate_store_delayed = dsa.store(
        source_read, int_array, lock=False, compute=False
    )

    int_read = dsa.from_zarr(
        int_array, chunks=write_chunks, storage_options=temp_storage_options
    )
    target_store_delayed = dsa.store(
        int_read, target_array, lock=False, compute=False
    )

    # Now chain the two stores together into a single graph.
    # Get the two graphs as plain dicts so they can be edited.
    int_dsk = dask.utils.ensure_dict(intermediate_store_delayed.dask)
    target_dsk = dask.utils.ensure_dict(target_store_delayed.dask)

    # Find the root key representing the read of the intermediate array.
    root_keys = [
        key
        for key in target_dsk
        if isinstance(key, str) and key.startswith("from-zarr")
    ]
    assert len(root_keys) == 1
    root_key = root_keys[0]

    # Rewrite the root entry as a pass-through that returns its original
    # value but also takes the contents of the intermediate store's final
    # graph entry as extra (ignored) arguments, making the read depend on
    # the intermediate write.
    target_dsk[root_key] = (
        lambda a, *b: a,
        target_dsk[root_key],
        *int_dsk[intermediate_store_delayed.key],
    )
    target_dsk.update(int_dsk)

    # fuse
    dsk_fused, _ = fuse(target_dsk)
    delayed_fused = Delayed(target_store_delayed.key, dsk_fused)

    print("Two step rechunking plan")
    return delayed_fused
def _setup_array_rechunk(
    source_array,
    target_chunks,
    max_mem,
    target_store_or_group,
    target_options=None,
    temp_store_or_group=None,
    temp_options=None,
    name=None,
) -> CopySpec:
    """Plan the rechunking of one array and create its output arrays.

    Computes the read/intermediate/write chunks with ``rechunking_plan``,
    creates the target array (and an intermediate array when the read and
    write chunks differ) via ``_zarr_empty``, and wraps source, intermediate
    and target in ``ArrayProxy`` objects.

    Parameters
    ----------
    source_array
        Array to rechunk. For a ``dask.array.Array`` the chunks are taken
        from ``chunksize``; otherwise from ``chunks``.
    target_chunks
        Target chunk sizes: a sequence, a dict (resolved against the
        dimensions from ``_get_dims_from_zarr_array``), or ``None`` to keep
        the source chunks (a pass-through copy).
    max_mem
        Memory limit; parsed with ``dask.utils.parse_bytes``.
    target_store_or_group
        Location handed to ``_zarr_empty`` for the target array.
    target_options, temp_options
        Optional keyword arguments for ``_zarr_empty``; both are checked by
        ``_validate_options``.
    temp_store_or_group
        Location for the intermediate array. Required when the plan's read
        and write chunks differ.
    name
        Forwarded to ``_zarr_empty`` as ``name``.

    Returns
    -------
    CopySpec
        Built from the read, intermediate and write proxies. The
        intermediate proxy wraps ``None`` when no intermediate array is
        needed.

    Raises
    ------
    KeyError
        If ``target_chunks`` is a dict that cannot be resolved for every
        array dimension.
    ValueError
        If an intermediate array is needed and ``temp_store_or_group`` is
        ``None``.
    """
    _validate_options(target_options)
    _validate_options(temp_options)
    shape = source_array.shape
    source_chunks = (
        source_array.chunksize
        if isinstance(source_array, dask.array.Array)
        else source_array.chunks
    )
    dtype = source_array.dtype
    itemsize = dtype.itemsize

    if target_chunks is None:
        # this is just a pass-through copy
        target_chunks = source_chunks

    if isinstance(target_chunks, dict):
        array_dims = _get_dims_from_zarr_array(source_array)
        try:
            target_chunks = _shape_dict_to_tuple(array_dims, target_chunks)
        except KeyError:
            raise KeyError(
                "You must explicitly specify each dimension size in target_chunks. "
                f"Got array_dims {array_dims}, target_chunks {target_chunks}."
            )

    # TODO: rewrite to avoid the hard dependency on dask
    max_mem = dask.utils.parse_bytes(max_mem)

    # don't consolidate reads for Dask arrays
    consolidate_reads = isinstance(source_array, zarr.core.Array)
    read_chunks, int_chunks, write_chunks = rechunking_plan(
        shape,
        source_chunks,
        target_chunks,
        itemsize,
        max_mem,
        consolidate_reads=consolidate_reads,
    )

    # create target
    shape = tuple(int(x) for x in shape)  # ensure python ints for serialization
    target_chunks = tuple(int(x) for x in target_chunks)
    int_chunks = tuple(int(x) for x in int_chunks)
    write_chunks = tuple(int(x) for x in write_chunks)
    target_array = _zarr_empty(
        shape,
        target_store_or_group,
        target_chunks,
        dtype,
        name=name,
        **(target_options or {}),
    )
    # Copying attributes is best-effort: a source without ``attrs`` is
    # tolerated.
    try:
        target_array.attrs.update(source_array.attrs)
    except AttributeError:
        pass

    if read_chunks == write_chunks:
        int_array = None
    else:
        # An intermediate store is mandatory here. This is an explicit check
        # rather than an ``assert`` so it still runs under ``python -O``.
        if temp_store_or_group is None:
            raise ValueError(
                "A temporary store location must be provided{}.".format(
                    f" (array={name})" if name else ""
                )
            )
        int_array = _zarr_empty(
            shape,
            temp_store_or_group,
            int_chunks,
            dtype,
            name=name,
            **(temp_options or {}),
        )

    read_proxy = ArrayProxy(source_array, read_chunks)
    int_proxy = ArrayProxy(int_array, int_chunks)
    write_proxy = ArrayProxy(target_array, write_chunks)
    return CopySpec(read_proxy, int_proxy, write_proxy)
def _rechunk_array( source_array, target_chunks, max_mem, target_store_or_group, temp_store_or_group=None, name=None, source_storage_options={}, temp_storage_options={}, target_storage_options={}, ): shape = source_array.shape source_chunks = source_array.chunks dtype = source_array.dtype itemsize = dtype.itemsize if target_chunks is None: # this is just a pass-through copy target_chunks = source_chunks if isinstance(target_chunks, dict): array_dims = _get_dims_from_zarr_array(source_array) try: target_chunks = _shape_dict_to_tuple(array_dims, target_chunks) except KeyError: raise KeyError( "You must explicitly specify each dimension size in target_chunks. " f"Got array_dims {array_dims}, target_chunks {target_chunks}.") read_chunks, int_chunks, write_chunks = rechunking_plan( shape, source_chunks, target_chunks, itemsize, max_mem) print(source_chunks, read_chunks, int_chunks, write_chunks, target_chunks) source_read = dsa.from_zarr(source_array, chunks=read_chunks, storage_options=source_storage_options) # create target shape = tuple(int(x) for x in shape) # ensure python ints for serialization target_chunks = tuple(int(x) for x in target_chunks) int_chunks = tuple(int(x) for x in int_chunks) write_chunks = tuple(int(x) for x in write_chunks) target_array = _zarr_empty(shape, target_store_or_group, target_chunks, dtype, name=name) target_array.attrs.update(source_array.attrs) if read_chunks == write_chunks: target_store_delayed = dsa.store(source_read, target_array, lock=False, compute=False) return target_store_delayed else: # do intermediate store assert temp_store_or_group is not None int_array = _zarr_empty(shape, temp_store_or_group, int_chunks, dtype, name=name) intermediate_store_delayed = dsa.store(source_read, int_array, lock=False, compute=False) int_read = dsa.from_zarr(int_array, chunks=write_chunks, storage_options=temp_storage_options) target_store_delayed = dsa.store(int_read, target_array, lock=False, compute=False) # now do some hacking to chain 
these together into a single graph. # get the two graphs as dicts int_dsk = dask.utils.ensure_dict(intermediate_store_delayed.dask) target_dsk = dask.utils.ensure_dict(target_store_delayed.dask) # find the root store key representing the read root_keys = [] for key in target_dsk: if isinstance(key, str): if key.startswith("from-zarr"): root_keys.append(key) assert len(root_keys) == 1 root_key = root_keys[0] # now rewrite the graph target_dsk[root_key] = ( lambda a, *b: a, target_dsk[root_key], *int_dsk[intermediate_store_delayed.key], ) target_dsk.update(int_dsk) # fuse dsk_fused, deps = fuse(target_dsk) delayed_fused = Delayed(target_store_delayed.key, dsk_fused) print("Two step rechunking plan") return delayed_fused