def test_rechunk_no_temp_dir_provided_error(rechunk_args):
    # Verify that the correct error is raised when no temp_store is given
    # and the chunks to write differ from the chunks to read
    args = {k: v for k, v in rechunk_args.items() if k != "temp_store"}
    with pytest.raises(
        ValueError, match="A temporary store location must be provided"
    ):
        api.rechunk(**args).execute()
def test_rechunk_bad_target_chunks(rechunk_args):
    if not _is_collection(rechunk_args["source"]):
        return
    rechunk_args = dict(rechunk_args)
    rechunk_args["target_chunks"] = (10, 10)
    with pytest.raises(
        ValueError, match="You must specify ``target-chunks`` as a dict"
    ):
        api.rechunk(**rechunk_args)
def test_rechunk_invalid_source(tmp_path):
    with pytest.raises(
        ValueError,
        match="Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset",
    ):
        api.rechunk(
            [[1, 2], [3, 4]], target_chunks=(10, 10), max_mem=100, target_store=tmp_path
        )
def test_rechunk_no_target_chunks(rechunk_args):
    rechunk_args = dict(rechunk_args)
    if _is_collection(rechunk_args["source"]):
        rechunk_args["target_chunks"] = {v: None for v in rechunk_args["source"]}
    else:
        rechunk_args["target_chunks"] = None
    api.rechunk(**rechunk_args)
def test_rechunk_passthrough(rechunk_args):
    # Verify that no errors are raised when the target chunks == source chunks
    if _is_collection(rechunk_args["source"]):
        rechunk_args["target_chunks"] = {v: None for v in rechunk_args["source"]}
    else:
        rechunk_args["target_chunks"] = None
    api.rechunk(**rechunk_args).execute()
def test_unsupported_executor(tmp_path, source, target_chunks, executor):
    with pytest.raises(
        NotImplementedError,
        match="Executor type .* not supported for source",
    ):
        api.rechunk(
            source,
            target_chunks=target_chunks,
            max_mem=1600,
            target_store=str(tmp_path / "target.zarr"),
            temp_store=str(tmp_path / "temp.zarr"),
            executor=executor,
        )
def rechunked(tmp_path, request):
    if request.param == "Group":
        store_source = str(tmp_path / "source.zarr")
        group = zarr.group(store_source)
        group.attrs["foo"] = "bar"
        # 800 byte chunks
        a = group.ones("a", shape=(5, 10, 20), chunks=(1, 10, 20), dtype="f4")
        a.attrs["foo"] = "bar"
        b = group.ones("b", shape=(20,), chunks=(10,), dtype="f4")
        b.attrs["foo"] = "bar"
        target_store = str(tmp_path / "target.zarr")
        temp_store = str(tmp_path / "temp.zarr")
        max_mem = 1600  # should force a two-step plan for a
        target_chunks = {"a": (5, 10, 4), "b": (20,)}
        rechunked = api.rechunk(
            group, target_chunks, max_mem, target_store, temp_store=temp_store
        )
    else:
        shape = (8000, 8000)
        source_chunks = (200, 8000)
        dtype = "f4"
        max_mem = 25600000
        dims = None
        target_chunks = (8000, 200)

        store_source = str(tmp_path / "source.zarr")
        source_array = zarr.ones(
            shape, chunks=source_chunks, dtype=dtype, store=store_source
        )
        # add some attributes
        source_array.attrs["foo"] = "bar"
        if dims:
            source_array.attrs[_DIMENSION_KEY] = dims

        ### Create targets ###
        target_store = str(tmp_path / "target.zarr")
        temp_store = str(tmp_path / "temp.zarr")
        rechunked = api.rechunk(
            source_array, target_chunks, max_mem, target_store, temp_store=temp_store
        )
    return rechunked
def rechunk(compressor):
    options = _wrap_options(
        rechunk_args["source"], dict(overwrite=True, compressor=compressor)
    )
    rechunked = api.rechunk(**rechunk_args, target_options=options)
    rechunked.execute()
    return sum(
        file.stat().st_size
        for file in Path(rechunked._target.store.path).rglob("*")
    )
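# The helper above is presumably nested inside a compression test (the enclosing
# test is not part of this excerpt). A hypothetical use would compare the
# on-disk size of a compressed target against an uncompressed one, e.g.:
#
#     assert rechunk(zarr.Blosc(cname="zstd", clevel=9)) < rechunk(None)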
def test_rechunk_dask_array(
    tmp_path, shape, source_chunks, dtype, target_chunks, max_mem
):
    ### Create source array ###
    source_array = dsa.ones(shape, chunks=source_chunks, dtype=dtype)

    ### Create targets ###
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")
    rechunked = api.rechunk(
        source_array, target_chunks, max_mem, target_store, temp_store=temp_store
    )
    assert isinstance(rechunked, api.Rechunked)

    target_array = zarr.open(target_store)
    assert target_array.chunks == tuple(target_chunks)

    result = rechunked.execute()
    assert isinstance(result, zarr.Array)
    a_tar = dsa.from_zarr(target_array)
    assert dsa.equal(a_tar, 1).all().compute()
def test_rechunk_group(tmp_path):
    store_source = str(tmp_path / "source.zarr")
    group = zarr.group(store_source)
    group.attrs["foo"] = "bar"
    # 800 byte chunks
    a = group.ones("a", shape=(5, 10, 20), chunks=(1, 10, 20), dtype="f4")
    a.attrs["foo"] = "bar"
    b = group.ones("b", shape=(20,), chunks=(10,), dtype="f4")
    b.attrs["foo"] = "bar"

    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")
    max_mem = 1600  # should force a two-step plan for a
    target_chunks = {"a": (5, 10, 4), "b": (20,)}

    delayed = api.rechunk(
        group, target_chunks, max_mem, target_store, temp_store=temp_store
    )
    target_group = zarr.open(target_store)
    assert "a" in target_group
    assert "b" in target_group
    assert dict(group.attrs) == dict(target_group.attrs)

    dask.compute(delayed)
    for aname in target_chunks:
        a_tar = dsa.from_zarr(target_group[aname])
        assert dsa.equal(a_tar, 1).all().compute()
def test_rechunk_dataset(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
    executor,
    target_store,
    temp_store,
):
    if target_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    ds = example_dataset(shape).chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(
        a=dict(
            compressor=zarr.Blosc(cname="zstd"),
            dtype="int32",
            scale_factor=0.1,
            _FillValue=-9999,
        )
    )
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == options["a"]["dtype"]
    assert all(dst.a.values[-1] == options["a"]["_FillValue"])
    assert dst.a.encoding["compressor"] is not None

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = (
        target_chunks["a"]
        if isinstance(target_chunks["a"], tuple)
        else (target_chunks["a"]["x"], target_chunks["a"]["y"])
    )
    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == source_chunks[1:]

    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
def test_rechunk_option_overwrite(rechunk_args):
    api.rechunk(**rechunk_args).execute()
    # TODO: make this match more reliable based on outcome of
    # https://github.com/zarr-developers/zarr-python/issues/605
    with pytest.raises(ValueError, match=r"path .* contains an array"):
        api.rechunk(**rechunk_args).execute()
    options = _wrap_options(rechunk_args["source"], dict(overwrite=True))
    api.rechunk(**rechunk_args, target_options=options).execute()
def test_rechunk_dataset_dimchunks(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
):
    temp_store = str(tmp_path / "temp.zarr")
    target_store = str(tmp_path / "target.zarr")

    ds = example_dataset(shape).chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(
        a=dict(
            compressor=zarr.Blosc(cname="zstd"),
            dtype="int32",
            scale_factor=0.1,
            _FillValue=-9999,
        )
    )
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
    )
    assert isinstance(rechunked, api.Rechunked)

    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = [
        target_chunks.get("x", source_chunks[0]),
        target_chunks.get("y", source_chunks[1]),
    ]
    if target_chunks_expected[1] < 0 or target_chunks_expected[1] > len(ds.y):
        target_chunks_expected[1] = len(ds.y)
    target_chunks_expected = tuple(target_chunks_expected)

    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == target_chunks_expected[1:]

    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
def test_pywren_function_executor(tmp_path):
    pytest.importorskip("pywren_ibm_cloud")
    from rechunker.executors.pywren import (
        pywren_local_function_executor,
        PywrenExecutor,
    )

    # Create a Pywren function executor that we manage ourselves
    # and pass in to rechunker's PywrenExecutor
    with pywren_local_function_executor() as function_executor:
        executor = PywrenExecutor(function_executor)

        shape = (8000, 8000)
        source_chunks = (200, 8000)
        dtype = "f4"
        max_mem = 25600000
        target_chunks = (400, 8000)

        ### Create source array ###
        store_source = str(tmp_path / "source.zarr")
        source_array = zarr.ones(
            shape, chunks=source_chunks, dtype=dtype, store=store_source
        )

        ### Create targets ###
        target_store = str(tmp_path / "target.zarr")
        temp_store = str(tmp_path / "temp.zarr")

        rechunked = api.rechunk(
            source_array,
            target_chunks,
            max_mem,
            target_store,
            temp_store=temp_store,
            executor=executor,
        )
        assert isinstance(rechunked, api.Rechunked)

        target_array = zarr.open(target_store)
        assert target_array.chunks == tuple(target_chunks)

        result = rechunked.execute()
        assert isinstance(result, zarr.Array)
        a_tar = dsa.from_zarr(target_array)
        assert dsa.equal(a_tar, 1).all().compute()
def test_rechunk_group(tmp_path, executor, source_store, target_store, temp_store):
    if source_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        store_source = fsspec.get_mapper(str(tmp_path) + source_store)
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        store_source = str(tmp_path / source_store)
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    group = zarr.group(store_source)
    group.attrs["foo"] = "bar"
    # 800 byte chunks
    a = group.ones("a", shape=(5, 10, 20), chunks=(1, 10, 20), dtype="f4")
    a.attrs["foo"] = "bar"
    b = group.ones("b", shape=(20,), chunks=(10,), dtype="f4")
    b.attrs["foo"] = "bar"

    max_mem = 1600  # should force a two-step plan for a
    target_chunks = {"a": (5, 10, 4), "b": (20,)}

    rechunked = api.rechunk(
        group,
        target_chunks,
        max_mem,
        target_store,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    target_group = zarr.open(target_store)
    assert "a" in target_group
    assert "b" in target_group
    assert dict(group.attrs) == dict(target_group.attrs)

    rechunked.execute()
    for aname in target_chunks:
        assert target_group[aname].chunks == target_chunks[aname]
        a_tar = dsa.from_zarr(target_group[aname])
        assert dsa.equal(a_tar, 1).all().compute()
def test_no_intermediate_fused(tmp_path):
    shape = (8000, 8000)
    source_chunks = (200, 8000)
    dtype = "f4"
    max_mem = 25600000
    target_chunks = (400, 8000)

    store_source = str(tmp_path / "source.zarr")
    source_array = zarr.ones(
        shape, chunks=source_chunks, dtype=dtype, store=store_source
    )

    target_store = str(tmp_path / "target.zarr")

    rechunked = api.rechunk(source_array, target_chunks, max_mem, target_store)

    num_tasks = len([v for v in rechunked.plan.dask.values() if dask.core.istask(v)])
    assert num_tasks < 20  # less than if no fuse
def test_rechunk_array(
    tmp_path, shape, source_chunks, dtype, dims, target_chunks, max_mem, executor
):
    ### Create source array ###
    store_source = str(tmp_path / "source.zarr")
    source_array = zarr.ones(
        shape, chunks=source_chunks, dtype=dtype, store=store_source
    )
    # add some attributes
    source_array.attrs["foo"] = "bar"
    if dims:
        source_array.attrs[_DIMENSION_KEY] = dims

    ### Create targets ###
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    rechunked = api.rechunk(
        source_array,
        target_chunks,
        max_mem,
        target_store,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    target_array = zarr.open(target_store)
    if isinstance(target_chunks, dict):
        target_chunks_list = [target_chunks[d] for d in dims]
    else:
        target_chunks_list = target_chunks
    assert target_array.chunks == tuple(target_chunks_list)
    assert dict(source_array.attrs) == dict(target_array.attrs)

    result = rechunked.execute()
    assert isinstance(result, zarr.Array)
    a_tar = dsa.from_zarr(target_array)
    assert dsa.equal(a_tar, 1).all().compute()
def test_rechunk_invalid_option(rechunk_args):
    if isinstance(rechunk_args["source"], xarray.Dataset):
        # Options are essentially unbounded for Xarray (for CF encoding params),
        # so check only options with special error cases
        options = _wrap_options(rechunk_args["source"], {"chunks": 10})
        with pytest.raises(
            ValueError,
            match="Chunks must be provided in ``target_chunks`` rather than options",
        ):
            api.rechunk(**rechunk_args, target_options=options)
    else:
        for o in ["shape", "chunks", "dtype", "store", "name", "unknown"]:
            options = _wrap_options(rechunk_args["source"], {o: True})
            with pytest.raises(ValueError, match=f"Zarr options must not include {o}"):
                api.rechunk(**rechunk_args, temp_options=options)
            with pytest.raises(ValueError, match=f"Zarr options must not include {o}"):
                api.rechunk(**rechunk_args, target_options=options)
    )
    target_chunks = {
        var: encoding[var]["chunks"] for var in encoding if "chunks" in encoding[var]
    }
    target_options = {
        var: {k: v for k, v in encoding[var].items() if k != "chunks"}
        for var in encoding
    }
    with tempfile.TemporaryDirectory(
        prefix="bgen_to_zarr_", suffix=".zarr", dir=tempdir
    ) as tmpdir:
        rechunked = rechunker_api.rechunk(
            ds,
            max_mem=max_mem,
            target_chunks=target_chunks,
            target_store=output,
            target_options=target_options,
            temp_store=tmpdir,
            executor="dask",
        )
        rechunked.execute()

    zarr.consolidate_metadata(output)
    ds: Dataset = xr.open_zarr(output, concat_characters=False)  # type: ignore[no-untyped-call]
    if pack:
        ds = unpack_variables(ds)

    return ds
def test_rechunk_dataset(
    tmp_path, shape, source_chunks, target_chunks, max_mem, executor
):
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    a = numpy.arange(numpy.prod(shape)).reshape(shape).astype("f4")
    a[-1] = numpy.nan
    ds = xarray.Dataset(
        dict(
            a=xarray.DataArray(
                a, dims=["x", "y"], attrs={"a1": 1, "a2": [1, 2, 3], "a3": "x"}
            ),
            b=xarray.DataArray(numpy.ones(shape[0]), dims=["x"]),
            c=xarray.DataArray(numpy.ones(shape[1]), dims=["y"]),
        ),
        coords=dict(
            cx=xarray.DataArray(numpy.ones(shape[0]), dims=["x"]),
            cy=xarray.DataArray(numpy.ones(shape[1]), dims=["y"]),
        ),
        attrs={"a1": 1, "a2": [1, 2, 3], "a3": "x"},
    )
    ds = ds.chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(
        a=dict(
            compressor=zarr.Blosc(cname="zstd"),
            dtype="int32",
            scale_factor=0.1,
            _FillValue=-9999,
        )
    )
    rechunked = api.rechunk(
        ds,
        target_chunks=dict(a=target_chunks, b=target_chunks[:1]),
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == options["a"]["dtype"]
    assert all(dst.a.values[-1] == options["a"]["_FillValue"])
    assert dst.a.encoding["compressor"] is not None

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    assert dst.a.data.chunksize == target_chunks
    assert dst.b.data.chunksize == target_chunks[:1]
    assert dst.c.data.chunksize == source_chunks[1:]

    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
def rechunked(rechunk_args):
    return api.rechunk(**rechunk_args)
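# The ``rechunk_args`` fixture consumed by most of the tests above is not part
# of this excerpt. A minimal sketch of what such a fixture might return is given
# below; the decorator, shapes, chunk sizes, and max_mem value are assumptions
# for illustration, not the repository's actual definition.
@pytest.fixture
def rechunk_args(tmp_path):
    # Hypothetical source: a small Zarr array with row-oriented chunks
    source = zarr.ones(
        (100, 50),
        chunks=(10, 50),
        dtype="f4",
        store=str(tmp_path / "source.zarr"),
    )
    # Keyword arguments unpacked into api.rechunk(**rechunk_args)
    return dict(
        source=source,
        target_chunks=(50, 10),
        max_mem=25600,
        target_store=str(tmp_path / "target.zarr"),
        temp_store=str(tmp_path / "temp.zarr"),
    )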