Example #1
import os

import psutil
import zarr
from rechunker import rechunk


def rechunk_zarr(arr,
                 target_chunks,
                 target_store,
                 intermediate_store='temp/intermediate_ld_rechunk.zarr',
                 **kwargs):

    # remove any existing target store so rechunker can write a fresh one
    if os.path.isdir(target_store):
        try:
            z = zarr.open(target_store)
            z.store.rmdir()
        except Exception as e:
            raise e

    rechunked = rechunk(arr,
                        target_chunks=target_chunks,
                        target_store=target_store,
                        temp_store=intermediate_store,
                        max_mem=psutil.virtual_memory().available /
                        psutil.cpu_count(),
                        **kwargs)

    try:
        rechunked.execute()
        # Delete the older stores:
        zarr.open(intermediate_store).store.rmdir()
        arr.store.rmdir()
    except Exception as e:
        raise e

    return zarr.open(target_store)
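
A minimal usage sketch for this helper follows; the array and store paths are hypothetical, and it assumes the imports above. Note that the helper also deletes the source store (arr.store.rmdir()) once the rechunk succeeds.

# Hypothetical call: rewrite a tall-and-skinny chunked array into wide chunks.
source = zarr.open("data/ld_matrix.zarr", mode="r+")  # existing Zarr array; path is illustrative

rechunked = rechunk_zarr(
    source,
    target_chunks=(source.shape[0], 1000),
    target_store="data/ld_matrix_rechunked.zarr",
)
print(rechunked.chunks)  # chunks of the newly written target store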
Example #2
                fs.rm(url, recursive=True)

    elif command == "create":
        # create a random Zarr array for the source

        # dask_chunks determine the unit of work - they are bigger than the chunks
        # of the zarr file we are writing to
        dask_chunks = (800, 4000)
        zarr_chunks = source_chunks

        executor = PywrenExecutor(pywren.local_executor())

        arr = da.random.random(size=shape, chunks=dask_chunks)
        itemsize = arr.dtype.itemsize
        max_mem = str(itemsize * prod(dask_chunks))
        plan = rechunk(arr, zarr_chunks, max_mem, source_store)

        plan.execute(scheduler=executor_scheduler,
                     executor=executor,
                     batch_size=100)

    elif command == "rechunk":
        # rechunk the source to the target

        max_mem = 25_600_000

        executor = PywrenExecutor(pywren.local_executor())

        source_array = zarr.open_array(source_store, mode="r")

        plan = rechunk(source_array,
                       target_chunks,
                       max_mem,
                       target_store,
                       temp_store=temp_store)
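
For reference, the max_mem values in this script follow from simple chunk arithmetic: one float64 dask chunk of shape (800, 4000) occupies 8 × 800 × 4000 = 25,600,000 bytes, which is what the "create" branch computes and what the "rechunk" branch hard-codes (max_mem = 25_600_000; see also Example #3). A quick check, with numpy standing in for whichever `prod` the original script imported:

import numpy as np

dask_chunks = (800, 4000)
itemsize = np.dtype("float64").itemsize          # 8 bytes per element
max_mem = itemsize * int(np.prod(dask_chunks))   # 8 * 800 * 4000
assert max_mem == 25_600_000                     # matches the literal used for "rechunk"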
Example #3
def test_rechunker_local():
    source_url = "data/source.zarr"
    target_url = "data/target.zarr"
    temp_url = "data/temp.zarr"

    # delete
    for url in (source_url, target_url, temp_url):
        delete(url)

    source_store = fsspec.get_mapper(source_url)
    target_store = fsspec.get_mapper(target_url)
    temp_store = fsspec.get_mapper(temp_url)

    shape = (4000, 4000)
    source_chunks = (400, 4000)
    target_chunks = (4000, 400)

    executor = PywrenExecutor(pywren.local_executor())

    # create

    # dask_chunks determine the unit of work - they are bigger than the chunks
    # of the zarr file we are writing to
    dask_chunks = (800, 4000)
    zarr_chunks = source_chunks

    arr = da.random.random(size=shape, chunks=dask_chunks)
    itemsize = arr.dtype.itemsize
    max_mem = str(itemsize * prod(dask_chunks))
    plan = rechunk(arr, zarr_chunks, max_mem, source_store)

    plan.execute(
        scheduler=executor_scheduler,
        executor=executor,
        batch_size=100
    )

    z = zarr.open_array(source_store, mode="r")
    assert z.shape == shape
    assert z.chunks == source_chunks
    assert z.nchunks == 10
    assert z.dtype == float

    # rechunk

    max_mem = 25_600_000

    source_array = zarr.open_array(source_store, mode="r")

    plan = rechunk(
        source_array, target_chunks, max_mem, target_store, temp_store=temp_store
    )

    plan.execute(
        scheduler=executor_scheduler,
        executor=executor,
        batch_size=100
    )

    z = zarr.open_array(target_store, mode="r")
    assert z.shape == shape
    assert z.chunks == target_chunks
    assert z.nchunks == 10
    assert z.dtype == float
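
The `delete` helper used at the top of this test is not shown in the snippet. Based on the `fs.rm(url, recursive=True)` call visible in Example #2, a plausible fsspec-based sketch is given below; this is an assumption, not the original implementation.

import fsspec

def delete(url):
    # Remove the store at `url` if it exists; works for local paths and fsspec URLs.
    mapper = fsspec.get_mapper(url)
    fs, root = mapper.fs, mapper.root
    if fs.exists(root):
        fs.rm(root, recursive=True)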
Example #4
def transpose_rechunk(
    ds,
    chunks,
    iters=None,
    face=None,
    subsampling=None,
    name=None,
    out_dir=work_data_dir + "rechunked/",
    overwrite=True,
    max_mem="25GB",
    verbose=0,
    debug=False,
):
    """Rechunk a data array

    cluster = PBSCluster(cores=2, processes=1, walltime='03:00:00')

    Parameters:
        ds: xarray.Dataset
            input dataset, expected to contain a single data variable
        chunks: tuple
            (Nt, Nj, Ni)
        iters: xarray.DataArray
            mitgcm iterations to consider
        face: int, optional
            face to consider
        out_dir: str, optional
            output path
        max_mem: str, optional
            rechunker parameter
        verbose: int, optional
            verbosity level; 0 disables extra output

    """

    if name is None:
        vnames = list(ds)
        assert len(
            vnames
        ) == 1, "You should have only one variable in the xr dataset"
        v = vnames[0]
    else:
        v = name

    if face is not None:
        ds = ds.sel(face=face)
        suff = "_f{:02d}.zarr".format(int(face))
        print(" face={}".format(int(face)))
    else:
        suff = ".zarr"

    # rechunker outputs
    target_store = out_dir + v + suff
    temp_store = out_dir + "tmp.zarr"

    # clean archives if necessary
    rmtree(temp_store, ignore_errors=True)
    if os.path.isdir(target_store):
        if overwrite:
            rmtree(target_store)
        else:
            print("Do not overwrite")
            # assert False, 'Archive exists and you do not want to overwrite'
            return

    # select common time line
    if iters is not None:
        t0 = ds["time"].where(ds.iters == iters[0], drop=True).values[0]
        t1 = ds["time"].where(ds.iters == iters[-1], drop=True).values[0]
        ds = ds.sel(time=slice(t0, t1))
        ds["dtime"] = ds["dtime"].compute()
        ds["iters"] = ds["iters"].compute()

    if subsampling is not None:
        i_dim, j_dim = get_ij_dims(ds[v])
        ds = ds.isel(
            **{
                i_dim: slice(0, None, subsampling),
                j_dim: slice(0, None, subsampling),
            })

    # deal with the time dimension
    Nt = len(ds.time) - 1 if chunks[0] == 0 else chunks[0]
    chunks = (Nt, chunks[1], chunks[2])
    # -1 is to obtain 8784, which you can divide by 4**2
    # necessary? yes
    # truncate the time axis to a whole number of Nt-sized chunks
    ds = ds.isel(time=slice(len(ds.time) // Nt * Nt))

    # init rechunker
    target_chunks = get_chunks(chunks, v, 1, verbose=verbose)
    r = rechunk(ds,
                target_chunks,
                max_mem,
                target_store,
                temp_store=temp_store)

    if verbose > 0:
        print_rechunk(r, v)

    # exec
    if debug:
        return r
    result = r.execute()

    # clean up intermediate file
    rmtree(temp_store, ignore_errors=True)

    print(" rechunking over")
Example #5
def save_minian(
    var: xr.DataArray,
    dpath: str,
    meta_dict: Optional[dict] = None,
    overwrite=False,
    chunks: Optional[dict] = None,
    compute=True,
    mem_limit="500MB",
) -> xr.DataArray:
    """
    Save a `xr.DataArray` with `zarr` storage backend following minian
    conventions.

    This function will store arbitrary `xr.DataArray` into `dpath` with `zarr`
    backend. A separate folder will be created under `dpath`, with folder name
    `var.name + ".zarr"`. Optionally metadata can be retrieved from directory
    hierarchy and added as coordinates of the `xr.DataArray`. In addition, an
    on-disk rechunking of the result can be performed using
    :func:`rechunker.rechunk` if `chunks` are given.

    Parameters
    ----------
    var : xr.DataArray
        The array to be saved.
    dpath : str
        The path to the minian dataset directory.
    meta_dict : dict, optional
        How metadata should be retrieved from directory hierarchy. The keys
        should be negative integers representing directory level relative to
        `dpath` (so `-1` means the immediate parent directory of `dpath`), and
        values should be the name of dimensions represented by the corresponding
        level of directory. The actual coordinate value of the dimensions will
        be the directory name of corresponding level. By default `None`.
    overwrite : bool, optional
        Whether to overwrite the result on disk. By default `False`.
    chunks : dict, optional
        A dictionary specifying the desired chunk size. The chunk size should be
        specified using :doc:`dask:array-chunks` convention, except the "auto"
        specification is not supported. The rechunking operation will be
        carried out with on-disk algorithms using :func:`rechunker.rechunk`. By
        default `None`.
    compute : bool, optional
        Whether to compute `var` and save it immediately. By default `True`.
    mem_limit : str, optional
        The memory limit for the on-disk rechunking algorithm, passed to
        :func:`rechunker.rechunk`. Only used if `chunks` is not `None`. By
        default `"500MB"`.

    Returns
    -------
    var : xr.DataArray
        The array representation of saving result. If `compute` is `True`, then
        the returned array will only contain delayed task of loading the on-disk
        `zarr` arrays. Otherwise all computation leading to the input `var` will
        be preserved in the result.

    Examples
    --------
    The following will save the variable `var` to directory
    `/spatial_memory/alpha/learning1/minian/important_array.zarr`, with the
    additional coordinates: `{"session": "learning1", "animal": "alpha",
    "experiment": "spatial_memory"}`.

    >>> save_minian(
    ...     var.rename("important_array"),
    ...     "/spatial_memory/alpha/learning1/minian",
    ...     {-1: "session", -2: "animal", -3: "experiment"},
    ... ) # doctest: +SKIP
    """
    dpath = os.path.normpath(dpath)
    Path(dpath).mkdir(parents=True, exist_ok=True)
    ds = var.to_dataset()
    if meta_dict is not None:
        pathlist = os.path.split(os.path.abspath(dpath))[0].split(os.sep)
        ds = ds.assign_coords(
            **dict([(dn, pathlist[di]) for dn, di in meta_dict.items()])
        )
    md = {True: "a", False: "w-"}[overwrite]
    fp = os.path.join(dpath, var.name + ".zarr")
    if overwrite:
        try:
            shutil.rmtree(fp)
        except FileNotFoundError:
            pass
    arr = ds.to_zarr(fp, compute=compute, mode=md)
    if (chunks is not None) and compute:
        chunks = {d: var.sizes[d] if v <= 0 else v for d, v in chunks.items()}
        dst_path = os.path.join(dpath, str(uuid4()))
        temp_path = os.path.join(dpath, str(uuid4()))
        with da.config.set(
            array_optimize=darr.optimization.optimize,
            delayed_optimize=default_delay_optimize,
        ):
            zstore = zr.open(fp)
            rechk = rechunker.rechunk(
                zstore[var.name], chunks, mem_limit, dst_path, temp_store=temp_path
            )
            rechk.execute()
        try:
            shutil.rmtree(temp_path)
        except FileNotFoundError:
            pass
        arr_path = os.path.join(fp, var.name)
        for f in os.listdir(arr_path):
            os.remove(os.path.join(arr_path, f))
        for f in os.listdir(dst_path):
            os.rename(os.path.join(dst_path, f), os.path.join(arr_path, f))
        os.rmdir(dst_path)
    if compute:
        arr = xr.open_zarr(fp)[var.name]
        arr.data = darr.from_zarr(os.path.join(fp, var.name), inline_array=True)
    return arr
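
To illustrate the `chunks` convention described in the docstring (a non-positive size expands to the full length of that dimension), here is a hedged sketch; the variable name, dimension names, and path are made up, and it assumes `save_minian` (and the minian module it lives in) is importable:

import numpy as np
import xarray as xr

# Hypothetical movie-like array with (frame, height, width) dimensions.
movie = xr.DataArray(
    np.random.rand(1000, 64, 64).astype("float32"),
    dims=["frame", "height", "width"],
    name="Y_demo",
)

# Ask rechunker for chunks of 100 frames spanning the full spatial extent
# (-1 expands to the corresponding dimension size inside save_minian).
saved = save_minian(
    movie,
    "./minian_demo",
    overwrite=True,
    chunks={"frame": 100, "height": -1, "width": -1},
)
print(saved.chunks)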
Example #6
def rechunk_zarr_array_with_caching(
    zarr_array: xr.Dataset,
    chunking_approach: Optional[str] = None,
    template_chunk_array: Optional[xr.Dataset] = None,
    output_path: Optional[str] = None,
    max_mem: str = "200MB",
    overwrite: bool = False,
) -> xr.Dataset:
    """Use `rechunker` package to adjust chunks of dataset to a form
    conducive for your processing.
    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    output_path : str
        Path to where the output data is saved. If the output path is not empty, its content will be loaded and checked against the target schema. If the schema
        check passes, the content will be returned without rechunking again (i.e. caching); otherwise, the content can be overwritten (see the overwrite option).
    chunking_approach : str
        Has to be one of `full_space` or `full_time`. If `full_space`, the data will be rechunked such that the space dimensions are contiguous (i.e. each chunk
        will contain full maps). If `full_time`, the data will be rechunked such that the time dimension is contiguous (i.e. each chunk will contain full time
        series). Ignored when `template_chunk_array` is provided.
    template_chunk_array : xr.Dataset, optional
        An already-chunked dataset whose time/lat/lon chunk sizes (capped at the input's dimension sizes) are used as the target chunking instead of `chunking_approach`.
    max_mem : str
        The maximum memory you want to allow for a chunk. Around 100 MB is typical, but the effective chunk sizes are also shaped by the `calc_auspicious_chunks_dict` calls.
    overwrite : bool
        Whether to overwrite the content saved at output_path if the content did not pass schema check.
    Returns
    -------
    rechunked_ds : xr.Dataset
        Rechunked dataset
    """
    # determine the chunking schema
    if template_chunk_array is None:
        if chunking_approach == 'full_space':
            chunk_dims = (
                'time',
            )  # if we need full maps, chunk along the time dimension
        elif chunking_approach == 'full_time':
            chunk_dims = (
                'lat',
                'lon',
            )  # if we need full time series, chunk along the lat/lon dimensions
        else:
            raise NotImplementedError(
                "chunking_approach must be in ['full_space', 'full_time']")
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = calc_auspicious_chunks_dict(zarr_array[example_var],
                                                chunk_dims=chunk_dims)
    else:
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = {
            'time':
            min(template_chunk_array.chunks['time'][0], len(zarr_array.time)),
            'lat':
            min(template_chunk_array.chunks['lat'][0], len(zarr_array.lat)),
            'lon':
            min(template_chunk_array.chunks['lon'][0], len(zarr_array.lon)),
        }
    chunks_dict = {
        'time':
        None,  # write None here because you don't want to rechunk this array
        'lon': None,
        'lat': None,
    }
    for var in zarr_array.data_vars:
        chunks_dict[var] = chunk_def

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    schema_dict = {}
    for var in zarr_array.data_vars:
        schema_dict[var] = DataArraySchema(chunks=chunk_def)
    target_schema = DatasetSchema(schema_dict)

    # make storage patterns
    if output_path is not None:
        output_path = config.get(
            'storage.intermediate.uri') + '/' + output_path
    temp_store, target_store, target_path = make_rechunker_stores(output_path)
    print(f'target path is {target_path}')

    # check and see if the output is empty, if there is content, check that it's chunked correctly
    if len(target_store) > 0:
        print('checking the cache')
        output = xr.open_zarr(target_store)
        try:
            # if the content in target path is correctly chunked, return
            target_schema.validate(output)
            return output

        except SchemaError:
            if overwrite:
                target_store.clear()
            else:
                raise NotImplementedError(
                    'The content in the output path is incorrectly chunked, but overwrite is disabled. '
                    'Either clear the output or enable overwrite by setting overwrite=True.'
                )

    # process the input zarr array
    delete_chunks_encoding(zarr_array)
    try:
        print('checking the chunk')
        # now check if the input is already correctly chunked. If so, save to the output location and return
        target_schema.validate(zarr_array)
        zarr_array.to_zarr(target_store, mode='w', consolidated=True)
        return zarr_array

    except SchemaError:
        print('rechunking')
        try:
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                'WARNING: Failed to write zarr store, perhaps because of variable chunk sizes, trying to rechunk it'
            )
            # clearing the store because the target store has already been created in the try statement above
            # and rechunker fails if there's already content at the target
            target_store.clear()
            zarr_array = zarr_array.chunk(chunks_dict[example_var])
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        rechunked_ds = xr.open_zarr(
            target_store
        )  # ideally we want consolidated=True but it seems that functionality isn't offered in rechunker right now
        # we can just add a consolidate_metadata step here to do it after the fact (once rechunker is done) but only
        # necessary if we'll reopen this rechunked_ds multiple times
        return rechunked_ds
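
A hedged usage sketch for the caching path, assuming a (time, lat, lon) dataset and the project's own config / make_rechunker_stores helpers in place; the input store and output path are illustrative:

import xarray as xr

ds = xr.open_zarr("gs://my-bucket/tasmax_daily.zarr")  # hypothetical input store

# Rechunk so each chunk holds a full time series per small lat/lon block; if
# 'rechunked/tasmax_full_time' already holds correctly chunked data it is
# simply returned from cache instead of being rewritten.
rechunked = rechunk_zarr_array_with_caching(
    ds,
    chunking_approach="full_time",
    output_path="rechunked/tasmax_full_time",
    max_mem="200MB",
    overwrite=False,
)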
Example #7
def rechunk_zarr_array(
        zarr_array: xr.Dataset,
        zarr_array_location: str,
        variable: str,
        chunk_dims: Union[Tuple, dict] = ("time", ),
        max_mem: str = "200MB",
):
    """Use `rechunker` package to adjust chunks of dataset to a form
    conducive for your processing.
    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    zarr_array_location: str
        Path to where the input data is sitting. Only returned/used if zarr_array does not need to be rechunked.
    variable: str
        Name of the data variable in zarr_array that the chunking spec and schema check apply to.
    chunk_dims : Union[Tuple, dict]
        Information for chunking the ds. If a dict is passed, it will rechunk following the sizes specified. The dict should look like:
            {variable: {'lat': chunk_size_lat,
                        'lon': chunk_size_lon,
                        'time': chunk_size_time},
             'lon': None,
             'lat': None,
             'time': None}.
        If a tuple is passed, it is the dimension(s) along which you want to chunk ds, and the optimal chunk sizes will get calculated internally.
    max_mem : str
        The maximum memory you want to allow for a chunk. Around 100 MB is typical, but the effective chunk
        sizes are also shaped by the `calc_auspicious_chunks_dict` calls.
    Returns
    -------
    rechunked_ds, path_tgt : Tuple[xr.Dataset, str]
        Rechunked dataset as well as string of location where it's stored.
    """
    if type(chunk_dims) == tuple:
        chunks_dict = {
            variable:
            calc_auspicious_chunks_dict(zarr_array, chunk_dims=chunk_dims),
            "time":
            None,  # write None here because you don't want to rechunk this array
            "lon":
            None,
            "lat":
            None,
        }
    elif type(chunk_dims) == dict:
        chunks_dict = chunk_dims
        # ensure that the chunks_dict looks the way you want it to, i.e. {variable: {'lat': chunk_size_lat, 'lon': chunk_size_lon, 'time': chunk_size_time},
        # 'lon': None, 'lat': None, 'time': None}
        assert variable in chunks_dict
        for dim in ["lat", "lon", "time"]:
            chunks_dict[dim] = None
            assert dim in chunks_dict[variable]

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    target_schema = DataArraySchema(chunks=chunks_dict[variable])
    try:
        # first confirm that you have a zarr_array_location
        assert zarr_array_location is not None
        target_schema.validate(zarr_array[variable])
        # return back the dataset you introduced, and the path is None since you haven't created a new dataset
        return zarr_array, zarr_array_location
    except (SchemaError, AssertionError):
        delete_chunks_encoding(zarr_array)
        temp_store, target_store, path_tgt = make_rechunker_stores()
        # delete_chunks_encoding(ds) # need to do this before since it wont work on zarr array
        # for some reason doing this on zarr arrays is faster than on xr.open_zarr - it calls `copy_chunk` less.
        # TODO: could switch this to a validation with xarray schema - confirm that the chunks are all uniform and
        # if not, chunk them according to the spec provided by `calc_auspicious_chunks_dict`
        try:
            rechunk_plan = rechunk(zarr_array,
                                   chunks_dict,
                                   max_mem,
                                   target_store,
                                   temp_store=temp_store)
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                "WARNING: Failed to write zarr store, perhaps because of variable chunk sizes, trying to rechunk it"
            )
            # make new stores in case it failed mid-write. alternatively could clean up that store but
            # we don't have delete permission currently
            temp_store, target_store, path_tgt = make_rechunker_stores()
            delete_chunks_encoding(zarr_array)
            # TODO: will always work but need to double check the result and if it's taking a long time
            rechunk_plan = rechunk(
                zarr_array.chunk(chunks_dict[variable]),
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        rechunked_ds = xr.open_zarr(target_store)
        # ideally we want consolidated=True but it seems that functionality isn't offered in rechunker right now
        # we can just add a consolidate_metadata step here to do it after the fact (once rechunker is done) but only
        # necessary if we'll reopen this rechunked_ds multiple times
        return rechunked_ds, path_tgt
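
For the dict form of `chunk_dims`, the structure the function expects (per the docstring and the assertions above) nests a per-dimension mapping under the variable name while the coordinate entries are set to None. A hypothetical example for a variable named "tasmax":

chunk_dims = {
    "tasmax": {"time": 365, "lat": 10, "lon": 10},  # per-dimension chunk sizes for the variable
    "time": None,   # None => leave the coordinate arrays alone
    "lat": None,
    "lon": None,
}

# `ds` and `path` are placeholders for an xr.Dataset containing "tasmax" and its on-disk location.
rechunked_ds, path_tgt = rechunk_zarr_array(
    ds, path, variable="tasmax", chunk_dims=chunk_dims, max_mem="200MB"
)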