Пример #1
0
def test_intermediate_to_target_memory():
    """Plan twice and check that the second plan uses one chunking throughout.

    The first plan goes from the source chunks to the target chunks. The
    second plan starts from the intermediate chunks of the first plan; its
    read, intermediate and write chunks are asserted to be identical.
    """
    itemsize = 4
    max_mem = 12000000000  # 12 GB
    shape = (175320, 721, 1440)
    source_chunks = (24, 721, 1440)
    target_chunks = (21915, 103, 10)

    first_plan = rechunking_plan(
        shape,
        source_chunks,
        target_chunks,
        itemsize,
        max_mem,
        consolidate_reads=True,
    )
    # Only the intermediate chunks of the first plan feed the second plan.
    _, int_chunks, _ = first_plan

    second_plan = rechunking_plan(
        shape,
        int_chunks,
        target_chunks,
        itemsize,
        max_mem,
        consolidate_reads=True,
    )
    read_chunks2, int_chunks2, write_chunks2 = second_plan

    assert read_chunks2 == int_chunks2 == write_chunks2
Пример #2
0
def test_rechunking_plan_hypothesis(inputs):
    """Check a rechunking plan computed from generated ``inputs``.

    ``inputs`` unpacks to ``(shape, source_chunks, target_chunks, max_mem,
    itemsize)``. The test verifies that the source and target chunks fit in
    ``max_mem``, that every planned chunk tuple has one entry per dimension,
    and then delegates to ``_verify_plan_correctness``.
    """
    shape, source_chunks, target_chunks, max_mem, itemsize = inputs

    plan = rechunking_plan(shape, source_chunks, target_chunks, itemsize, max_mem)
    read_chunks, int_chunks, write_chunks = plan

    # The input generator is expected to guarantee that a single source or
    # target chunk already fits in memory.
    for chunks in (source_chunks, target_chunks):
        assert itemsize * prod(chunks) <= max_mem

    ndim = len(shape)
    for planned_chunks in (read_chunks, int_chunks, write_chunks):
        assert len(planned_chunks) == ndim

    _verify_plan_correctness(
        source_chunks,
        read_chunks,
        int_chunks,
        write_chunks,
        target_chunks,
        itemsize,
        max_mem,
    )
Пример #3
0
def test_rechunking_plan_2d(
    shape,
    source_chunks,
    target_chunks,
    itemsize,
    max_mem,
    read_chunks_expected,
    intermediate_chunks_expected,
    write_chunks_expected,
):
    """Compare a computed plan against the expected chunks.

    The read, intermediate and write chunks returned by ``rechunking_plan``
    must equal the corresponding ``*_expected`` arguments, after which the
    plan is passed to ``_verify_plan_correctness``.
    """
    plan = rechunking_plan(shape, source_chunks, target_chunks, itemsize, max_mem)
    read_chunks, int_chunks, write_chunks = plan

    comparisons = (
        (read_chunks, read_chunks_expected),
        (int_chunks, intermediate_chunks_expected),
        (write_chunks, write_chunks_expected),
    )
    for actual, expected in comparisons:
        assert actual == expected

    _verify_plan_correctness(
        shape,
        source_chunks,
        read_chunks,
        int_chunks,
        write_chunks,
        target_chunks,
        itemsize,
        max_mem,
    )
Пример #4
0
def rechunking_plan(
    dim_sizes: Mapping[str, int],
    source_chunks: Mapping[str, int],
    target_chunks: Mapping[str, int],
    itemsize: int,
    max_mem: int,
) -> List[Dict[str, int]]:
  """Make a rechunking plan keyed by dimension name.

  The mappings are flattened into tuples, ordered like the keys of
  ``dim_sizes``, and handed to ``algorithm.rechunking_plan``. Every tuple in
  the returned plan is then turned back into a ``{dimension: size}`` dict.

  Args:
    dim_sizes: size of each dimension; its key order defines the axis order.
    source_chunks: current chunk size for each dimension in ``dim_sizes``.
    target_chunks: desired chunk size for each dimension in ``dim_sizes``.
    itemsize: forwarded unchanged to ``algorithm.rechunking_plan``.
    max_mem: forwarded unchanged to ``algorithm.rechunking_plan``.

  Returns:
    One dict per entry of the plan produced by ``algorithm.rechunking_plan``.

  Raises:
    KeyError: if a dimension of ``dim_sizes`` is missing from
      ``source_chunks`` or ``target_chunks``.
  """
  dims = tuple(dim_sizes)

  def _in_dim_order(chunks):
    # Look every dimension up explicitly so a missing one raises KeyError.
    return tuple(chunks[dim] for dim in dims)

  plan_shapes = algorithm.rechunking_plan(
      shape=tuple(dim_sizes.values()),
      source_chunks=_in_dim_order(source_chunks),
      target_chunks=_in_dim_order(target_chunks),
      itemsize=itemsize,
      max_mem=max_mem,
  )

  plan = []
  for shapes in plan_shapes:
    plan.append({dim: size for dim, size in zip(dims, shapes)})
  return plan
Пример #5
0
def rechunk_zarr2zarr_w_dask(source_array,
                             target_chunks,
                             max_mem,
                             target_store,
                             temp_store=None,
                             source_storage_options=None,
                             temp_storage_options=None,
                             target_storage_options=None):
    """Build a delayed dask task that copies ``source_array`` into new chunks.

    A plan is requested from ``rechunking_plan`` and an empty target array
    with ``target_chunks`` is created in ``target_store``; the attributes of
    ``source_array`` are copied onto it. When the intermediate chunks of the
    plan equal ``target_chunks`` the source is stored straight into the
    target. Otherwise the data is first stored into an intermediate array in
    ``temp_store`` and the two store graphs are spliced into one.

    Parameters
    ----------
    source_array : array exposing ``shape``, ``chunks``, ``dtype`` and
        ``attrs``, readable by ``dsa.from_zarr``.
    target_chunks : chunks of the array created in ``target_store``.
    max_mem : memory limit forwarded to ``rechunking_plan``.
    target_store : store passed to ``zarr.empty`` for the target array.
    temp_store : store for the intermediate array; required for a two step
        plan.
    source_storage_options : dict, optional
        Forwarded to ``dsa.from_zarr`` when reading the source.
    temp_storage_options : dict, optional
        Forwarded to ``dsa.from_zarr`` when reading the intermediate array.
    target_storage_options : dict, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The delayed object from ``dsa.store`` (one step), or a ``Delayed`` built
    on the fused, combined graph (two steps). Nothing is computed here.

    Raises
    ------
    AssertionError
        If a two step plan is needed but ``temp_store`` is ``None``, or if
        the target graph does not contain exactly one ``from-zarr`` string
        key.
    """
    # ``None`` defaults replace the former mutable ``{}`` defaults, which
    # would be shared between calls; omitting the arguments behaves as before.
    if source_storage_options is None:
        source_storage_options = {}
    if temp_storage_options is None:
        temp_storage_options = {}

    shape = source_array.shape
    source_chunks = source_array.chunks
    dtype = source_array.dtype
    itemsize = dtype.itemsize

    read_chunks, int_chunks, write_chunks = rechunking_plan(
        shape, source_chunks, target_chunks, itemsize, max_mem)

    source_read = dsa.from_zarr(source_array,
                                chunks=read_chunks,
                                storage_options=source_storage_options)

    # create target
    target_array = zarr.empty(shape,
                              chunks=target_chunks,
                              dtype=dtype,
                              store=target_store)
    target_array.attrs.update(source_array.attrs)

    if int_chunks == target_chunks:
        target_store_delayed = dsa.store(source_read,
                                         target_array,
                                         lock=False,
                                         compute=False)
        print("One step rechunking plan")
        return target_store_delayed

    # do intermediate store
    assert temp_store is not None, (
        "temp_store is required for a two step rechunking plan")
    int_array = zarr.empty(shape,
                           chunks=int_chunks,
                           dtype=dtype,
                           store=temp_store)
    intermediate_store_delayed = dsa.store(source_read,
                                           int_array,
                                           lock=False,
                                           compute=False)

    int_read = dsa.from_zarr(int_array,
                             chunks=write_chunks,
                             storage_options=temp_storage_options)
    target_store_delayed = dsa.store(int_read,
                                     target_array,
                                     lock=False,
                                     compute=False)

    # now do some hacking to chain these together into a single graph.
    # get the two graphs as dicts
    int_dsk = dask.utils.ensure_dict(intermediate_store_delayed.dask)
    target_dsk = dask.utils.ensure_dict(target_store_delayed.dask)

    # find the root store key representing the read
    root_keys = [
        key for key in target_dsk
        if isinstance(key, str) and key.startswith('from-zarr')
    ]
    assert len(root_keys) == 1
    root_key = root_keys[0]

    # now rewrite the graph: the read of the intermediate array becomes a
    # task that returns its original value and ignores the extra arguments
    # taken from the intermediate store task.
    # NOTE(review): presumably the extra arguments exist only so that the
    # second stage depends on the first stage's writes -- confirm.
    target_dsk[root_key] = (lambda a, *b: a, target_dsk[root_key],
                            *int_dsk[intermediate_store_delayed.key])
    target_dsk.update(int_dsk)

    # fuse
    dsk_fused, _deps = fuse(target_dsk)
    delayed_fused = Delayed(target_store_delayed.key, dsk_fused)

    print("Two step rechunking plan")
    return delayed_fused
Пример #6
0
def _setup_array_rechunk(
    source_array,
    target_chunks,
    max_mem,
    target_store_or_group,
    target_options=None,
    temp_store_or_group=None,
    temp_options=None,
    name=None,
) -> CopySpec:
    """Plan the rechunking of one array and create its output arrays.

    Both option dicts are validated, a plan is obtained from
    ``rechunking_plan``, and an empty target array is created. An empty
    intermediate array is created only when the plan's read and write chunks
    differ. No data is copied here.

    Parameters
    ----------
    source_array : dask array or array exposing ``chunks``; must provide
        ``shape`` and ``dtype``.
    target_chunks : tuple, dict or None
        ``None`` keeps the source chunks. A dict is converted to a tuple
        using the dimensions from ``_get_dims_from_zarr_array``.
    max_mem : value accepted by ``dask.utils.parse_bytes``.
    target_store_or_group : destination passed to ``_zarr_empty``.
    target_options : dict, optional
        Extra keyword arguments for creating the target array.
    temp_store_or_group : destination of the intermediate array; required
        when read and write chunks differ.
    temp_options : dict, optional
        Extra keyword arguments for creating the intermediate array.
    name : forwarded to ``_zarr_empty`` for both arrays.

    Returns
    -------
    CopySpec
        Built from the read, intermediate and write ``ArrayProxy`` objects.

    Raises
    ------
    KeyError
        If a dict ``target_chunks`` cannot be converted because
        ``_shape_dict_to_tuple`` raised ``KeyError``.
    AssertionError
        If an intermediate array is needed but ``temp_store_or_group`` is
        ``None``.
    """
    for options in (target_options, temp_options):
        _validate_options(options)

    shape = source_array.shape
    if isinstance(source_array, dask.array.Array):
        source_chunks = source_array.chunksize
    else:
        source_chunks = source_array.chunks
    dtype = source_array.dtype
    itemsize = dtype.itemsize

    # No requested chunks means a pass-through copy with the source chunks.
    if target_chunks is None:
        target_chunks = source_chunks

    if isinstance(target_chunks, dict):
        array_dims = _get_dims_from_zarr_array(source_array)
        try:
            target_chunks = _shape_dict_to_tuple(array_dims, target_chunks)
        except KeyError:
            raise KeyError(
                "You must explicitly specify each dimension size in target_chunks. "
                f"Got array_dims {array_dims}, target_chunks {target_chunks}."
            )

    # TODO: rewrite to avoid the hard dependency on dask
    max_mem_bytes = dask.utils.parse_bytes(max_mem)

    # Reads are consolidated only for zarr sources, not for dask arrays.
    source_is_zarr = isinstance(source_array, zarr.core.Array)
    read_chunks, int_chunks, write_chunks = rechunking_plan(
        shape,
        source_chunks,
        target_chunks,
        itemsize,
        max_mem_bytes,
        consolidate_reads=source_is_zarr,
    )

    # Plain python ints are required so the values can be serialized.
    shape = tuple(map(int, shape))
    target_chunks = tuple(map(int, target_chunks))
    int_chunks = tuple(map(int, int_chunks))
    write_chunks = tuple(map(int, write_chunks))

    target_kwargs = target_options or {}
    target_array = _zarr_empty(
        shape,
        target_store_or_group,
        target_chunks,
        dtype,
        name=name,
        **target_kwargs,
    )
    # Best effort: sources without ``attrs`` are simply skipped.
    try:
        target_array.attrs.update(source_array.attrs)
    except AttributeError:
        pass

    single_step = read_chunks == write_chunks
    int_array = None
    if not single_step:
        # An intermediate store is needed between reading and writing.
        assert temp_store_or_group is not None
        temp_kwargs = temp_options or {}
        int_array = _zarr_empty(
            shape,
            temp_store_or_group,
            int_chunks,
            dtype,
            name=name,
            **temp_kwargs,
        )

    return CopySpec(
        ArrayProxy(source_array, read_chunks),
        ArrayProxy(int_array, int_chunks),
        ArrayProxy(target_array, write_chunks),
    )
Пример #7
0
def _rechunk_array(
    source_array,
    target_chunks,
    max_mem,
    target_store_or_group,
    temp_store_or_group=None,
    name=None,
    source_storage_options=None,
    temp_storage_options=None,
    target_storage_options=None,
):
    """Build a delayed dask task that rechunks a single array.

    A plan is requested from ``rechunking_plan`` and an empty target array is
    created with ``_zarr_empty``; the attributes of ``source_array`` are
    copied onto it. When the plan's read and write chunks are equal the
    source is stored straight into the target. Otherwise the data is first
    stored into an intermediate array and the two store graphs are spliced
    into one.

    Parameters
    ----------
    source_array : array exposing ``shape``, ``chunks``, ``dtype`` and
        ``attrs``, readable by ``dsa.from_zarr``.
    target_chunks : tuple, dict or None
        ``None`` keeps the source chunks. A dict is converted to a tuple
        using the dimensions from ``_get_dims_from_zarr_array``.
    max_mem : memory limit forwarded to ``rechunking_plan``.
    target_store_or_group : destination passed to ``_zarr_empty``.
    temp_store_or_group : destination of the intermediate array; required
        when read and write chunks differ.
    name : forwarded to ``_zarr_empty`` for both arrays.
    source_storage_options : dict, optional
        Forwarded to ``dsa.from_zarr`` when reading the source.
    temp_storage_options : dict, optional
        Forwarded to ``dsa.from_zarr`` when reading the intermediate array.
    target_storage_options : dict, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The delayed object from ``dsa.store`` (one step), or a ``Delayed`` built
    on the fused, combined graph (two steps). Nothing is computed here.

    Raises
    ------
    KeyError
        If a dict ``target_chunks`` cannot be converted because
        ``_shape_dict_to_tuple`` raised ``KeyError``.
    AssertionError
        If an intermediate array is needed but ``temp_store_or_group`` is
        ``None``, or if the target graph does not contain exactly one
        ``from-zarr`` string key.
    """
    # ``None`` defaults replace the former mutable ``{}`` defaults, which
    # would be shared between calls; omitting the arguments behaves as before.
    if source_storage_options is None:
        source_storage_options = {}
    if temp_storage_options is None:
        temp_storage_options = {}

    shape = source_array.shape
    source_chunks = source_array.chunks
    dtype = source_array.dtype
    itemsize = dtype.itemsize

    if target_chunks is None:
        # this is just a pass-through copy
        target_chunks = source_chunks

    if isinstance(target_chunks, dict):
        array_dims = _get_dims_from_zarr_array(source_array)
        try:
            target_chunks = _shape_dict_to_tuple(array_dims, target_chunks)
        except KeyError as err:
            raise KeyError(
                "You must explicitly specify each dimension size in target_chunks. "
                f"Got array_dims {array_dims}, target_chunks {target_chunks}."
            ) from err

    read_chunks, int_chunks, write_chunks = rechunking_plan(
        shape, source_chunks, target_chunks, itemsize, max_mem)

    print(source_chunks, read_chunks, int_chunks, write_chunks, target_chunks)

    source_read = dsa.from_zarr(source_array,
                                chunks=read_chunks,
                                storage_options=source_storage_options)

    # create target
    # ensure python ints for serialization
    shape = tuple(int(x) for x in shape)
    target_chunks = tuple(int(x) for x in target_chunks)
    int_chunks = tuple(int(x) for x in int_chunks)
    write_chunks = tuple(int(x) for x in write_chunks)

    target_array = _zarr_empty(shape,
                               target_store_or_group,
                               target_chunks,
                               dtype,
                               name=name)
    target_array.attrs.update(source_array.attrs)

    if read_chunks == write_chunks:
        target_store_delayed = dsa.store(source_read,
                                         target_array,
                                         lock=False,
                                         compute=False)
        return target_store_delayed

    # do intermediate store
    assert temp_store_or_group is not None, (
        "temp_store_or_group is required for a two step rechunking plan")
    int_array = _zarr_empty(shape,
                            temp_store_or_group,
                            int_chunks,
                            dtype,
                            name=name)
    intermediate_store_delayed = dsa.store(source_read,
                                           int_array,
                                           lock=False,
                                           compute=False)

    int_read = dsa.from_zarr(int_array,
                             chunks=write_chunks,
                             storage_options=temp_storage_options)
    target_store_delayed = dsa.store(int_read,
                                     target_array,
                                     lock=False,
                                     compute=False)

    # now do some hacking to chain these together into a single graph.
    # get the two graphs as dicts
    int_dsk = dask.utils.ensure_dict(intermediate_store_delayed.dask)
    target_dsk = dask.utils.ensure_dict(target_store_delayed.dask)

    # find the root store key representing the read
    root_keys = [
        key for key in target_dsk
        if isinstance(key, str) and key.startswith("from-zarr")
    ]
    assert len(root_keys) == 1
    root_key = root_keys[0]

    # now rewrite the graph: the read of the intermediate array becomes a
    # task that returns its original value and ignores the extra arguments
    # taken from the intermediate store task.
    # NOTE(review): presumably the extra arguments exist only so that the
    # second stage depends on the first stage's writes -- confirm.
    target_dsk[root_key] = (
        lambda a, *b: a,
        target_dsk[root_key],
        *int_dsk[intermediate_store_delayed.key],
    )
    target_dsk.update(int_dsk)

    # fuse
    dsk_fused, _deps = fuse(target_dsk)
    delayed_fused = Delayed(target_store_delayed.key, dsk_fused)

    print("Two step rechunking plan")
    return delayed_fused