Example #1
def test_concatenate_and_rechunk__shape_mismatch():
    z1 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4")
    z2 = zarr.zeros((5, 4), chunks=(2, 4), dtype="i4")
    zarrs = [z1, z2]

    with pytest.raises(ValueError, match="Zarr arrays must have matching shapes"):
        concatenate_and_rechunk(zarrs)
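The matched error comes from a shape guard inside concatenate_and_rechunk. A minimal sketch of such a check, assuming concatenation along axis 0 (hypothetical, not the actual sgkit implementation):

from typing import Sequence
import zarr

def _check_shapes(zarrs: Sequence[zarr.Array]) -> None:
    # Hypothetical guard: every dimension except the concatenation axis
    # (axis 0) must match across all input arrays.
    shapes = [z.shape for z in zarrs]
    if len(set(shape[1:] for shape in shapes)) > 1:
        raise ValueError(f"Zarr arrays must have matching shapes: {shapes}")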
Example #2
def test_concatenate_and_rechunk__1d():
    z1 = zarr.zeros(5, chunks=2, dtype="i4")
    z1[:] = np.arange(5)

    z2 = zarr.zeros(5, chunks=2, dtype="i4")
    z2[:] = np.arange(5, 10)

    zarrs = [z1, z2]

    out = concatenate_and_rechunk(zarrs)

    assert out.chunks == ((2, 2, 2, 2, 2),)
    np.testing.assert_array_equal(out.compute(), np.arange(10))
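For intuition, plain dask shows the same normalization: each 5-element input has ragged chunks (2, 2, 1), yet rechunking the 10-element concatenation restores a uniform chunk size of 2. This is a plain-dask illustration, not the sgkit implementation:

import dask.array as da
import numpy as np

a = da.from_array(np.arange(5), chunks=2)      # chunks: (2, 2, 1)
b = da.from_array(np.arange(5, 10), chunks=2)  # chunks: (2, 2, 1)
out = da.concatenate([a, b]).rechunk(2)        # (2, 2, 1, 2, 2, 1) -> uniform
assert out.chunks == ((2, 2, 2, 2, 2),)
np.testing.assert_array_equal(out.compute(), np.arange(10))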
Example #3
def test_concatenate_and_rechunk__2d():
    z1 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4")
    z1[:] = np.arange(15).reshape(5, 3)

    z2 = zarr.zeros((5, 3), chunks=(2, 3), dtype="i4")
    z2[:] = np.arange(15, 30).reshape(5, 3)

    zarrs = [z1, z2]

    out = concatenate_and_rechunk(zarrs)

    assert out.chunks == ((2, 2, 2, 2, 2), (3,))
    np.testing.assert_array_equal(out.compute(), np.arange(30).reshape(10, 3))
Example #4
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(str(output), mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to
    # avoid race conditions between writing the array contents and its metadata;
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"

        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(str(output)) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        output_zarr.attrs.update(first_zarr_group.attrs)
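The _to_zarr helper referenced in the NOTE is not shown here. Based on the linked PR's description, a sketch of its shape might look as follows; the signature and details are assumptions, not the actual sgkit code. The key idea is to create the target array and write its attributes eagerly, so the metadata write cannot race with the chunk writes that dask schedules:

from typing import Any, Dict, Optional
import dask.array as da
import zarr

def _to_zarr(arr: da.Array, url: str, component: str, overwrite: bool = False,
             compute: bool = True, fill_value: Any = None,
             attrs: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
    # Sketch only (assumed from the PR linked above): create the zarr array
    # and set its attributes up front, on the client...
    z = zarr.create(shape=arr.shape, chunks=arr.chunksize, dtype=arr.dtype,
                    store=url, path=component, overwrite=overwrite,
                    fill_value=fill_value, **kwargs)
    if attrs:
        z.attrs.update(attrs)
    # ...then hand only the chunk writes to dask, so array contents and
    # metadata are never written concurrently.
    return arr.store(z, lock=False, compute=compute)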
Example #5
def test_concatenate_and_rechunk__tiny_file():
    z1 = zarr.zeros(4, chunks=3, dtype="i4")
    z1[:] = np.arange(4)

    # this zarr array lies entirely within the second chunk
    z2 = zarr.zeros(1, chunks=3, dtype="i4")
    z2[:] = np.arange(4, 5)

    z3 = zarr.zeros(5, chunks=3, dtype="i4")
    z3[:] = np.arange(5, 10)

    zarrs = [z1, z2, z3]

    out = concatenate_and_rechunk(zarrs)

    assert out.chunks == ((3, 3, 3, 1),)
    np.testing.assert_array_equal(out.compute(), np.arange(10))
Example #6
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    with zarr.open_group(str(output)) as output_zarr:

        var_to_attrs = {}  # attributes to copy
        delayed = []  # do all the rechunking operations in one computation
        for var in vars_to_rechunk:
            var_to_attrs[var] = first_zarr_group[var].attrs.asdict()
            dtype = None
            if var == "variant_id":
                max_len = _get_max_len(zarr_groups, "max_variant_id_length")
                dtype = f"S{max_len}"
            elif var == "variant_allele":
                max_len = _get_max_len(zarr_groups,
                                       "max_variant_allele_length")
                dtype = f"S{max_len}"

            arr = concatenate_and_rechunk(
                [group[var] for group in zarr_groups], dtype=dtype)
            d = arr.to_zarr(
                str(output),
                component=var,
                overwrite=True,
                compute=False,
                fill_value=None,
            )
            delayed.append(d)
        da.compute(*delayed)

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy attributes
        output_zarr.attrs.update(first_zarr_group.attrs)
        for (var, attrs) in var_to_attrs.items():
            output_zarr[var].attrs.update(attrs)
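_get_max_len is likewise not shown in these examples. From its call sites it plausibly reads a per-file length attribute and takes the maximum across all input groups; a sketch under that assumption:

from typing import List
import zarr

def _get_max_len(zarr_groups: List[zarr.Group], attr_name: str) -> int:
    # Assumed behaviour, inferred from the call sites above: each input group
    # records e.g. max_variant_id_length in its attrs, and the concatenated
    # output must accommodate the longest string seen in any input.
    return max(group.attrs[attr_name] for group in zarr_groups)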
Example #7
def concat_zarrs_optimized(
    zarr_files: Sequence[str],
    output: Union[PathType, MutableMapping[str, bytes]],
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
    fix_strings: bool = False,
) -> None:
    if isinstance(output, Path):
        output = str(output)

    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(output, mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to
    # avoid race conditions between writing the array contents and its metadata;
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if fix_strings and var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk([group[var] for group in zarr_groups],
                                      dtype=dtype)

        _to_zarr_kwargs = dict(
            compressor=first_zarr_group[var].compressor,
            filters=first_zarr_group[var].filters,
            fill_value=None,
        )
        if not fix_strings and arr.dtype == "O":
            # We assume that all object dtypes are variable length strings
            var_len_str_codec = numcodecs.VLenUTF8()
            _to_zarr_kwargs["object_codec"] = var_len_str_codec
            # Remove from filters to avoid double encoding error
            if var_len_str_codec in first_zarr_group[var].filters:
                filters = list(first_zarr_group[var].filters)
                filters.remove(var_len_str_codec)
                _to_zarr_kwargs["filters"] = filters

        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            output,
            component=var,
            overwrite=True,
            compute=False,
            attrs=first_zarr_group[var].attrs.asdict(),
            **_to_zarr_kwargs,
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(output) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        group_attrs = dict(first_zarr_group.attrs)
        if "max_alt_alleles_seen" in group_attrs:
            max_alt_alleles_seen = _get_max_len(zarr_groups,
                                                "max_alt_alleles_seen")
            group_attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
        output_zarr.attrs.update(group_attrs)

    # consolidate metadata
    zarr.consolidate_metadata(output)
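A usage sketch, with made-up part-file paths and variable names for illustration:

# Hypothetical call: merge per-chunk zarr stores into a single output store.
concat_zarrs_optimized(
    zarr_files=["part-0.zarr", "part-1.zarr"],
    output="combined.zarr",
    vars_to_rechunk=["variant_position", "call_genotype"],
    vars_to_copy=["sample_id"],
    fix_strings=True,
)
# The final consolidate_metadata call lets readers open the result with a
# single metadata read, e.g. zarr.open_consolidated("combined.zarr").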