def create_array(read_only=False, **kwargs):
    # Array backed by a Berkeley DB B-tree DBMStore; the temporary
    # file is removed at interpreter exit.
    path = mktemp(suffix='.dbm')
    atexit.register(os.remove, path)
    store = DBMStore(path, flag='n', open=bsddb3.btopen)
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
def chromsizes_tsv_to_zarr(input, output, has_header):
    df = pd.read_csv(input, header=(0 if has_header else None), sep='\t')
    num_chroms = df.shape[0]
    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values
    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)
    z.create_dataset("names", shape=(num_chroms,), dtype=f"S{max_name_len}",
                     compressor=compressor)
    z.create_dataset("sizes", shape=(num_chroms,), dtype="u4",
                     compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
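# Usage sketch (not from the original source; both file names are
# hypothetical): convert a headerless two-column "<name>\t<size>" TSV
# into a Zarr store with "names" and "sizes" datasets.
chromsizes_tsv_to_zarr("hg38.chrom.sizes", "hg38_chromsizes.zarr",
                       has_header=False)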
def create_array(read_only=False, **kwargs):
    # Array backed by a NestedDirectoryStore in a temporary directory.
    path = mkdtemp()
    atexit.register(shutil.rmtree, path)
    store = NestedDirectoryStore(path)
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
def chromsizes_negspy_to_zarr(assembly, output, has_header):
    # Note: has_header is accepted for signature parity with
    # chromsizes_tsv_to_zarr but is not used in this body.
    chrom_order = nc.get_chromorder(assembly)
    chrom_info = nc.get_chrominfo(assembly)
    chrom_rows = [{0: chrom_name, 1: chrom_info.chrom_lengths[chrom_name]}
                  for chrom_name in chrom_order]
    df = pd.DataFrame(columns=[0, 1], data=chrom_rows)

    num_chroms = df.shape[0]
    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values
    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)
    z.create_dataset("names", shape=(num_chroms,), dtype=f"S{max_name_len}",
                     compressor=compressor)
    z.create_dataset("sizes", shape=(num_chroms,), dtype="u4",
                     compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
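# Usage sketch (hypothetical output path): the negspy variant takes an
# assembly name instead of a TSV path.
chromsizes_negspy_to_zarr("hg38", "hg38_chromsizes.zarr", has_header=False)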
def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
    """Produce Zarr metadata for all groups and datasets in the HDF5 file."""
    if isinstance(h5obj, h5py.Dataset):
        lggr.debug(f'HDF5 dataset: {h5obj.name}')
        if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT:
            # The original created the exception without raising it; raise
            # so unsupported layouts fail loudly like the filter check below.
            raise RuntimeError(
                f'Compact HDF5 datasets not yet supported: <{h5obj.name} '
                f'{h5obj.shape} {h5obj.dtype} {h5obj.nbytes} bytes>')
        if (h5obj.scaleoffset or h5obj.fletcher32 or
                h5obj.compression in ('szip', 'lzf')):
            raise RuntimeError(
                f'{h5obj.name} uses unsupported HDF5 filters')
        if h5obj.compression == 'gzip':
            compression = Zlib(level=h5obj.compression_opts)
        else:
            compression = None

        # Add filter for shuffle
        filters = []
        if h5obj.shuffle:
            filters.append(Shuffle(elementsize=h5obj.dtype.itemsize))

        # Get storage info of this HDF5 dataset...
        cinfo = self._storage_info(h5obj)
        if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
            return

        # Create a Zarr array equivalent to this HDF5 dataset...
        za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                        dtype=h5obj.dtype,
                                        chunks=h5obj.chunks or False,
                                        fill_value=h5obj.fillvalue,
                                        compression=compression,
                                        filters=filters,
                                        overwrite=True)
        lggr.debug(f'Created Zarr array: {za}')
        self._transfer_attrs(h5obj, za)

        if self._xr:
            # Do this for xarray...
            adims = self._get_array_dims(h5obj)
            za.attrs['_ARRAY_DIMENSIONS'] = adims
            lggr.debug(f'_ARRAY_DIMENSIONS = {adims}')

        # Store chunk location metadata...
        if cinfo:
            for k, v in cinfo.items():
                self.store[za._chunk_key(k)] = [self._uri, v['offset'],
                                                v['size']]

    elif isinstance(h5obj, h5py.Group):
        lggr.debug(f'HDF5 group: {h5obj.name}')
        zgrp = self._zroot.create_group(h5obj.name)
        self._transfer_attrs(h5obj, zgrp)
def create_array(read_only=False, **kwargs):
    # Array backed by an LMDBStore; skip the test when lmdb is unavailable.
    path = mktemp(suffix='.lmdb')
    atexit_rmtree(path)
    try:
        store = LMDBStore(path, buffers=False)
    except ImportError:  # pragma: no cover
        raise SkipTest('lmdb not installed')
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
def create_array(read_only=False, **kwargs):
    store = dict()
    dtype = kwargs.get('dtype', None)
    filters = [
        Delta(dtype=dtype),
        FixedScaleOffset(dtype=dtype, scale=1, offset=0),
    ]
    kwargs.setdefault('filters', filters)
    compressor = Zlib(1)
    kwargs.setdefault('compressor', compressor)
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
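# A standalone sketch (assuming only that zarr and numcodecs are installed)
# of the same Delta + FixedScaleOffset + Zlib pipeline driven directly
# through zarr.array: filters transform each chunk before the compressor
# sees it, and decoding reverses the chain, so data round-trips exactly.
import numpy as np
import zarr
from numcodecs import Delta, FixedScaleOffset, Zlib

data = np.arange(100, dtype='i8')
z = zarr.array(data, chunks=10,
               filters=[Delta(dtype='i8'),
                        FixedScaleOffset(dtype='i8', scale=1, offset=0)],
               compressor=Zlib(1))
assert (z[:] == data).all()  # round-trip through filters + compression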
def prep_source(source):
    foo = source.create_group('foo')
    foo.attrs['experiment'] = 'weird science'
    baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,))
    baz.attrs['units'] = 'metres'
    # request.param is supplied by the enclosing parametrized pytest fixture
    # and selects between h5py- and zarr-style creation keywords.
    if request.param == 'hdf5':
        extra_kws = dict(compression='gzip', compression_opts=3, fillvalue=84,
                         shuffle=True, fletcher32=True)
    else:
        extra_kws = dict(compressor=Zlib(3), order='F', fill_value=42,
                         filters=[Adler32()])
    source.create_dataset('spam', data=np.arange(100, 200).reshape(20, 5),
                          chunks=(10, 2), dtype='i2', **extra_kws)
    return source
def test_copy_array_create_options(self, source, dest):
    dest_h5py = dest.__module__.startswith('h5py.')

    # copy array, provide creation options
    compressor = Zlib(9)
    create_kws = dict(chunks=(10,))
    if dest_h5py:
        create_kws.update(compression='gzip', compression_opts=9,
                          shuffle=True, fletcher32=True, fillvalue=42)
    else:
        # v3 case has no filters argument in zarr create_kws
        create_kws.update(compressor=compressor, fill_value=42, order='F')
    copy(source['foo/bar/baz'], dest, without_attrs=True, **create_kws)
    check_copied_array(source['foo/bar/baz'], dest['baz'],
                       without_attrs=True, expect_props=create_kws)
def test_copy_array_create_options(self):
    source = self.source
    dest = self.new_dest()

    # copy array, provide creation options
    compressor = Zlib(9)
    create_kws = dict(chunks=(10,))
    if self.dest_h5py:
        create_kws.update(compression='gzip', compression_opts=9,
                          shuffle=True, fletcher32=True, fillvalue=42)
    else:
        create_kws.update(compressor=compressor, fill_value=42,
                          order='F', filters=[Adler32()])
    copy(source['foo/bar/baz'], dest, without_attrs=True, **create_kws)
    check_copied_array(source['foo/bar/baz'], dest['baz'],
                       without_attrs=True, expect_props=create_kws)
def translator(self, name, h5obj):
    """Produce Zarr metadata for all groups and datasets in the HDF5 file."""
    if isinstance(h5obj, h5py.Dataset):
        lggr.debug(f'Dataset: {h5obj.name}')
        if (h5obj.scaleoffset or h5obj.fletcher32 or h5obj.shuffle or
                h5obj.compression in ('szip', 'lzf')):
            raise RuntimeError(
                f'{h5obj.name} uses unsupported HDF5 filters')
        if h5obj.compression == 'gzip':
            compression = Zlib(level=h5obj.compression_opts)
        else:
            compression = None

        # Get storage info of this HDF5 dataset...
        cinfo = self.storage_info(h5obj)
        if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
            return

        # Create a Zarr array equivalent to this HDF5 dataset...
        za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                        dtype=h5obj.dtype,
                                        chunks=h5obj.chunks or False,
                                        fill_value=h5obj.fillvalue,
                                        compression=compression,
                                        overwrite=True)
        lggr.debug(f'Created Zarr array: {za}')
        self.transfer_attrs(h5obj, za)

        if self._xr:
            # Do this for xarray...
            adims = self._get_array_dims(h5obj)
            za.attrs['_ARRAY_DIMENSIONS'] = adims
            lggr.debug(f'_ARRAY_DIMENSIONS = {adims}')

        # Store chunk location metadata...
        if cinfo:
            cinfo['source'] = {'uri': self._uri, 'array_name': h5obj.name}
            FileChunkStore.chunks_info(za, cinfo)

    elif isinstance(h5obj, h5py.Group):
        lggr.debug(f'Group: {h5obj.name}')
        zgrp = self._zroot.create_group(h5obj.name)
        self.transfer_attrs(h5obj, zgrp)
import numpy as np
from numcodecs import Zlib
import zarr

if __name__ == "__main__":
    arr = np.arange(3 * 12 * 6).reshape(3, 12, 6)
    z = zarr.open(
        "dummy_data.zarr",
        mode="w",
        shape=arr.shape,
        compressor=Zlib(level=1),
        chunks=(3, 3, 3),
        dtype="<i4",
    )
    z[:, :, :] = arr
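# A quick read-back check of the store written above (assuming the script
# has run): in read mode zarr.open returns an Array whose compressor
# metadata is round-tripped from the write.
import zarr

z = zarr.open("dummy_data.zarr", mode="r")
assert z.shape == (3, 12, 6)
assert z.compressor.codec_id == "zlib"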
def test_create_dataset(self):
    g = self.create_group()

    # create as immediate child
    d1 = g.create_dataset('foo', shape=1000, chunks=100)
    assert isinstance(d1, Array)
    assert (1000,) == d1.shape
    assert (100,) == d1.chunks
    assert 'foo' == d1.path
    assert '/foo' == d1.name
    assert g.store is d1.store

    # create as descendant
    d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1',
                          compression='zlib', compression_opts=9,
                          fill_value=42, order='F')
    assert isinstance(d2, Array)
    assert (2000,) == d2.shape
    assert (200,) == d2.chunks
    assert np.dtype('i1') == d2.dtype
    assert 'zlib' == d2.compressor.codec_id
    assert 9 == d2.compressor.level
    assert 42 == d2.fill_value
    assert 'F' == d2.order
    assert 'a/b/c' == d2.path
    assert '/a/b/c' == d2.name
    assert g.store is d2.store

    # create with data
    data = np.arange(3000, dtype='u2')
    d3 = g.create_dataset('bar', data=data, chunks=300)
    assert isinstance(d3, Array)
    assert (3000,) == d3.shape
    assert (300,) == d3.chunks
    assert np.dtype('u2') == d3.dtype
    assert_array_equal(data, d3[:])
    assert 'bar' == d3.path
    assert '/bar' == d3.name
    assert g.store is d3.store

    # compression arguments handling follows...

    # compression_opts as dict
    d = g.create_dataset('aaa', shape=1000, dtype='u1',
                         compression='blosc',
                         compression_opts=dict(cname='zstd', clevel=1,
                                               shuffle=2))
    assert d.compressor.codec_id == 'blosc'
    assert 'zstd' == d.compressor.cname
    assert 1 == d.compressor.clevel
    assert 2 == d.compressor.shuffle

    # compression_opts as sequence
    d = g.create_dataset('bbb', shape=1000, dtype='u1',
                         compression='blosc',
                         compression_opts=('zstd', 1, 2))
    assert d.compressor.codec_id == 'blosc'
    assert 'zstd' == d.compressor.cname
    assert 1 == d.compressor.clevel
    assert 2 == d.compressor.shuffle

    # None compression_opts
    d = g.create_dataset('ccc', shape=1000, dtype='u1', compression='zlib')
    assert d.compressor.codec_id == 'zlib'
    assert 1 == d.compressor.level

    # None compression
    d = g.create_dataset('ddd', shape=1000, dtype='u1', compression=None)
    assert d.compressor is None

    # compressor as compression
    d = g.create_dataset('eee', shape=1000, dtype='u1', compression=Zlib(1))
    assert d.compressor.codec_id == 'zlib'
    assert 1 == d.compressor.level
def test_create_dataset(self):
    g = self.create_group()

    # create as immediate child
    d1 = g.create_dataset('foo', shape=1000, chunks=100)
    assert_is_instance(d1, Array)
    eq((1000,), d1.shape)
    eq((100,), d1.chunks)
    eq('foo', d1.path)
    eq('/foo', d1.name)
    assert_is(g.store, d1.store)

    # create as descendant
    d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1',
                          compression='zlib', compression_opts=9,
                          fill_value=42, order='F')
    assert_is_instance(d2, Array)
    eq((2000,), d2.shape)
    eq((200,), d2.chunks)
    eq(np.dtype('i1'), d2.dtype)
    eq('zlib', d2.compressor.codec_id)
    eq(9, d2.compressor.level)
    eq(42, d2.fill_value)
    eq('F', d2.order)
    eq('a/b/c', d2.path)
    eq('/a/b/c', d2.name)
    assert_is(g.store, d2.store)

    # create with data
    data = np.arange(3000, dtype='u2')
    d3 = g.create_dataset('bar', data=data, chunks=300)
    assert_is_instance(d3, Array)
    eq((3000,), d3.shape)
    eq((300,), d3.chunks)
    eq(np.dtype('u2'), d3.dtype)
    assert_array_equal(data, d3[:])
    eq('bar', d3.path)
    eq('/bar', d3.name)
    assert_is(g.store, d3.store)

    # compression arguments handling follows...

    # compression_opts as dict
    d = g.create_dataset('aaa', shape=1000, dtype='u1',
                         compression='blosc',
                         compression_opts=dict(cname='zstd', clevel=1,
                                               shuffle=2))
    eq(d.compressor.codec_id, 'blosc')
    eq('zstd', d.compressor.cname)
    eq(1, d.compressor.clevel)
    eq(2, d.compressor.shuffle)

    # compression_opts as sequence
    d = g.create_dataset('bbb', shape=1000, dtype='u1',
                         compression='blosc',
                         compression_opts=('zstd', 1, 2))
    eq(d.compressor.codec_id, 'blosc')
    eq('zstd', d.compressor.cname)
    eq(1, d.compressor.clevel)
    eq(2, d.compressor.shuffle)

    # None compression_opts
    d = g.create_dataset('ccc', shape=1000, dtype='u1', compression='zlib')
    eq(d.compressor.codec_id, 'zlib')
    eq(1, d.compressor.level)

    # None compression
    d = g.create_dataset('ddd', shape=1000, dtype='u1', compression=None)
    assert_is_none(d.compressor)

    # compressor as compression
    d = g.create_dataset('eee', shape=1000, dtype='u1', compression=Zlib(1))
    eq(d.compressor.codec_id, 'zlib')
    eq(1, d.compressor.level)
                overwrite=True)
    group.array(name='30x20_c_>f4', dtype='>f4', data=array_30x20_c,
                chunks=(7, 13), overwrite=True)
    group.array(name='30x20_f_>f4', dtype='>f4', data=array_30x20_f,
                chunks=(7, 13), order='F', overwrite=True)
    group.array(name='30x20_c_>u8_zlib', dtype='>u8',
                compressor=Zlib(level=6), data=array_30x20_c,
                chunks=(7, 13), overwrite=True)
    group.array(name='30x20_c_>u8_gzip', dtype='>u8',
                compressor=GZip(level=6), data=array_30x20_c,
                chunks=(7, 13), overwrite=True)
    group.array(name='30x20_c_>u8_bz2', dtype='>u8',
                compressor=BZ2(level=1), data=array_30x20_c,
                chunks=(7, 13), overwrite=True)
# add groups
# (root_grp, data, Zlib and Shuffle come from earlier cells of the
# notebook-exported script this snippet was extracted from)
compressed_grp = root_grp.create_group('compressed', overwrite=True)
filtered_grp = root_grp.create_group('filtered', overwrite=True)
comp_filt_grp = root_grp.create_group('comp_filt', overwrite=True)

# add compressed data arrays (no filters)

# deflate
a = compressed_grp.create_dataset('deflate1', shape=(200, 200),
                                  chunks=(50, 50), dtype='i4',
                                  overwrite=True, compressor=Zlib(level=1))
a[:] = data
# level=9 to match the dataset name (the original said level=0,
# which is no compression)
a = compressed_grp.create_dataset('deflate9', shape=(200, 200),
                                  chunks=(50, 50), dtype='i4',
                                  overwrite=True, compressor=Zlib(level=9))
a[:] = data

# shuffle
a = compressed_grp.create_dataset('shuffle', shape=(200, 200),
                                  chunks=(50, 50), dtype='i4',
                                  overwrite=True, compressor=Shuffle())
def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution,
                    name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)
        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution), shape=chr_shape,
                                     dtype="f4", fill_value=np.nan,
                                     compressor=compressor)

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution
                # of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples,
                                 math.ceil(chr_len / resolution))
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len,
                                    chr_shape[1], summary="sum")
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :] = arr
        else:
            print(f"{bw_file} not is_bigwig")
        max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file)
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [
        {
            "version": "0.1",
            "name": chr_name,
            "datasets": [{"path": f"chromosomes/{chr_name}/{resolution}"}
                         for resolution in sorted(resolutions, reverse=True)],
            "type": "zarr-multivec",
            "metadata": {
                "chromoffset": int(chrom_name_to_cumsum[chr_name]),
                "chromsize": int(chr_len),
            },
        }
        for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))
    ]
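# Hypothetical invocation (file names and track name are placeholders, not
# from the source): aggregate two bigWig tracks into a multi-resolution
# Zarr store starting at 25 bp bins.
bigwigs_to_zarr(
    input_bigwig_files=["sample1.bw", "sample2.bw"],
    output_file="tracks.zarr",
    starting_resolution=25,
    name="example multivec tracks",
)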
from __future__ import absolute_import, print_function, division

import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal

from numcodecs import (AsType, Delta, FixedScaleOffset, PackBits, Categorize,
                       Zlib, Blosc, BZ2, Quantize)
from zarr.creation import array
from zarr.compat import PY2

compressors = [
    None,
    Zlib(),
    BZ2(),
    Blosc(),
]

# TODO rely on backports and remove PY2 exclusion
if not PY2:  # pragma: py2 no cover
    from zarr.codecs import LZMA
    compressors.append(LZMA())


def test_array_with_delta_filter():
    # setup
    astype = 'u1'
    dtype = 'i8'
def create_array(self, read_only=False, **kwargs):
    # Array backed by a plain in-memory dict store.
    store = dict()
    kwargs.setdefault('compressor', Zlib(level=1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
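# A small sketch of the setdefault idiom used throughout these helpers:
# Zlib(level=1) is only a fallback, so a caller-supplied compressor
# (including an explicit None) takes precedence.
from numcodecs import Zlib

kwargs = dict(compressor=None)
kwargs.setdefault('compressor', Zlib(level=1))
assert kwargs['compressor'] is None  # explicit None is preserved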
def create_array(read_only=False, **kwargs):
    # Array backed by a user-defined MutableMapping implementation.
    store = CustomMapping()
    kwargs.setdefault('compressor', Zlib(1))
    init_array(store, **kwargs)
    return Array(store, read_only=read_only)
import click
import numpy as np
import dask.array as da
import zarr
from numcodecs import Zlib
from pathlib import Path

PYRAMID_GROUP_NAME = "sub-resolutions"
DEFAULT_COMPRESSOR = Zlib(level=1)


def pad_axis(array, dim, pad_width):
    padding = [(0, 0) if i != dim else (0, pad_width)
               for i in range(len(array.shape))]
    padded = da.pad(array, padding, "constant")
    return padded


def guess_rgb(shape):
    ndim = len(shape)
    last_dim = shape[-1]
    if ndim > 2 and last_dim < 5:
        return True
    else:
        return False


def _create_pyramid(