import os

import zarr
from torch.utils.data import DataLoader
from tqdm import tqdm


def convert_data_set(path, data_set, batch_size=1000):
    loader = DataLoader(data_set, batch_size=batch_size, shuffle=False,
                        num_workers=4)
    num_examples = len(data_set)
    os.makedirs(path, exist_ok=True)
    with zarr.LMDBStore(path) as store:
        root = zarr.group(store=store, overwrite=True)
        # One chunk per example keeps random access cheap during training.
        images_set = root.zeros('images', shape=(num_examples, 3, 96, 96),
                                chunks=(1, None, None, None), dtype='u1')
        labels_set = root.zeros('labels', shape=(num_examples,),
                                chunks=(1,), dtype='u1')
        current_iter = 0
        for images, labels in tqdm(loader):
            # Copy each batch into the next free slice of the zarr arrays.
            size = images.shape[0]
            images_set[current_iter:current_iter + size] = images
            labels_set[current_iter:current_iter + size] = labels
            current_iter += size
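A hypothetical invocation (the dataset choice and paths are assumptions, not from the source): the (3, 96, 96) shape matches STL-10's 96x96 RGB images, and torchvision's PILToTensor yields the uint8 CHW tensors the 'u1' arrays expect.

# Sketch only: dataset and paths are assumptions.
import torchvision

train_set = torchvision.datasets.STL10(
    'data/stl10', split='train', download=True,
    transform=torchvision.transforms.PILToTensor())  # uint8, (3, 96, 96)
convert_data_set('data/stl10.lmdb', train_set)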
def get_storage_map(fs, path, memcache=2 ** 26, lock=True, storage_cache=2 ** 28):
    store = _get_storage_map(fs, path)
    cache_path = get_cache_path(path)
    if storage_cache and storage_cache > 0:
        # First cache level: a local LMDB store in front of the base store.
        os.makedirs(cache_path, exist_ok=True)
        store = LRUCache(
            zarr.LMDBStore(cache_path, buffers=True, lock=lock),
            store, storage_cache
        )
    if memcache and memcache > 0:
        # Second cache level: an in-memory store in front of everything else.
        store = LRUCache(zarr.MemoryStore(), store, memcache)
    return store
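For comparison, a minimal sketch (path assumed) of the single-level cache that ships with zarr itself; the LRUCache used above is a project-specific class that chains a fast store in front of a slower one, whereas zarr.LRUStoreCache keeps an in-memory LRU cache over a single backing store.

import zarr

base = zarr.LMDBStore('data/example.lmdb')           # assumed path
cached = zarr.LRUStoreCache(base, max_size=2 ** 28)  # 256 MiB memory cache
root = zarr.open_group(store=cached, mode='a')       # reads hit the cache first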
def _initialise(self, filename=None):
    """
    Initialise the basic state of the data container.
    """
    self.store = None
    self.data = zarr.group()
    if filename is not None:
        self.store = zarr.LMDBStore(filename, subdir=False)
        self.data = zarr.open_group(store=self.store)
    self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
    self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
    self.data.attrs["uuid"] = str(uuid.uuid4())
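A minimal sketch (filename assumed) of reading that metadata back; note that subdir=False must match the way the store was created:

import zarr

store = zarr.LMDBStore('container.lmdb', subdir=False)  # assumed filename
data = zarr.open_group(store=store, mode='r')
print(data.attrs['uuid'])
store.close()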
def __getitem__(self, idx):
    if self.datasets is None:
        # Open the store lazily, on first access, so that each DataLoader
        # worker process opens its own LMDB handle instead of sharing one
        # across forked processes.
        store = zarr.LMDBStore(self.path)
        zarr_db = zarr.group(store=store)
        self.datasets = {key: zarr_db[key] for key in self.keys}
    items = []
    for key in self.keys:
        item = self.datasets[key][idx]
        if key in self.transforms:
            item = self.transforms[key](item)
        items.append(item)
    return items
def _open_readonly(self, filename):
    # We set the map_size here because LMDB will map 1TB of virtual memory if
    # we don't, making it hard to figure out how much memory we're actually
    # using.
    map_size = None
    try:
        map_size = os.path.getsize(filename)
    except OSError:
        # Ignore any exceptions here and let LMDB handle them.
        pass
    self.store = zarr.LMDBStore(
        filename, map_size=map_size, readonly=True, subdir=False, lock=False)
    self.data = zarr.open_group(store=self.store)
    self.check_format()
def get_obs_vector(session_ID, var, layer="X"):
    save_dir = save_analysis_path + str(session_ID) + "/"
    if use_zarr is True:
        zarr_cache_dir = save_dir + "adata_cache" + ".zarr"
        if os.path.exists(zarr_cache_dir) is True:
            with zarr.LMDBStore(zarr_cache_dir) as store_store:
                store = zarr.open_group(store=store_store, mode='r')
                if var in store.obs.keys():
                    # The variable is a per-observation annotation.
                    ret = list(store.obs[var])
                else:
                    # Otherwise look the variable up as a gene and pull its
                    # column out of the dense expression matrix.
                    idx = list(store.var["gene_ID"]).index(var)
                    if layer == "X":
                        ret = store["X_dense"][:, idx]
                    else:
                        ret = (store["layers_dense"][layer])[:, idx]
    return ret
def __init__(self, data_dir_pth, desired_chunk_size_bytes=1. * 1024**2,
             datastore_type=DatastoreType.LMDB,
             compression_type=CompressionType.BLOSC):
    """
    :param data_dir_pth: Path to the zarr lmdb file
    :param desired_chunk_size_bytes: The size (in bytes) of the chunks each
        array is split into
    :param datastore_type: LMDB uses the lmdb database, which needs to be
        installed on the system. If it is not available, use the DIRECTORY
        type, which uses the os filesystem.
    :param compression_type: BLOSC uses the blosc library through numcodecs,
        but requires the blosc library to be installed on the system, or a
        compatible system where blosc can be installed automatically when
        installing numcodecs. If blosc is not available, use LZMA, which
        uses Python's built-in LZMA compression library.
    """
    import zarr
    self.zarr = zarr
    # Check for existence before creating the store: opening an LMDBStore
    # creates the file, so checking afterwards would always find it.
    already_exists = os.path.exists(data_dir_pth)
    self.datastore_type = datastore_type
    if datastore_type == DatastoreType.LMDB:
        self.store = zarr.LMDBStore(data_dir_pth)
    elif datastore_type == DatastoreType.DIRECTORY:
        self.store = zarr.DirectoryStore(data_dir_pth)
    else:
        raise RuntimeError(
            'Unknown datastore type: {}'.format(datastore_type))
    if compression_type == CompressionType.BLOSC:
        from numcodecs import Blosc
        self.compressor = Blosc(cname='blosclz', clevel=9,
                                shuffle=Blosc.BITSHUFFLE)
    elif compression_type == CompressionType.LZMA:
        # import lzma
        # lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4),
        #                 dict(id=lzma.FILTER_LZMA2, preset=1)]
        from numcodecs import LZMA
        self.compressor = LZMA()
    self.desired_chunk_size_bytes = desired_chunk_size_bytes
    self.f = zarr.group(store=self.store, overwrite=not already_exists)
    self.i = 0
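A hypothetical construction (the class name ZarrWriter and the path are assumptions) for a host without lmdb or blosc, falling back to the portable options described in the docstring:

writer = ZarrWriter('data/output.zarr',  # class name and path assumed
                    datastore_type=DatastoreType.DIRECTORY,
                    compression_type=CompressionType.LZMA)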
def __init__(self, path, transforms=None):
    self.path = path
    self.keys = ('images', 'labels')
    assert os.path.exists(path), 'file `{}` does not exist!'.format(path)
    # Read the number of examples up front, then close the store again;
    # it is reopened lazily in __getitem__.
    with zarr.LMDBStore(path) as store:
        zarr_db = zarr.group(store=store)
        self.num_examples = zarr_db['labels'].shape[0]
    self.datasets = None
    if transforms is None:
        # Default transforms: labels become long tensors, images are
        # rescaled from [0, 255] to float32 in [-1, 1].
        transforms = {
            'labels': lambda v: torch.tensor(v, dtype=torch.long),
            'images': lambda v: torch.tensor(
                (v - 127.5) / 127.5, dtype=torch.float32)
        }
    self.transforms = transforms
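A hypothetical usage (class name and path assumed) pairing the dataset with a DataLoader; the lazy store opening in __getitem__ means each worker process gets its own LMDB handle, and with the default transforms images come back as float32 in [-1, 1]:

from torch.utils.data import DataLoader

dataset = ZarrDataset('data/stl10.lmdb')   # class name and path assumed
loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)
images, labels = next(iter(loader))        # images: float32 in [-1, 1]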
def __init__(self, data_dir_pth, desired_chunk_size_bytes=1. * 1024 ** 2):
    """
    :param data_dir_pth: Path to the zarr lmdb file
    :param desired_chunk_size_bytes: The size (in bytes) of the chunks each
        array is split into
    """
    import zarr
    from numcodecs import Blosc
    self.zarr = zarr
    # Check for existence before creating the store: opening an LMDBStore
    # creates the file, so checking afterwards would always find it.
    already_exists = os.path.exists(data_dir_pth)
    self.store = zarr.LMDBStore(data_dir_pth)
    self.compressor = Blosc(cname='blosclz', clevel=9,
                            shuffle=Blosc.BITSHUFFLE)
    self.desired_chunk_size_bytes = desired_chunk_size_bytes
    self.f = zarr.group(store=self.store, overwrite=not already_exists)
    self.i = 0
""" Quick script to patch up the sequence length attribute in version 1.0 sample files to make sure they are not 0. Older version supported this but new versions will not. """ import tsinfer import zarr import sys import os.path filename = sys.argv[1] sample_data = tsinfer.load(filename) sequence_length = sample_data.sites_position[-1] + 1 sample_data.close() # Add a megabyte to the map size in the file size goes up. map_size = os.path.getsize(filename) + 1024**2 store = zarr.LMDBStore(filename, subdir=False, map_size=map_size) data = zarr.open(store=store, mode="w+") data.attrs["sequence_length"] = sequence_length store.close() sample_data = tsinfer.load(filename) print("patched up sequence length") print(sample_data)