def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
    """Opens a zarr dataset from disk from the path supplied in the constructor.

    Keyword Arguments:
        mode (str): Mode to open dataset in, default to read-only (default: {"r"})
        cached (bool): Whether to cache files read from disk using an LRU cache. (default: {True})
        cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

    Raises:
        Exception: When any of the expected arrays (frames, agents, scenes) is missing
            or the store couldn't be opened.
    """
    if cached:
        self.root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
        )
    else:
        self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root[FRAME_ARRAY_KEY]
    self.agents = self.root[AGENT_ARRAY_KEY]
    self.scenes = self.root[SCENE_ARRAY_KEY]
    try:
        self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
    except KeyError:
        warnings.warn(
            f"{TL_FACE_ARRAY_KEY} not found in {self.path}! Traffic lights will be disabled",
            RuntimeWarning,
            stacklevel=2,
        )
        self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
    return self

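# A minimal standalone sketch of the cached-open pattern used by open() above.
# The path "dataset.zarr" is hypothetical; only the zarr calls are assumed. The
# LRU cache caps how many decoded chunks stay in memory across repeated reads.
import zarr

store = zarr.DirectoryStore("dataset.zarr")
cached_store = zarr.LRUStoreCache(store, max_size=int(1e9))  # ~1GB of cached chunks
root = zarr.open_group(store=cached_store, mode="r")
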
def prepare_zarr_group(dataset_id, dataset, store, table="MAIN"):
    dir_store = zarr.DirectoryStore(store)

    try:
        # Open in read/write, must exist
        group = zarr.open_group(store=dir_store, mode="r+")
    except zarr.errors.GroupNotFoundError:
        # Create, must not exist
        group = zarr.open_group(store=dir_store, mode="w-")

    group_name = f"{table}_{dataset_id}"
    ds_group = group.require_group(table).require_group(group_name)

    schema = DatasetSchema.from_dataset(dataset)

    for column, column_schema in schema.data_vars.items():
        create_array(ds_group, column, column_schema, False)

    for column, column_schema in schema.coords.items():
        create_array(ds_group, column, column_schema, True)

    ds_group.attrs.update({
        **schema.attrs,
        DASKMS_ATTR_KEY: {
            "chunks": dict(dataset.chunks)
        }
    })

    return ds_group

def collect_zarr(file_name, out_dir, num_procs):
    final_zarr_file = '%s/%s' % (out_dir, file_name)

    # seed w/ job0
    job_zarr_file = '%s/job0/%s' % (out_dir, file_name)
    shutil.copytree(job_zarr_file, final_zarr_file)

    # open final
    final_zarr_open = zarr.open_group(final_zarr_file)

    for pi in range(1, num_procs):
        # open job
        job_zarr_file = '%s/job%d/%s' % (out_dir, pi, file_name)
        job_zarr_open = zarr.open_group(job_zarr_file, 'r')

        # append to final
        for key in final_zarr_open.keys():
            if key in ['percentiles', 'target_ids', 'target_labels']:
                # once is enough
                pass

            elif key[-4:] == '_pct':
                # average
                u_k1 = np.array(final_zarr_open[key])
                x_k = np.array(job_zarr_open[key])
                final_zarr_open[key] = u_k1 + (x_k - u_k1) / (pi + 1)

            else:
                # append
                final_zarr_open[key].append(job_zarr_open[key])

def prepare_zarr_group(dataset_id, dataset, store, rechunk=False):
    try:
        # Open in read/write, must exist
        group = zarr.open_group(store=store.map, mode="r+")
    except zarr.errors.GroupNotFoundError:
        # Create, must not exist
        group = zarr.open_group(store=store.map, mode="w-")

    table_path = store.table if store.table else "MAIN"
    group_name = f"{table_path}_{dataset_id}"
    ds_group = group.require_group(table_path).require_group(group_name)

    dataset, ds_group = maybe_rechunk(dataset, ds_group, rechunk=rechunk)

    schema = DatasetSchema.from_dataset(dataset)
    schema_chunks = schema.chunks

    for column, column_schema in schema.data_vars.items():
        create_array(ds_group, column, column_schema, schema_chunks, False)

    for column, column_schema in schema.coords.items():
        create_array(ds_group, column, column_schema, schema_chunks, True)

    ds_group.attrs.update({
        **schema.attrs,
        DASKMS_ATTR_KEY: {
            "chunks": dict(dataset.chunks)
        }
    })

    return dataset, ds_group

def open(
    self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)
) -> "ChunkedDataset":
    """Opens a zarr dataset from disk from the path supplied in the constructor.

    :param mode: Mode to open dataset in, default to read-only (default: {"r"})
    :param cached: Whether to cache files read from disk using an LRU cache. (default: {True})
    :param cache_size_bytes: Size of cache in bytes (default: {1e9} (1GB))
    """
    if cached:
        self.root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
        )
    else:
        self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root[FRAME_ARRAY_KEY]
    self.agents = self.root[AGENT_ARRAY_KEY]
    self.scenes = self.root[SCENE_ARRAY_KEY]
    try:
        self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
    except KeyError:
        # the real issue here is that frame doesn't have traffic_light_faces_index_interval
        warnings.warn(
            f"{TL_FACE_ARRAY_KEY} not found in {self.path}! "
            f"You won't be able to use this zarr into an Ego/AgentDataset",
            RuntimeWarning,
            stacklevel=2,
        )
        self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
    return self

def init(release_dir):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.

    """

    # variation
    ###########

    global callset, callset_pass
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                   'ag1000g.phase1.ar3')
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                        'ag1000g.phase1.ar3.pass')
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')

    # haplotypes
    ############

    global callset_phased, tbl_haplotypes, lkp_haplotypes, df_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # try HDF5 first
    callset_phased_h5_fn = os.path.join(haplotypes_dir, 'main', 'hdf5',
                                        'ag1000g.phase1.ar3.1.haplotypes.h5')
    if os.path.exists(callset_phased_h5_fn):
        callset_phased = h5py.File(callset_phased_h5_fn, mode='r')

    # prefer Zarr if available
    # N.B., the Zarr data is not consistent with HDF5 or shapeit outputs,
    # it is based on a previous phasing run.
    #
    # callset_phased_zarr_fn = os.path.join(haplotypes_dir, 'main', 'zarr2',
    #                                       'ag1000g.phase1.ar3.1.haplotypes')
    # if os.path.exists(callset_phased_zarr_fn):
    #     callset_phased = zarr.open_group(callset_phased_zarr_fn, mode='r')

    # haplotypes metadata
    haplotypes_fn = os.path.join(haplotypes_dir, 'haplotypes.meta.txt')
    if os.path.exists(haplotypes_fn):
        tbl_haplotypes = (
            etl.fromtsv(haplotypes_fn)
            .convert(('index', 'kt_2la', 'kt_2rb'), int)
        )
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = pandas.read_csv(haplotypes_fn, sep='\t',
                                        index_col='index')

def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(str(output), mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"

        arr = concatenate_and_rechunk(
            [group[var] for group in zarr_groups], dtype=dtype
        )
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(str(output)) as output_zarr:

        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        output_zarr.attrs.update(first_zarr_group.attrs)

def initialize(self, mode: str = "w", scenes_num: int = 0, frames_num: int = 0, agents_num: int = 0) -> None:
    """Initializes a new zarr dataset, creating the underlying arrays.

    Keyword Arguments:
        mode (str): Mode to open dataset in, should be something that supports writing. (default: {"w"})
        scenes_num (int): pre-allocate this number of scenes
        frames_num (int): pre-allocate this number of frames
        agents_num (int): pre-allocate this number of agents
    """
    self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root.require_dataset(FRAME_ARRAY_KEY, dtype=FRAME_DTYPE,
                                            chunks=FRAME_CHUNK_SIZE, shape=(frames_num,))
    self.agents = self.root.require_dataset(AGENT_ARRAY_KEY, dtype=AGENT_DTYPE,
                                            chunks=AGENT_CHUNK_SIZE, shape=(agents_num,))
    self.scenes = self.root.require_dataset(SCENE_ARRAY_KEY, dtype=SCENE_DTYPE,
                                            chunks=SCENE_CHUNK_SIZE, shape=(scenes_num,))
    self.root.attrs["format_version"] = FORMAT_VERSION
    self.root.attrs["labels"] = LABELS

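# A hedged sketch of the pre-allocation pattern above, using a hypothetical
# "example.zarr" path and a plain float32 dtype instead of the structured
# FRAME_DTYPE/AGENT_DTYPE constants: require_dataset creates the array when it
# is missing (and returns the existing one otherwise), and append() grows it.
import numpy as np
import zarr

root = zarr.open_group("example.zarr", mode="w")
frames = root.require_dataset("frames", dtype=np.float32, chunks=(1000,), shape=(0,))
frames.append(np.zeros(10, dtype=np.float32))  # arrays can grow after pre-allocation
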
def _overwrite_time_array_with_single_chunk(target: str, time: xr.DataArray, dim: str):
    if time is not None:
        del zarr.open_group(fsspec.get_mapper(target))[dim]
        with tempfile.TemporaryDirectory() as tmpdir:
            xr.Dataset({dim: time}).to_zarr(tmpdir)
            upload_dir(tmpdir, target)

def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]

    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])
    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist

def test_write_output_vars_batch(self, store_batch, model_batch1, model_batch2):
    model_batch1.state[("profile", "u")] = np.array([1.0, 2.0, 3.0])
    model_batch2.state[("profile", "u")] = np.array([4.0, 5.0, 6.0])
    model_batch1.state[("roll", "u_diff")] = np.array([-1.0, 1.0, 0.0])
    model_batch2.state[("roll", "u_diff")] = np.array([0.0, 1.0, -1.0])
    model_batch1.state[("add", "offset")] = 2.0
    model_batch2.state[("add", "offset")] = 3.0

    store_batch.write_output_vars(0, 0, model=model_batch1)
    store_batch.write_output_vars(1, 0, model=model_batch2)

    ztest = zarr.open_group(store_batch.zgroup.store, mode="r")

    assert ztest.profile__u.ndim == 3
    np.testing.assert_array_equal(
        ztest.profile__u[:, 0, :], np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    )

    store_batch.write_output_vars(0, -1, model=model_batch1)
    store_batch.write_output_vars(1, -1, model=model_batch2)

    np.testing.assert_array_equal(ztest.add__offset[:], np.array([2.0, 3.0]))

    # test default chunk size along batch dim
    assert ztest.profile__u.chunks[0] == 1

def test_write_index_vars(self, store):
    store.model.state[("init_profile", "x")] = np.array([1.0, 2.0, 3.0])

    store.write_index_vars()

    ztest = zarr.open_group(store.zgroup.store, mode="r")

    np.testing.assert_array_equal(ztest.x, np.array([1.0, 2.0, 3.0]))

def _get_zarr_group(store):
    if store is None:
        # memory store
        return None, zarr.group()
    elif isinstance(store, str):
        store = zarr.DirectoryStore(store)

    return store, zarr.open_group(store=store, mode="a")

def test_write_output_vars(self, in_ds, store):
    model = store.model
    model.state[("profile", "u")] = np.array([1.0, 2.0, 3.0])
    model.state[("roll", "u_diff")] = np.array([-1.0, 1.0, 0.0])
    model.state[("add", "offset")] = 2.0

    store.write_output_vars(-1, 0)

    ztest = zarr.open_group(store.zgroup.store, mode="r")

    assert ztest.profile__u.shape == (in_ds.clock.size, 3)
    np.testing.assert_array_equal(ztest.profile__u[0], np.array([1.0, 2.0, 3.0]))

    assert ztest.roll__u_diff.shape == (in_ds.out.size, 3)
    np.testing.assert_array_equal(ztest.roll__u_diff[0], np.array([-1.0, 1.0, 0.0]))

    assert ztest.add__u_diff.shape == (in_ds.out.size,)
    np.testing.assert_array_equal(ztest.add__u_diff, np.array([2.0, np.nan, np.nan]))

    # test save main clock but not out clock
    store.write_output_vars(-1, 1)

    np.testing.assert_array_equal(ztest.profile__u[1], np.array([1.0, 2.0, 3.0]))
    np.testing.assert_array_equal(ztest.roll__u_diff[1], np.array([np.nan, np.nan, np.nan]))

    # test save no-clock outputs
    store.write_output_vars(-1, -1)

    np.testing.assert_array_equal(ztest.profile__u_opp, np.array([-1.0, -2.0, -3.0]))
    assert ztest.add__offset[()] == 2.0

def _build_output(self, ds, ds0, fss):
    import zarr
    out = {}
    ds.to_zarr(out, chunk_store={}, compute=False)  # fills in metadata & coords
    z = zarr.open_group(out, mode='a')

    for dim in self.extra_dims.union(self.concat_dims):
        # derived and concatenated dims stored as absolute data
        z[dim][:] = ds[dim].values
    for dim in self.same_dims:
        # duplicated coordinates stored as references just once
        out.update({k: v for k, v in fss[0].references.items()
                    if k.startswith(dim)})
    for variable in ds.variables:
        if variable in ds.dims:
            # already handled
            continue
        var, var0 = ds[variable], ds0[variable]
        assert var.dims[-len(var0.dims):] == var0.dims

        concats = {d: 0 for d in self.concat_dims}
        for i, fs in enumerate(fss):
            for k, v in fs.references.items():
                start, part = os.path.split(k)
                if start != variable or part in ['.zgroup', '.zarray', '.zattrs']:
                    # OK, so we go through all the keys multiple times
                    continue
                if var.shape == var0.shape:
                    out[k] = v  # copy
                else:
                    out[f"{start}/{i}.{part}"] = v
    return out

def read_vcfzarr(path: PathType) -> xr.Dataset:
    """Read a VCF Zarr file created using scikit-allel.

    Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
    from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function.

    Since ``vcf_to_zarr`` does not preserve phasing information, there is no
    :data:`sgkit.variables.call_genotype_phased_spec` variable in the resulting
    dataset.

    Parameters
    ----------
    path
        Path to the Zarr file.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy)
    - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy)
    """
    vcfzarr = zarr.open_group(str(path), mode="r")

    # don't fix strings since it requires a pass over the whole dataset
    return _vcfzarr_to_dataset(vcfzarr, fix_strings=False)

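# Hypothetical usage of read_vcfzarr as documented above; "calls.zarr" stands in
# for a store produced by scikit-allel's vcf_to_zarr, and the printed dims follow
# the variable specs listed in the docstring.
ds = read_vcfzarr("calls.zarr")
print(ds["call_genotype"].dims)       # ("variants", "samples", "ploidy")
print(ds["variant_position"][:10].values)
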
def http_pull_file(remote_file, remote_mtime, local_file, LIST, CLOBBER, MODE):
    #-- split extension from input local file
    fileBasename, fileExtension = os.path.splitext(local_file)
    #-- copy HDF5 file from server into new zarr file
    if (fileExtension == '.h5'):
        local_file = '{0}.zarr'.format(fileBasename)
    #-- if file exists in file system: check if remote file is newer
    TEST = False
    OVERWRITE = ' (clobber)'
    #-- check if local version of file exists
    if os.access(local_file, os.F_OK):
        #-- check last modification time of local file
        local_mtime = os.stat(local_file).st_mtime
        #-- if remote file is newer: overwrite the local file
        if (remote_mtime > local_mtime):
            TEST = True
            OVERWRITE = ' (overwrite)'
    else:
        TEST = True
        OVERWRITE = ' (new)'
    #-- if file does not exist locally, is to be overwritten, or CLOBBER is set
    if TEST or CLOBBER:
        #-- output string for printing files transferred
        output = '{0} -->\n\t{1}{2}\n'.format(remote_file, local_file, OVERWRITE)
        #-- if executing copy command (not only printing the files)
        if not LIST and (fileExtension == '.h5'):
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_file)
            fid = io.BytesIO(urllib2.urlopen(request).read())
            #-- copy everything from the HDF5 file to the zarr file
            with h5py.File(fid, 'r') as source:
                dest = zarr.open_group(local_file, mode='w')
                #-- value checks on output zarr
                if not hasattr(dest, 'create_dataset'):
                    raise ValueError(
                        'dest must be a group, got {!r}'.format(dest))
                #-- for each key in the root of the hdf5 file structure
                for k in source.keys():
                    copy_from_HDF5(source[k], dest, name=k)
            #-- keep remote modification time of file and local access time
            os.utime(local_file, (os.stat(local_file).st_atime, remote_mtime))
            os.chmod(local_file, MODE)
        elif not LIST:
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_file)
            response = urllib2.urlopen(request)
            #-- chunked transfer encoding size
            CHUNK = 16 * 1024
            #-- copy contents to local file using chunked transfer encoding
            #-- transfer should work properly with ascii and binary data formats
            with open(local_file, 'wb') as f:
                shutil.copyfileobj(response, f, CHUNK)
            #-- keep remote modification time of file and local access time
            os.utime(local_file, (os.stat(local_file).st_atime, remote_mtime))
            os.chmod(local_file, MODE)
        #-- return the output string
        return output

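# A compact alternative sketch for the HDF5 -> Zarr copy step above, using zarr's
# built-in copy_all instead of the repository's copy_from_HDF5 helper; the file
# names "input.h5" and "output.zarr" are hypothetical.
import h5py
import zarr

with h5py.File("input.h5", "r") as source:
    dest = zarr.open_group("output.zarr", mode="w")
    zarr.copy_all(source, dest)  # recursively copies groups, datasets and attributes
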
def __init__(self, data_path, volume_indices, nb_io_workers=1, nb_proc_workers=0,
             downscale=False, return_vol_idx=False, num_consecutive=None):
    self.data_path = data_path
    self.volume_indices = volume_indices
    self.nb_io_workers = nb_io_workers
    self.nb_proc_workers = nb_proc_workers
    self.downscale = downscale
    self.return_vol_idx = return_vol_idx
    self.num_consecutive = num_consecutive

    try:
        zgroup = zarr.open_group(data_path, mode='r')
    except:
        print("Failed to open data: {}".format(data_path))
        raise

    # Assemble volumes and corresponding segmentations
    self.volumes = []
    self.segmentations = []
    for idx in self.volume_indices:
        subgroup = zgroup[str(idx)]
        self.volumes.append(subgroup['volume'])
        self.segmentations.append(subgroup['segmentation'])

    # Length
    self.num_volumes = len(self.volumes)
    assert (len(self.segmentations) == self.num_volumes)

def main():
    usage = 'usage: %prog [options] <in_h5_file> <out_zarr_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chunk_size', default=None, type='int')
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output Zarr.')
    else:
        hdf5_file = args[0]
        zarr_file = args[1]

    # open files
    h5_in = h5py.File(hdf5_file, 'r')
    zarr_out = zarr.open_group(zarr_file, 'w')

    # foreach chromosome
    for chrom in h5_in.keys():
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(h5_in[chrom], dtype='float16')

        # write into Zarr
        z = zarr_out.create_dataset(chrom, data=x, shape=x.shape,
                                    dtype='float16', chunks=options.chunk_size)

        if options.verbose:
            print(z)

    # close files
    h5_in.close()

def __init__(self, zarr_root, zarr_group, fold_num, conv2d=True, transpose=False, **kwargs):
    'Initialization'

    # Zarr dataset handling
    zarr_root = zarr.open_group(str(zarr_root), mode='r')

    # Get cross-validation fold metadata
    zarr_fold = zarr_root[f'{zarr_group}/folds/fold{fold_num}']

    # Get metadata
    self.metadata = zarr_root[zarr_group].attrs.asdict()
    self.scene_labels = self.metadata['scene_labels']

    # Normalization data
    self.norm_data = {}
    self.norm_data['mean'] = zarr_fold['norm_data']['mean'][:]
    self.norm_data['std'] = zarr_fold['norm_data']['std'][:]

    # Set features dimensions
    self.set_dim(transpose, conv2d)

def from_schema(
    store: zarr.ABSStore,
    schema: xr.Dataset,
    dims: Sequence[str],
    coords: Mapping[str, xr.DataArray],
) -> "ZarrMapping":
    """Initialize a ZarrMapping using an xarray dataset as a template

    Args:
        store: An object implementing the mutable mapping interface required
            by zarr.open_group
        schema: A template for the datasets that will be inserted into the
            ZarrMapping.
        dims: The list of dimensions that will be managed by the zarr mapping.
            The zarr dataset produced by ZarrMapping will have these dimensions
            prepended to the list of dimensions of each variable in the schema
            object.
        coords: the coordinate labels corresponding to the dimensions in dims

    Returns:
        an initialized ZarrMapping object

    """
    group = zarr.open_group(store, mode="w")
    coords = {
        name: xr.DataArray(coords[name], name=name, dims=[name]) for name in coords
    }
    _create_zarr(dims, coords, group, schema)
    return ZarrMapping(store)

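# A hedged usage sketch of from_schema: the template dataset, the extra leading
# "time" dimension, and its coordinate labels are all made up for illustration,
# and ZarrMapping is assumed to be the class this method is defined on.
import numpy as np
import xarray as xr

schema = xr.Dataset({"t2m": (("y", "x"), np.zeros((2, 3)))})
store: dict = {}  # any mutable mapping accepted by zarr.open_group
mapping = ZarrMapping.from_schema(store, schema, dims=["time"], coords={"time": [0, 1, 2]})
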
def test_write_global_vars(self):
    # ensure that variable metadata (dims, etc.) is properly accessed for global references
    @xs.process
    class Foo:
        var = xs.variable(dims="x", global_name="global_var", intent="out")

    @xs.process
    class Bar:
        var = xs.global_ref("global_var")

    model = xs.Model({"foo": Foo, "bar": Bar})

    in_ds = xs.create_setup(
        model=model,
        clocks={"clock": [0, 1]},
        output_vars={"bar__var": None},
    )

    store = ZarrSimulationStore(in_ds, model)

    model.state[("foo", "var")] = np.array([1, 2, 3])
    store.write_output_vars(-1, -1)

    ztest = zarr.open_group(store.zgroup.store, mode="r")

    np.testing.assert_array_equal(ztest.bar__var, np.array([1, 2, 3]))

def main():
    usage = 'usage: %prog [options] <in_zarr_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output HDF5.')
    else:
        zarr_file = args[0]
        hdf5_file = args[1]

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    h5_out = h5py.File(hdf5_file, 'w')

    # foreach chromosome
    for chrom in sorted(zarr_in.keys()):
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(zarr_in[chrom])

        # write lzf-compressed into HDF5
        h5_out.create_dataset(chrom, data=x, dtype='float16',
                              chunks=True, compression='lzf', shuffle=True)

    # close files
    h5_out.close()

def initialize(
    self, mode: str = "w", num_scenes: int = 0, num_frames: int = 0, num_agents: int = 0, num_tl_faces: int = 0
) -> "ChunkedDataset":
    """Initializes a new zarr dataset, creating the underlying arrays.

    Keyword Arguments:
        mode (str): Mode to open dataset in, should be something that supports writing. (default: {"w"})
        num_scenes (int): pre-allocate this number of scenes
        num_frames (int): pre-allocate this number of frames
        num_agents (int): pre-allocate this number of agents
        num_tl_faces (int): pre-allocate this number of traffic lights
    """
    self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root.require_dataset(
        FRAME_ARRAY_KEY, dtype=FRAME_DTYPE, chunks=FRAME_CHUNK_SIZE, shape=(num_frames,)
    )
    self.agents = self.root.require_dataset(
        AGENT_ARRAY_KEY, dtype=AGENT_DTYPE, chunks=AGENT_CHUNK_SIZE, shape=(num_agents,)
    )
    self.scenes = self.root.require_dataset(
        SCENE_ARRAY_KEY, dtype=SCENE_DTYPE, chunks=SCENE_CHUNK_SIZE, shape=(num_scenes,)
    )
    self.tl_faces = self.root.require_dataset(
        TL_FACE_ARRAY_KEY, dtype=TL_FACE_DTYPE, chunks=TL_FACE_CHUNK_SIZE, shape=(num_tl_faces,)
    )
    self.root.attrs["format_version"] = FORMAT_VERSION
    self.root.attrs["labels"] = PERCEPTION_LABELS
    return self

def open_group(
    cls,
    store,
    mode="r",
    synchronizer=None,
    group=None,
    consolidated=False,
    consolidate_on_close=False,
    chunk_store=None,
    append_dim=None,
    write_region=None,
):
    import zarr

    # zarr doesn't support pathlib.Path objects yet. zarr-python#601
    if isinstance(store, pathlib.Path):
        store = os.fspath(store)

    open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
    if chunk_store:
        open_kwargs["chunk_store"] = chunk_store

    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close, append_dim, write_region)

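# A hedged sketch of the consolidated-metadata path taken above: consolidating
# once after writing collects all .zgroup/.zarray/.zattrs keys into a single
# document, so later opens avoid many small metadata reads. The store path
# "consolidated.zarr" is hypothetical.
import zarr

store = zarr.DirectoryStore("consolidated.zarr")
root = zarr.open_group(store, mode="w")
root.create_dataset("a", shape=(10,), dtype="i4")
zarr.consolidate_metadata(store)

root_ro = zarr.open_consolidated(store, mode="r")
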
def main(cfg):
    # data_preprocessing
    data_dir = cfg["data_dir"]
    out_dir = cfg["out_dir"]
    dat = HubmapDataset(data_dir, out_dir)

    # db_zarr
    db = zarr.open_group(store=zarr.DirectoryStore(cfg["zarr_db_dir"]), mode="r")

    # results dir
    results_dir = dat.path.out / "normalization" / cfg["version"]
    if cfg["version"] == "debug":
        os.makedirs(results_dir, exist_ok=True)
    else:
        try:
            os.makedirs(results_dir, exist_ok=False)
        except:
            raise Exception(f"Version {cfg['version']} exists!")

    # save config
    dat.jsn_dump(cfg, results_dir / "config.json")

    # tiles
    tile_dct = dat.pkl_load(dat.path.out / "tiles" / cfg['tiles_version'] / "tile_dct.pkl")
    tiles = tile_dct["train_df"]

    # mean
    sum = np.zeros(3)
    N = (tiles["tile"].apply(lambda x: x[1] - x[0]) *
         tiles["tile"].apply(lambda x: x[3] - x[2])).sum()
    for _, row in tqdm(tiles.iterrows(), total=len(tiles)):
        id_ = row["id"]
        c = row["tile"]
        slc = np.s_[c[0]:c[1], c[2]:c[3]]
        img = db[id_]["img"][slc] / 255
        sum += img.sum(axis=(0, 1))
    mean = sum / N
    dat.pkl_dump(mean, results_dir / "mean.pkl")

    # std
    diff_squared = np.zeros(3)
    for _, row in tqdm(tiles.iterrows(), total=len(tiles)):
        id_ = row["id"]
        c = row["tile"]
        slc = np.s_[c[0]:c[1], c[2]:c[3]]
        img = db[id_]["img"][slc] / 255
        diff_squared += ((img - mean) ** 2).sum(axis=(0, 1))
    std = np.sqrt(diff_squared / N)
    dat.pkl_dump(std, results_dir / "std.pkl")

    print(f"MEAN: {mean}")
    print(f"STD: {std}")

def open_group(cls, store, mode='r', synchronizer=None, group=None,
               consolidated=False, consolidate_on_close=False):
    import zarr
    min_zarr = '2.2'

    if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
        raise NotImplementedError("Zarr version %s or greater is "
                                  "required by xarray. See zarr "
                                  "installation "
                                  "http://zarr.readthedocs.io/en/stable/"
                                  "#installation" % min_zarr)

    if consolidated or consolidate_on_close:
        if LooseVersion(zarr.__version__) <= '2.2.1.dev2':  # pragma: no cover
            raise NotImplementedError("Zarr version 2.2.1.dev2 or greater "
                                      "is required for consolidated "
                                      "metadata.")

    open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close)

def append_slice(store: Union[str, MutableMapping],
                 dataslice: xr.Dataset,
                 dimension: str = "time") -> None:
    """
    Append data slice to existing zarr dataset.

    :param store: A zarr store.
    :param dataslice: Data slice to insert
    :param dimension: name of dimension perpendicular to the slice
    """

    # Unfortunately slice.to_zarr(store, mode='a', append_dim='time') will
    # replace global attributes of store with attributes of slice (xarray
    # bug?), which are usually empty in our case. Hence, we must save our old
    # attributes in a copy of slice.
    ds = zarr.open_group(store, mode='r')
    dataslice = dataslice.copy()
    dataslice.attrs.update(ds.attrs)
    if 'coordinates' in dataslice.attrs:
        # Remove 'coordinates', otherwise we get ValueError: cannot serialize
        # coordinates because the global attribute 'coordinates' already
        # exists from the next dataslice.to_zarr(...) call.
        dataslice.attrs.pop('coordinates')

    dataslice.to_zarr(store, mode='a', append_dim=dimension)

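# A minimal sketch of the append pattern that append_slice builds on, with a
# hypothetical "cube.zarr" store and variable name: an initial write followed by
# appends along the chosen dimension.
import numpy as np
import xarray as xr

ds0 = xr.Dataset({"t2m": (("time", "x"), np.zeros((1, 4)))}, coords={"time": [0]})
ds1 = xr.Dataset({"t2m": (("time", "x"), np.ones((1, 4)))}, coords={"time": [1]})

ds0.to_zarr("cube.zarr", mode="w")
ds1.to_zarr("cube.zarr", mode="a", append_dim="time")
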
def get_singletons(zarr_folder, chrom, samples, start=-9, stop=-9):
    callset = zarr.open_group(zarr_folder, mode='r')
    pos = callset[chrom]['variants']['POS']
    # pdb.set_trace()
    ref = callset[chrom]['variants']['REF']
    alt = callset[chrom]['variants']['ALT']
    ids = callset[chrom]['variants']['ID']
    gt = allel.GenotypeDaskArray(
        callset[str(chrom)]['calldata']['GT'])  # Retrieve genotype data
    gt = gt.take(samples, axis=1).compute()  # subset data to samples of interest
    ac = gt.count_alleles()
    if start == -9:
        start = min(pos)
    if stop == -9:
        stop = max(pos)
    flt = ac.is_singleton(1)
    pos2 = pos.get_mask_selection(flt)
    gf = gt.compress(flt, axis=0)
    sing_dict = {p: i for p, i in zip(pos2, np.where(gf.is_het())[1])}
    ind_dict = {}
    for key, value in sing_dict.items():
        if value in ind_dict:
            ind_dict[value].append(key)
        else:
            ind_dict[value] = [key]
    return ind_dict, gt, ids, ref, alt, pos, start, stop

def test_convert_to_zarr(self):
    input_vcf_path = "./tests/data/trio.2010_06.ychr.genotypes.vcf"
    output_zarr_path = "trio.2010_06.ychr.genotypes.zarr"

    # Attempt to remove local file in case a previous unit test failed to do so (prevents false positive)
    if os.path.isdir(output_zarr_path):
        shutil.rmtree(output_zarr_path)

    if os.path.isfile(input_vcf_path):
        # Setup test settings for Zarr conversion
        vcf_to_zarr_config = config.VCFtoZarrConfigurationRepresentation()
        vcf_to_zarr_config.fields = 'variants/numalt'
        vcf_to_zarr_config.enabled = True
        vcf_to_zarr_config.compressor = "Blosc"
        vcf_to_zarr_config.blosc_compression_algorithm = "zstd"
        vcf_to_zarr_config.blosc_compression_level = 1
        vcf_to_zarr_config.blosc_shuffle_mode = -1

        # Convert VCF file to Zarr
        data_service.convert_to_zarr(input_vcf_path=input_vcf_path,
                                     output_zarr_path=output_zarr_path,
                                     conversion_config=vcf_to_zarr_config)

        # Load the Zarr data from storage for testing
        callset = zarr.open_group(output_zarr_path, mode="r")
        numalt = callset['variants/numalt']
        self.assertEqual(np.size(numalt), 959)
        self.assertEqual(np.max(numalt), 1)
    else:
        self.fail("Test data file does not exist. Please ensure the file exists and try running test again")

    # Remove the Zarr test data
    if os.path.isdir(output_zarr_path):
        shutil.rmtree(output_zarr_path)

def initialize_output_zarr(out_dir, sad_stats, snps, target_ids, target_labels):
    """Initialize an output Zarr file for SAD stats."""
    num_targets = len(target_ids)
    num_snps = len(snps)

    sad_out = zarr.open_group('%s/sad.zarr' % out_dir, 'w')

    # write SNPs
    sad_out.create_dataset('snp', data=[snp.rsid for snp in snps], chunks=(32768,))

    # write targets
    sad_out.create_dataset('target_ids', data=target_ids, compressor=None)
    sad_out.create_dataset('target_labels', data=target_labels, compressor=None)

    # initialize SAD stats
    for sad_stat in sad_stats:
        sad_out.create_dataset(sad_stat,
                               shape=(num_snps, num_targets),
                               chunks=(128, num_targets),
                               dtype='float16')

    return sad_out

def append_time_slice(store: Union[str, MutableMapping],
                      time_slice: xr.Dataset,
                      chunk_sizes: Dict[str, int] = None):
    """
    Append time slice to existing zarr dataset.

    :param store: A zarr store.
    :param time_slice: Time slice to insert
    :param chunk_sizes: desired chunk sizes
    """
    if chunk_sizes:
        time_slice = chunk_dataset(time_slice, chunk_sizes, format_name='zarr')

    # Unfortunately time_slice.to_zarr(store, mode='a', append_dim='time') will replace global attributes of store
    # with attributes of time_slice (xarray bug?), which are usually empty in our case.
    # Hence, we must save our old attributes in a copy of time_slice.
    ds = zarr.open_group(store, mode='r')
    time_slice = time_slice.copy()
    time_slice.attrs.update(ds.attrs)
    if 'coordinates' in time_slice.attrs:
        # Remove 'coordinates', otherwise we get
        # ValueError: cannot serialize coordinates because the global attribute 'coordinates' already exists
        # from the next time_slice.to_zarr(...) call.
        time_slice.attrs.pop('coordinates')

    time_slice.to_zarr(store, mode='a', append_dim='time')

    unchunk_dataset(store, coords_only=True)

def extract2D(dataset, datatable, row_idx, col_idx, two_d_properties):
    zarr_file = zarr.DirectoryStore(os.path.join(config.BASEDIR, '2D_data',
                                                 dataset + '_' + datatable + '.zarr'))
    root_group = zarr.open_group(zarr_file)
    two_d_properties = dict(
        (prop, da.from_array(root_group[prop], chunks=root_group[prop].chunks, fancy=False))
        for prop in two_d_properties
    )
    if len(col_idx) == 0 or len(row_idx) == 0:
        two_d_result = {}
        for prop in list(two_d_properties.keys()):
            two_d_result[prop] = np.array([], dtype=two_d_properties[prop].dtype)
    else:
        two_d_result = select_by_list(two_d_properties, row_idx, col_idx)
    return two_d_result

def open_group(cls, store, mode='r', synchronizer=None, group=None,
               writer=None):
    import zarr
    min_zarr = '2.2'

    if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
        raise NotImplementedError("Zarr version %s or greater is "
                                  "required by xarray. See zarr "
                                  "installation "
                                  "http://zarr.readthedocs.io/en/stable/"
                                  "#installation" % min_zarr)
    zarr_group = zarr.open_group(store=store, mode=mode,
                                 synchronizer=synchronizer, path=group)
    return cls(zarr_group, writer=writer)

def main():
    usage = 'usage: %prog [options] <in_zarr_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output BigWig.')
    else:
        zarr_file = args[0]
        bw_file = args[1]

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')

    # construct header
    header = []
    chroms = sorted(zarr_in.keys())
    for chrom in chroms:
        # chromosome and length
        header.append((chrom, len(zarr_in[chrom])))

    # write header
    bw_out.addHeader(header)

    for chrom, length in header:
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(zarr_in[chrom])

        # write into BigWig
        bw_out.addEntries(chrom, 0, values=x, span=1, step=1)

    # close files
    bw_out.close()
