def save_zarr(id_patient, lung_mask, cand):
    lung_mask_group.array(id_patient, lung_mask, chunks=(1, 17, 21, 21),
                          compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                          synchronizer=zarr.ThreadSynchronizer())
    cand_group.array(id_patient, cand, chunks=(1, 17, 21, 21),
                     compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                     synchronizer=zarr.ThreadSynchronizer())
    return
def save_zarr(id_patient, lung_mask, nodule_mask):
    lung_mask_group.array(id_patient, lung_mask, chunks=(10, 1, 512, 512),
                          compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                          synchronizer=zarr.ThreadSynchronizer())
    nodule_mask_group.array(id_patient, nodule_mask, chunks=(10, 1, 512, 512),
                            compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                            synchronizer=zarr.ThreadSynchronizer())
    return
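# Both save_zarr variants above assume module-level zarr groups already exist.
# A minimal sketch of how those globals might be created; the store path and
# group names here are hypothetical, not taken from the original code.
import zarr

root = zarr.open_group("patients.zarr", mode="a")
lung_mask_group = root.require_group("lung_mask")
cand_group = root.require_group("candidates")
nodule_mask_group = root.require_group("nodule_mask")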
def _get_write_encodings(cls, dataset, compress, cname, clevel, shuffle, blocksize, chunksizes):
    encoding = None
    if chunksizes:
        encoding = {}
        for var_name in dataset.data_vars:
            var = dataset[var_name]
            chunks: List[int] = []
            for i in range(len(var.dims)):
                dim_name = var.dims[i]
                if dim_name in chunksizes:
                    chunks.append(chunksizes[dim_name])
                else:
                    chunks.append(var.shape[i])
            encoding[var_name] = dict(chunks=chunks)
    if compress:
        blosc_kwargs = dict(cname=cname, clevel=clevel, shuffle=shuffle, blocksize=blocksize)
        for k in list(blosc_kwargs.keys()):
            if blosc_kwargs[k] is None:
                del blosc_kwargs[k]
        compressor = zarr.Blosc(**blosc_kwargs)
        if encoding:
            for var_name in encoding.keys():
                encoding[var_name].update(compressor=compressor)
        else:
            encoding = {var_name: dict(compressor=compressor)
                        for var_name in dataset.data_vars}
    return encoding
def create_variable_zarr(
    self,
    handler_zarr,
    kwargs_variable,
    attr_variable,
    data,
    scale_factor=None,
    add_offset=None,
    filters=None,
    compressor=None,
):
    kwargs_variable["shape"] = data.shape
    kwargs_variable["compressor"] = (
        zarr.Blosc(cname="zstd", clevel=2) if compressor is None else compressor)
    kwargs_variable["filters"] = list()
    store_dtype = kwargs_variable.pop("store_dtype", None)
    if scale_factor is not None or add_offset is not None:
        if add_offset is None:
            add_offset = 0
        if scale_factor is None:
            # Guard against dividing by None when only add_offset is given.
            scale_factor = 1
        kwargs_variable["filters"].append(
            zarr.FixedScaleOffset(
                offset=float64(add_offset),
                scale=1 / float64(scale_factor),
                dtype=kwargs_variable["dtype"],
                astype=store_dtype,
            ))
    if filters is not None:
        kwargs_variable["filters"].extend(filters)
    dims = kwargs_variable.get("dimensions", None)
    # Manage chunks in the 1d and 2d cases
    if len(dims) == 1:
        kwargs_variable["chunks"] = (2500000, )
    if len(dims) == 2:
        second_dim = data.shape[1]
        kwargs_variable["chunks"] = (200000, second_dim)
    kwargs_variable.pop("dimensions")
    v = handler_zarr.create_dataset(**kwargs_variable)
    for attr in sorted(attr_variable.keys()):
        v.attrs[attr] = str(attr_variable[attr])
    if self.raw_data:
        if scale_factor is not None:
            # Write block by block, applying the packing transform on the fly.
            s_bloc = kwargs_variable["chunks"][0]
            nb_bloc = int(ceil(data.shape[0] / s_bloc))
            for i in range(nb_bloc):
                sl = slice(i * s_bloc, (i + 1) * s_bloc)
                v[sl] = data[sl] * scale_factor + add_offset
        else:
            v[:] = data
    else:
        v[:] = data
    try:
        if v.size < 1e8:
            v.attrs["min"] = str(v[:].min())
            v.attrs["max"] = str(v[:].max())
    except ValueError:
        logger.warning("Data is empty")
def nc2zarr(fns, zpath, s3store=True, chunks=None, parallel=True):
    '''
    Convert netCDF files to zarr format and save to a local or s3 store.

    Parameters
    ----------
    fns : list of netCDF file names with full path
    zpath : path to the local or s3 store
    s3store : whether to save to an s3 store, boolean
    chunks : chunks used to read and write data
    parallel : whether to use dask to read files in parallel, boolean
    '''
    # Remove lat/long from the list of variables to be concatenated.
    with xr.open_mfdataset(fns, parallel=parallel, chunks=chunks,
                           combine='nested', concat_dim='time') as ds:
        vns = list(ds.data_vars)
        for vn in ['lat', 'long']:
            if vn in vns:
                vns.remove(vn)
    with xr.open_mfdataset(fns, chunks=chunks, parallel=parallel,
                           data_vars=vns, combine='nested', concat_dim='time') as ds:
        if s3store:
            fs = s3fs.S3FileSystem(anon=False)
            ds_store = s3fs.S3Map(root=zpath, s3=fs, check=False, create=True)
        else:
            ds_store = zpath
        if chunks is not None:
            ds = ds.chunk(chunks=chunks)
        else:
            ds = ds.chunk(chunks={x: ds.chunks[x][0] for x in ds.chunks})
        compressor = zarr.Blosc(cname='zstd', clevel=4)
        encoding = {vname: {'compressor': compressor} for vname in ds.data_vars}
        ds.to_zarr(store=ds_store, encoding=encoding, consolidated=True)
    return
def main(args=None):
    args = args if args is not None else sys.argv[1:]
    if len(args) != 2:
        print(f'Usage: {sys.argv[0]} OUTPUT.zarr (INPUT.nc | INPUT.dir)')
        sys.exit(2)
    output_dir = args[0]
    input_file = args[1]

    if os.path.isdir(input_file):
        # Input is a directory: process its files in random order,
        # re-invoking this script once per file.
        input_dir = input_file
        input_files = list(os.listdir(input_dir))
        random.shuffle(input_files)
        for input_file in input_files:
            print(f'processing {input_file}')
            subprocess.run([sys.executable, sys.argv[0], output_dir,
                            os.path.join(input_dir, input_file)])
        return

    synchronizer = zarr.ProcessSynchronizer(output_dir + '.sync')
    input_ds = xr.open_dataset(input_file, decode_times=False)
    dropped_vars = set(input_ds.data_vars.keys()) - {"analysed_sst", "analysis_error"}
    input_ds = input_ds.drop(dropped_vars)
    if not os.path.isdir(output_dir):
        # First file: create the zarr store with one chunk per variable.
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
        encoding = dict()
        for var_name in input_ds.data_vars:
            new_var = input_ds[var_name]
            chunks = new_var.shape
            encoding[var_name] = {'compressor': compressor, 'chunks': chunks}
        input_ds.to_zarr(output_dir, encoding=encoding, synchronizer=synchronizer)
        print(f'written {input_file} to {output_dir}')
    else:
        # Subsequent files: append along the time dimension.
        # cube_ds = xr.open_zarr(output_dir, synchronizer=synchronizer)
        # cube_ds = xr.concat([cube_ds, input_ds], dim='time')
        # cube_ds.close()
        root_group = zarr.open(output_dir, mode='a', synchronizer=synchronizer)
        for var_name, var_array in root_group.arrays():
            if var_name in input_ds:
                var = input_ds[var_name]
                if 'time' in var.dims:
                    if var_name == 'time':
                        print('time:', var, var.values)
                    axis = var.dims.index('time')
                    # Note: all append operations are forced to be sequential!
                    # See https://github.com/zarr-developers/zarr/issues/75
                    var_array.append(var, axis=axis)
        print(f'appended {input_file} to {output_dir}')
def encode_variables(
    ds: Dataset,
    chunk_length: int,
    chunk_width: int,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[Any] = "uint8",
) -> Dict[Hashable, Dict[str, Any]]:
    encoding = {}
    for v in ds:
        e = {}
        if compressor is not None:
            e.update({"compressor": compressor})
        if v in GT_DATA_VARS:
            e.update({"chunks": (chunk_length, chunk_width) + ds[v].shape[2:]})
        if probability_dtype is not None and v == "call_genotype_probability":
            dtype = np.dtype(probability_dtype)
            # Xarray will decode into float32 so any int greater than
            # 16 bits will cause overflow/underflow
            # See https://en.wikipedia.org/wiki/Floating-point_arithmetic#Internal_representation
            # *bits precision column for single precision floats
            if dtype not in [np.uint8, np.uint16]:  # type: ignore[comparison-overlap]
                raise ValueError("Probability integer dtype invalid, must "
                                 f"be uint8 or uint16 not {probability_dtype}")
            divisor = np.iinfo(dtype).max - 1
            e.update({
                "dtype": probability_dtype,
                "add_offset": -1.0 / divisor,
                "scale_factor": 1.0 / divisor,
                "_FillValue": 0,
            })
        if e:
            encoding[v] = e
    return encoding
def test_rechunk_dataset(tmp_path, shape, source_chunks, target_chunks, max_mem,
                         pass_temp, executor):
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    a = numpy.arange(numpy.prod(shape)).reshape(shape).astype("f4")
    a[-1] = numpy.nan
    ds = xarray.Dataset(
        dict(
            a=xarray.DataArray(a, dims=["x", "y"],
                               attrs={"a1": 1, "a2": [1, 2, 3], "a3": "x"}),
            b=xarray.DataArray(numpy.ones(shape[0]), dims=["x"]),
            c=xarray.DataArray(numpy.ones(shape[1]), dims=["y"]),
        ),
        attrs={"a1": 1, "a2": [1, 2, 3], "a3": "x"},
    )
    ds = ds.chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    encoding = dict(
        a=dict(
            chunks=target_chunks,
            compressor=zarr.Blosc(cname="zstd"),
            dtype="int32",
            scale_factor=0.1,
            _FillValue=-9999,
        ),
        b=dict(chunks=target_chunks[:1]),
    )
    rechunked = api.rechunk_dataset(
        ds,
        encoding=encoding,
        max_mem=max_mem,
        target_store=target_store,
        temp_store=temp_store if pass_temp else None,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)
    rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == encoding["a"]["dtype"]
    assert all(dst.a.values[-1] == encoding["a"]["_FillValue"])

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    assert dst.a.data.chunksize == target_chunks
    assert dst.b.data.chunksize == target_chunks[:1]
    assert dst.c.data.chunksize == source_chunks[1:]
    xarray.testing.assert_equal(ds.compute(), dst.compute())
def write(self, dataset: xr.Dataset, output_path: str, **kwargs):
    compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    encoding = dict()
    for var_name in dataset.data_vars:
        new_var = dataset[var_name]
        # TODO: get chunks from configuration
        chunks = new_var.shape
        encoding[var_name] = {'compressor': compressor, 'chunks': chunks}
    dataset.to_zarr(output_path, encoding=encoding)
def test_rechunk_dataset(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
    executor,
    target_store,
    temp_store,
):
    if target_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    ds = example_dataset(shape).chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(a=dict(
        compressor=zarr.Blosc(cname="zstd"),
        dtype="int32",
        scale_factor=0.1,
        _FillValue=-9999,
    ))
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)
    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == options["a"]["dtype"]
    assert all(dst.a.values[-1] == options["a"]["_FillValue"])
    assert dst.a.encoding["compressor"] is not None

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = (
        target_chunks["a"] if isinstance(target_chunks["a"], tuple)
        else (target_chunks["a"]["x"], target_chunks["a"]["y"]))
    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == source_chunks[1:]
    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
def create_dataset(save_name, save_dir, data_dir, proportions):
    save_name += ".zarr"
    save_path = os.path.join(save_dir, save_name)
    zgroup = zarr.open_group(store=save_path, mode='w', path="/")
    zarr_kwargs = {
        'chunks': (1, 512, 512),
        'compressor': zarr.Blosc(cname='lz4', clevel=9, shuffle=1)
    }
    for i in range(130):
        print("Processing volume {}".format(i))
        volume = sitk.ReadImage(os.path.join(data_dir, "volume-" + str(i) + ".nii"))
        volume_np = sitk.GetArrayFromImage(volume)
        seg = sitk.ReadImage(os.path.join(data_dir, "segmentation-" + str(i) + ".nii"))
        seg_np = sitk.GetArrayFromImage(seg)

        slices = []
        if proportions[0] > 0:
            slices.extend(get_slices(seg_np, target_class=0, exclude_class=[1, 2],
                                     proportion=proportions[0]))
        if proportions[1] > 0:
            slices.extend(get_slices(seg_np, target_class=1, exclude_class=2,
                                     proportion=proportions[1]))
        if proportions[2] > 0:
            slices.extend(get_slices(seg_np, target_class=2, proportion=proportions[2]))
        volume_np = volume_np[slices]
        seg_np = seg_np[slices]
        if len(volume_np) == 0:
            print("WARNING! Skipping empty volume #{}".format(i))
            continue
        print("Saving {} slices".format(volume_np.shape[0]))

        subgroup = zgroup.create_group(str(i))
        subgroup.create_dataset("volume", shape=volume_np.shape, data=volume_np,
                                dtype=np.float32, **zarr_kwargs)
        subgroup.create_dataset("segmentation", shape=seg_np.shape, data=seg_np,
                                dtype=np.int16, **zarr_kwargs)
def __init__(self, data_element_shape, dtype, batch_size, filename, array_name,
             length=None, append=False, kwargs=None):
    import zarr
    super(zarr_array_writer, self).__init__(None, data_element_shape, dtype,
                                            batch_size, length)
    self.filename = filename
    self.array_name = array_name
    self.kwargs = kwargs

    # Set up array kwargs
    self.arr_kwargs = {
        'name': array_name,
        'chunks': (batch_size, ) + data_element_shape,
        'compressor': zarr.Blosc(cname='lz4', clevel=5, shuffle=1),
        'dtype': dtype
    }
    if self.length is None:
        self.arr_kwargs['shape'] = (1, ) + self.data_element_shape
    else:
        self.arr_kwargs['shape'] = (self.length, ) + self.data_element_shape
    if kwargs is not None:
        self.arr_kwargs.update(kwargs)

    # Open the file for writing.
    self.group = None
    self.write_mode = 'a' if append else 'w'
    try:
        self.group = zarr.open_group(filename, self.write_mode)
    except Exception:
        print("Error: failed to open file %s" % filename)
        raise

    # Open an array interface (check if the array exists; if not, create it).
    # NOTE: ds_args is currently unused below.
    if self.length is None:
        ds_args = (self.array_name, (1, ) + self.data_element_shape)
    else:
        ds_args = (self.array_name, (self.length, ) + self.data_element_shape)
    try:
        self.storage_array = self.group[self.array_name]
        self.storage_array_ptr = len(self.storage_array)
    except KeyError:
        self.storage_array = self.group.create_dataset(**self.arr_kwargs)
        self.storage_array_ptr = 0
def test_rechunk_option_compression(rechunk_args):
    def rechunk(compressor):
        options = _wrap_options(rechunk_args["source"],
                                dict(overwrite=True, compressor=compressor))
        rechunked = api.rechunk(**rechunk_args, target_options=options)
        rechunked.execute()
        return sum(file.stat().st_size
                   for file in Path(rechunked._target.store.path).rglob("*"))

    size_uncompressed = rechunk(None)
    size_compressed = rechunk(
        zarr.Blosc(cname="zstd", clevel=9, shuffle=zarr.Blosc.SHUFFLE))
    assert size_compressed < size_uncompressed
def zarrify(x, dest, chunk=512, compression=DEFAULT_COMPRESSION):
    compressor = None
    if compression:
        compressor = zarr.Blosc(**compression)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    z = zarr.open(dest, mode="w", shape=x.shape, chunks=(chunk, chunk, None),
                  dtype="<u2", compressor=compressor)
    z[:] = x
    return z
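# A minimal usage sketch for zarrify, assuming DEFAULT_COMPRESSION is a dict of
# Blosc keyword arguments; the values and paths below are illustrative, not from
# the original source.
import numpy as np

DEFAULT_COMPRESSION = {"cname": "zstd", "clevel": 5, "shuffle": 1}
stack = np.zeros((4, 1024, 1024), dtype="<u2")  # e.g. a small uint16 image stack
zarrify(stack, "out/stack.zarr", chunk=512)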
def test_rechunk_option_compression(rechunked_fn):
    def rechunk(compressor):
        rechunked = rechunked_fn(
            temp_options=dict(overwrite=True, compressor=compressor),
            target_options=dict(overwrite=True, compressor=compressor),
        )
        rechunked.execute()
        return sum(file.stat().st_size
                   for file in Path(rechunked._target.store.path).rglob("*"))

    size_uncompressed = rechunk(None)
    size_compressed = rechunk(
        zarr.Blosc(cname="zstd", clevel=9, shuffle=zarr.Blosc.SHUFFLE))
    assert size_compressed < size_uncompressed
def test_rechunk_dataset_dimchunks(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
):
    temp_store = str(tmp_path / "temp.zarr")
    target_store = str(tmp_path / "target.zarr")

    ds = example_dataset(shape).chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(a=dict(
        compressor=zarr.Blosc(cname="zstd"),
        dtype="int32",
        scale_factor=0.1,
        _FillValue=-9999,
    ))
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
    )
    assert isinstance(rechunked, api.Rechunked)
    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = [
        target_chunks.get("x", source_chunks[0]),
        target_chunks.get("y", source_chunks[1]),
    ]
    if target_chunks_expected[1] < 0 or target_chunks_expected[1] > len(ds.y):
        target_chunks_expected[1] = len(ds.y)
    target_chunks_expected = tuple(target_chunks_expected)
    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == target_chunks_expected[1:]
    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
def save_results(conn, image, data, dataset, path):
    filename, file_extension = os.path.splitext(image.getName())
    # Save the probabilities file as an image
    print("Saving Probabilities as zarr file attached to the original Image")
    name = filename + "_Probabilities_zarr.zip"
    desc = "ilastik probabilities from Image:%s" % image.getId()
    # Re-organise array from tzyxc to zctyx order expected by OMERO
    # data = data.swapaxes(0, 1).swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2)
    namespace = "ilastik.zarr.demo"
    fp = os.path.join(path, name)
    with zarr.ZipStore(fp, mode='w') as store:
        zarr.array(data, store=store, dtype='int16',
                   compressor=zarr.Blosc(cname='zstd'))
    ann = conn.createFileAnnfromLocalFile(fp, mimetype="application/zip",
                                          ns=namespace, desc=desc)
    image.linkAnnotation(ann)
def encode_variables(
    ds: Dataset,
    compressor: Any = zarr.Blosc(cname="zstd", clevel=7, shuffle=2)
) -> Dict[Hashable, Dict[str, Any]]:
    # Set compressor, chunking and floating point encoding
    encoding = {}
    for v in ds:
        e = {"compressor": compressor}
        if v == "call_genotype_probability":
            e.update({
                "dtype": "uint8",
                "add_offset": -1.0 / 254.0,
                "scale_factor": 1.0 / 254.0,
                "_FillValue": 0,
            })
        encoding[v] = e
    return encoding
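# The per-variable encoding returned by either encode_variables variant is meant
# to be passed straight to xarray's to_zarr. A minimal sketch; the dataset `ds`
# and the store path are hypothetical.
encoding = encode_variables(ds)
ds.to_zarr("calls.zarr", mode="w", encoding=encoding)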
def _save_zarr(
    image: Any,
    uri: str,
    compress: bool = True,
    partition: Optional[str] = None,
    # Format-specific kwargs
    compression_type: str = "zstd",
    compression_level: int = 4):
    # image = image.chunk({"i": 1})
    if compress:
        compressor = zarr.Blosc(cname=compression_type, clevel=compression_level)
        encoding = {k: {"compressor": compressor} for k in image.keys()}
    else:
        encoding = {}
    image.to_zarr(uri, consolidated=True, encoding=encoding)
def read_covar_matrix(fname):
    dname_zarr = fname.replace('.mat', '.zarr')
    if not exists(dname_zarr):
        covMat = sio.loadmat(fname)
        names = [
            'fl063_c', 'fl063_l', 'fl064_c', 'fl064_l',
            'fl065_c', 'fl065_l', 'fl068_c', 'fl068_l'
        ]
        covMats = zip(*(names, covMat['CovMat'][0]))

        def to_darray(cmatrix):
            cmatrix = cmatrix.reshape(3, 3, 1024, 1024)
            return xr.DataArray(cmatrix, dims=('i', 'j', 'x', 'y'))

        ds = xr.Dataset({name: to_darray(cm) for name, cm in covMats})
        compressor = zarr.Blosc(cname='zstd', clevel=9, shuffle=2)
        encoding = {v: {'compressor': compressor} for v in list(ds.variables)}
        # Pass the encoding so the Blosc compressor is actually applied.
        ds.to_zarr(dname_zarr, encoding=encoding)
    else:
        ds = xr.open_zarr(dname_zarr)
    return ds
def convert_netcdf_zarr(self, outname19="zarr19", outname37="zarr37"):
    """
    Convert netCDF files into zarr directories for storage in S3

    Parameters
    ----------
    outname19: string (optional)
        name of the directory to store the 19H file
    outname37: string (optional)
        name of the directory to store the 37H file

    Returns
    -------
    dict: {outname19: zarr_obj, outname37: zarr_obj}
        dictionary with filename as key and the zarr object generated as the value
    """
    ds19 = xarray.open_dataset(self.outfile_19)
    ds37 = xarray.open_dataset(self.outfile_37)
    compressor = zarr.Blosc(cname="zstd", clevel=3)
    encoding19 = {vname: {"compressor": compressor} for vname in ds19.data_vars}
    encoding37 = {vname: {"compressor": compressor} for vname in ds37.data_vars}
    self.zarr19 = ds19.to_zarr(store=outname19, encoding=encoding19, consolidated=True)
    self.zarr37 = ds37.to_zarr(store=outname37, encoding=encoding37, consolidated=True)
    return {outname19: self.zarr19, outname37: self.zarr37}
def _get_write_encodings(cls, dataset, compressor, chunksizes, packing):
    encoding = None
    if chunksizes:
        encoding = {}
        for var_name in dataset.data_vars:
            var = dataset[var_name]
            chunks: List[int] = []
            for i in range(len(var.dims)):
                dim_name = var.dims[i]
                if dim_name in chunksizes:
                    chunks.append(chunksizes[dim_name])
                else:
                    chunks.append(var.shape[i])
            encoding[var_name] = dict(chunks=chunks)
    if packing:
        if encoding:
            for var_name in packing.keys():
                if var_name in encoding.keys():
                    encoding[var_name].update(dict(packing[var_name]))
                else:
                    encoding[var_name] = dict(packing[var_name])
        else:
            encoding = {}
            for var_name in packing.keys():
                encoding[var_name] = dict(packing[var_name])
    if compressor:
        compressor = zarr.Blosc(**compressor)
        if encoding:
            for var_name in encoding.keys():
                encoding[var_name].update(compressor=compressor)
        else:
            encoding = {var_name: dict(compressor=compressor)
                        for var_name in dataset.data_vars}
    return encoding
def compress_and_save_to_zarr(ds, mode='a'):
    # Chunk the array thinking about how we access spatial slices.
    # Here we have assumed we don't use forecast time slices or many step slices.
    chunk_dict = {'time': 1, 'step': 10, 'y': -1, 'x': -1}
    if 'isobaricInhPa' in ds.dims:
        chunk_dict['isobaricInhPa'] = 5
    ds = ds.chunk(chunk_dict)
    if mode == 'w':
        # If this is the first addition to the zarr file, use this branch.
        # This encoding gives good compression and was as fast to load as any other
        # compression level. It also took a reasonably short amount of time to encode
        # compared to level 9, at the cost of only a couple of percent more stored data.
        encoding = {
            var_name: {
                'filters': [zarr.Delta(dtype='float32')],
                'compressor': zarr.Blosc(cname='zstd', clevel=4,
                                         shuffle=zarr.Blosc.AUTOSHUFFLE)
            }
            for var_name in ds.keys()
        }
        gcsmap = gcsfs.mapping.GCSMap(gcssavepath, gcs=fs, check=False, create=True)
        ds.to_zarr(store=gcsmap, consolidated=True, encoding=encoding)
    elif mode == 'a':
        # If we are appending to an existing zarr file, use this branch.
        gcsmap = gcsfs.mapping.GCSMap(gcssavepath, gcs=fs, check=True, create=False)
        ds.to_zarr(store=gcsmap, append_dim='time', consolidated=True)
    else:
        raise ValueError
def bgen_to_zarr(
    input_path_bgen: str,
    input_path_variants: str,
    input_path_samples: str,
    output_path: str,
    contig_name: str,
    contig_index: int,
    max_mem: str = "500MB",  # per-worker
    remote: bool = True,
    region: Optional[Tuple[int, int]] = None,
):
    """Convert UKB BGEN to Zarr"""
    paths = BGENPaths(
        bgen_path=input_path_bgen,
        variants_path=input_path_variants,
        samples_path=input_path_samples,
    )
    contig = Contig(name=contig_name, index=contig_index)
    ds = load_bgen(paths, contig, region=region)
    # Chosen with expected shape across all chroms (~128MB chunks):
    # normalize_chunks('auto', shape=(97059328, 487409), dtype='float32')
    chunks = (5216, 5792)
    ds = rechunk_dataset(
        ds,
        output=output_path,
        contig=contig,
        fn=rechunk_bgen,
        chunks=chunks,
        max_mem=max_mem,
        remote=remote,
        compressor=zarr.Blosc(cname="zstd", clevel=7, shuffle=2, blocksize=0),
        probability_dtype="uint8",
        pack=True,
    )
    logger.info("Done")
MODELS = {
    "AZFP": {
        "ext": ".01A",
        "xml": True,
        "parser": ParseAZFP,
        "set_groups": SetGroupsAZFP,
    },
    "EK60": {"ext": ".raw", "xml": False, "parser": ParseEK60, "set_groups": SetGroupsEK60},
    "EK80": {"ext": ".raw", "xml": False, "parser": ParseEK80, "set_groups": SetGroupsEK80},
    "EA640": {"ext": ".raw", "xml": False, "parser": ParseEK80, "set_groups": SetGroupsEK80},
}

COMPRESSION_SETTINGS = {
    'netcdf4': {'zlib': True, 'complevel': 4},
    'zarr': {'compressor': zarr.Blosc(cname='zstd', clevel=3, shuffle=2)},
}

DEFAULT_CHUNK_SIZE = {'range_bin': 25000, 'ping_time': 2500}

NMEA_SENTENCE_DEFAULT = ["GGA", "GLL", "RMC"]


def _normalize_path(out_f, convert_type, output_storage_options):
    if convert_type == "zarr":
        return fsspec.get_mapper(out_f, **output_storage_options)
    elif convert_type == "netcdf4":
        return out_f


def _validate_path(
def set_nmea(self, nmea_dict):
    """Set the Platform/NMEA group in the nc file.

    Parameters
    ----------
    nmea_dict
        dictionary containing platform parameters
    """
    # Only save platform group if file_path exists
    save_path = nmea_dict['path'] if 'path' in nmea_dict else self.file_path
    if not os.path.exists(save_path):
        print('netCDF file does not exist, exiting without saving Platform group...')
    else:
        # Convert np.datetime64 numbers to seconds since 1900-01-01
        # due to xarray.to_netcdf() error on encoding np.datetime64 objects directly
        time = (nmea_dict['nmea_time'] - np.datetime64('1900-01-01T00:00:00')) \
            / np.timedelta64(1, 's')
        ds = xr.Dataset(
            {
                'NMEA_datagram': (['time'], nmea_dict['nmea_datagram'],
                                  {'long_name': 'NMEA datagram'})
            },
            coords={
                'time': (['time'], time, {
                    'axis': 'T',
                    'calendar': 'gregorian',
                    'long_name': 'Timestamps for NMEA datagrams',
                    'standard_name': 'time',
                    'units': 'seconds since 1900-01-01'
                })
            },
            attrs={'description': 'All NMEA sensor datagrams'})

        # Splits up the time dimension. Used for when range bin length varies with time.
        if 'ping_slice' in nmea_dict:
            # Slice using ping_time, which does not map perfectly onto nmea_time.
            # Rounds ping_time slice values to the nearest nmea_time.
            lower = (nmea_dict['ping_slice'][0] - np.datetime64('1900-01-01T00:00:00')) \
                / np.timedelta64(1, 's')
            lower = time[(np.abs(time - lower)).argmin()]
            upper = (nmea_dict['ping_slice'][-1] - np.datetime64('1900-01-01T00:00:00')) \
                / np.timedelta64(1, 's')
            upper = time[(np.abs(time - upper)).argmin()]
            ds = ds.sel(time=slice(lower, upper))

        # Configure compression settings
        nc_encoding = {}
        zarr_encoding = {}
        if self.compress:
            nc_settings = dict(zlib=True, complevel=4)
            nc_encoding = {var: nc_settings for var in ds.data_vars}
            zarr_settings = dict(compressor=zarr.Blosc(cname='zstd', clevel=3, shuffle=2))
            zarr_encoding = {var: zarr_settings for var in ds.data_vars}

        # Save to file
        if self.format == '.nc':
            ds.to_netcdf(path=save_path, mode='a', group='Platform/NMEA',
                         encoding=nc_encoding)
        elif self.format == '.zarr':
            if not self.append_zarr:
                ds.to_zarr(store=save_path, mode='a', group='Platform/NMEA',
                           encoding=zarr_encoding)
            else:
                ds.to_zarr(store=save_path, mode='a', group='Platform/NMEA',
                           append_dim='time')
FORMAT = 'netcdf'
PRODUCT = 'reanalysis-era5-single-levels'
VARIABLE = ['total_precipitation']
TYPE = 'reanalysis'
# TYPE = 'ensemble_members'
MONTH = [str(i + 1).zfill(2) for i in range(12)]
DAY = [str(i + 1).zfill(2) for i in range(31)]
TIME = ['{}:00'.format(i).zfill(5) for i in range(24)]
START_YEAR = 2018
END_YEAR = 2018
DTYPE = 'float32'
CHUNKS = {'time': -1, 'latitude': 16, 'longitude': 16}
GEN_FLOAT_ENCODING = {
    'dtype': DTYPE,
    'compressor': zarr.Blosc(cname='lz4', clevel=9)
}
ENCODING = {'precipitation': GEN_FLOAT_ENCODING}


def get_url(year, month):
    cds_client = cdsapi.Client()
    query = {
        'variable': VARIABLE,
        'product_type': TYPE,
        'year': year,
        'month': month,
        'day': DAY,
        'time': TIME,
        'format': FORMAT
    }
import logging

logging_format = '%(asctime)s - %(name)s - %(message)s'
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO, format=logging_format,
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("ProcessData")

# radarBase = "/wave/mlp/cwb-ci/Radar/raw/"
# satBase = "/wave/mlp/cwb-ci/Satellite/raw/"
radarBase = "../../data/cwb-ci/Radar/raw/"
satBase = "../../data/cwb-ci/Satellite/raw/"

global compressor
compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
synchronizer = zarr.ProcessSynchronizer('example.sync')


def bilinear_resize(image, height, width):
    """
    `image` is a 2-D numpy array
    `height` and `width` are the desired spatial dimensions of the new 2-D array.
    """
    img_height, img_width = image.shape
    image = image.ravel()
    x_ratio = float(img_width - 1) / (width - 1) if width > 1 else 0
    y_ratio = float(img_height - 1) / (height - 1) if height > 1 else 0
from typing import TYPE_CHECKING, Optional

import zarr
from datatree import DataTree

# fmt: off
# black and isort have conflicting ideas about how this should be formatted
from ..core import SONAR_MODELS

if TYPE_CHECKING:
    from ..core import EngineHint, PathHint, SonarModelsHint
# fmt: on

from ..echodata.echodata import XARRAY_ENGINE_MAP, EchoData
from ..utils import io

COMPRESSION_SETTINGS = {
    "netcdf4": {"zlib": True, "complevel": 4},
    "zarr": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)},
}

DEFAULT_CHUNK_SIZE = {"range_sample": 25000, "ping_time": 2500}

NMEA_SENTENCE_DEFAULT = ["GGA", "GLL", "RMC"]

BEAM_SUBGROUP_DEFAULT = "Beam_group1"


def to_file(
    echodata: EchoData,
    engine: "EngineHint",
    save_path: Optional["PathHint"] = None,
    compress: bool = True,
    overwrite: bool = False,
def set_beam(self, beam_dict):
    """Set the Beam group in the AZFP nc file.

    Parameters
    ----------
    beam_dict
        dictionary containing general beam parameters
    """
    ds = xr.Dataset(
        {
            'backscatter_r': (['frequency', 'ping_time', 'range_bin'],
                              beam_dict['backscatter_r']),
            'equivalent_beam_angle': (['frequency'], beam_dict['EBA']),
            'gain_correction': (['frequency'], beam_dict['gain_correction']),
            'sample_interval': (['frequency'], beam_dict['sample_interval'],
                                {'units': 's'}),
            'transmit_duration_nominal': (['frequency'],
                                          beam_dict['transmit_duration_nominal'], {
                                              'long_name': 'Nominal bandwidth of transmitted pulse',
                                              'units': 's',
                                              'valid_min': 0.0
                                          }),
            'temperature_counts': (['ping_time'], beam_dict['temperature_counts']),
            'tilt_x_count': (['ping_time'], beam_dict['tilt_x_count']),
            'tilt_y_count': (['ping_time'], beam_dict['tilt_y_count']),
            'tilt_x': (['ping_time'], beam_dict['tilt_x']),
            'tilt_y': (['ping_time'], beam_dict['tilt_y']),
            'cos_tilt_mag': (['ping_time'], beam_dict['cos_tilt_mag']),
            'DS': (['frequency'], beam_dict['DS']),
            'EL': (['frequency'], beam_dict['EL']),
            'TVR': (['frequency'], beam_dict['TVR']),
            'VTX': (['frequency'], beam_dict['VTX']),
            'Sv_offset': (['frequency'], beam_dict['Sv_offset']),
            'number_of_samples_digitized_per_pings': (['frequency'],
                                                      beam_dict['range_samples']),
            'number_of_digitized_samples_averaged_per_pings': (
                ['frequency'], beam_dict['range_averaging_samples'])
        },
        coords={
            'frequency': (['frequency'], beam_dict['frequency'],
                          {'units': 'Hz', 'valid_min': 0.0}),
            'ping_time': (['ping_time'], beam_dict['ping_time'], {
                'axis': 'T',
                'calendar': 'gregorian',
                'long_name': 'Timestamp of each ping',
                'standard_name': 'time',
                'units': 'seconds since 1970-01-01'
            }),
            'range_bin': (['range_bin'], beam_dict['range_bin'])
        },
        attrs={
            'beam_mode': '',
            'conversion_equation_t': 'type_4',
            'number_of_frequency': beam_dict['number_of_frequency'],
            'number_of_pings_per_burst': beam_dict['number_of_pings_per_burst'],
            'average_burst_pings_flag': beam_dict['average_burst_pings_flag'],
            # Temperature coefficients
            'temperature_ka': beam_dict['temperature_ka'],
            'temperature_kb': beam_dict['temperature_kb'],
            'temperature_kc': beam_dict['temperature_kc'],
            'temperature_A': beam_dict['temperature_A'],
            'temperature_B': beam_dict['temperature_B'],
            'temperature_C': beam_dict['temperature_C'],
            # Tilt coefficients
            'tilt_X_a': beam_dict['tilt_X_a'],
            'tilt_X_b': beam_dict['tilt_X_b'],
            'tilt_X_c': beam_dict['tilt_X_c'],
            'tilt_X_d': beam_dict['tilt_X_d'],
            'tilt_Y_a': beam_dict['tilt_Y_a'],
            'tilt_Y_b': beam_dict['tilt_Y_b'],
            'tilt_Y_c': beam_dict['tilt_Y_c'],
            'tilt_Y_d': beam_dict['tilt_Y_d']
        })

    # Configure compression settings
    n_settings = {}
    z_settings = {}
    if self.compress:
        n_settings = {'backscatter_r': {'zlib': True, 'complevel': 4}}
        z_settings = {
            'backscatter_r': {
                'compressor': zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
            }
        }

    # Save to file
    if self.format == '.nc':
        ds.to_netcdf(path=self.file_path, mode='a', group='Beam',
                     encoding=n_settings)
    elif self.format == '.zarr':
        if not self.append_zarr:
            ds.to_zarr(store=self.file_path, mode='a', group='Beam',
                       encoding=z_settings)
        else:
            ds.to_zarr(store=self.file_path, mode='a', group='Beam',
                       append_dim='ping_time')