Example #1
def save_zarr(id_patient, lung_mask, cand):
    lung_mask_group.array(id_patient, lung_mask, 
            chunks=(1, 17, 21, 21), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    cand_group.array(id_patient, cand, 
            chunks=(1, 17, 21, 21), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    return
Example #2
def save_zarr(id_patient, lung_mask, nodule_mask):
    lung_mask_group.array(id_patient, lung_mask, 
            chunks=(10, 1, 512, 512), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    nodule_mask_group.array(id_patient, nodule_mask, 
            chunks=(10, 1, 512, 512), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    return
Example #3
File: dsio.py  Project: sfoucher/xcube
    def _get_write_encodings(cls, dataset, compress, cname, clevel, shuffle,
                             blocksize, chunksizes):
        encoding = None
        if chunksizes:
            encoding = {}
            for var_name in dataset.data_vars:
                var = dataset[var_name]
                chunks: List[int] = []
                for i in range(len(var.dims)):
                    dim_name = var.dims[i]
                    if dim_name in chunksizes:
                        chunks.append(chunksizes[dim_name])
                    else:
                        chunks.append(var.shape[i])
                encoding[var_name] = dict(chunks=chunks)
        if compress:
            blosc_kwargs = dict(cname=cname,
                                clevel=clevel,
                                shuffle=shuffle,
                                blocksize=blocksize)
            for k in list(blosc_kwargs.keys()):
                if blosc_kwargs[k] is None:
                    del blosc_kwargs[k]
            compressor = zarr.Blosc(**blosc_kwargs)

            if encoding:
                for var_name in encoding.keys():
                    encoding[var_name].update(compressor=compressor)
            else:
                encoding = {
                    var_name: dict(compressor=compressor)
                    for var_name in dataset.data_vars
                }
        return encoding
Example #4
    def create_variable_zarr(
        self,
        handler_zarr,
        kwargs_variable,
        attr_variable,
        data,
        scale_factor=None,
        add_offset=None,
        filters=None,
        compressor=None,
    ):
        kwargs_variable["shape"] = data.shape
        kwargs_variable["compressor"] = (zarr.Blosc(cname="zstd", clevel=2)
                                         if compressor is None else compressor)
        kwargs_variable["filters"] = list()
        store_dtype = kwargs_variable.pop("store_dtype", None)
        if scale_factor is not None or add_offset is not None:
            if add_offset is None:
                add_offset = 0
            kwargs_variable["filters"].append(
                zarr.FixedScaleOffset(
                    offset=float64(add_offset),
                    scale=1 / float64(scale_factor),
                    dtype=kwargs_variable["dtype"],
                    astype=store_dtype,
                ))
        if filters is not None:
            kwargs_variable["filters"].extend(filters)
        dims = kwargs_variable.get("dimensions", None)
        # Manage chunk in 2d case
        if len(dims) == 1:
            kwargs_variable["chunks"] = (2500000, )
        if len(dims) == 2:
            second_dim = data.shape[1]
            kwargs_variable["chunks"] = (200000, second_dim)

        kwargs_variable.pop("dimensions")
        v = handler_zarr.create_dataset(**kwargs_variable)
        attrs = list(attr_variable.keys())
        attrs.sort()
        for attr in attrs:
            attr_value = attr_variable[attr]
            v.attrs[attr] = str(attr_value)
        if self.raw_data:
            if scale_factor is not None:
                s_bloc = kwargs_variable["chunks"][0]
                nb_bloc = int(ceil(data.shape[0] / s_bloc))
                for i in range(nb_bloc):
                    sl = slice(i * s_bloc, (i + 1) * s_bloc)
                    v[sl] = data[sl] * scale_factor + add_offset
            else:
                v[:] = data
        if not self.raw_data:
            v[:] = data
        try:
            if v.size < 1e8:
                v.attrs["min"] = str(v[:].min())
                v.attrs["max"] = str(v[:].max())
        except ValueError:
            logger.warning("Data is empty")
Example #5
def nc2zarr(fns,zpath,s3store=True,chunks=None,parallel=True):
    '''
    Convert netcdf files to zarr format and save to local or s3 store
    
    Parameters
    ----------
    fns     : a list of netcdf file names with full path
    zpath   : path to the local or s3 store
    s3store : flag of whether to save to s3 store, boolean
    chunks  : chunks used to read and write data
    parallel: flag to use dask to read files in parallel, boolean
    '''
    # --- remove lat/long from the list of vars to be concatenated.
    with xr.open_mfdataset(fns,parallel=True,chunks=chunks,combine='nested',concat_dim='time') as ds:
        vns = list(ds.data_vars)
    for vn in ['lat','long']:
        if vn in vns: vns.remove(vn)    
        
    with xr.open_mfdataset(fns,chunks=chunks,parallel=parallel, data_vars=vns,combine='nested',concat_dim='time') as ds:
        if s3store:
            fs = s3fs.S3FileSystem(anon=False)
            ds_store = s3fs.S3Map(root=zpath,s3=fs,check=False,create=True)
        else:
            ds_store = zpath
        if chunks is not None: 
            ds = ds.chunk(chunks=chunks) 
        else:
            ds = ds.chunk(chunks={x:ds.chunks[x][0] for x in ds.chunks})
        compressor = zarr.Blosc(cname='zstd', clevel=4)
        encoding = {vname: {'compressor': compressor} for vname in ds.data_vars}
        ds.to_zarr(store=ds_store,encoding=encoding,consolidated=True) 
        
    return 
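A minimal usage sketch for nc2zarr; the file names, store path, and chunk sizes below are placeholders rather than anything from the original project.

# Hypothetical inputs; adjust file names, store path and chunk sizes to your data.
fns = ["/data/forcing_2000.nc", "/data/forcing_2001.nc"]

# Write to a local Zarr store (s3store=False); for S3, pass the bucket key as zpath.
nc2zarr(fns, "/data/forcing.zarr", s3store=False, chunks={"time": 24})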
Example #6
def main(args=None):
    args = args if args is not None else sys.argv[1:]
    if len(args) != 2:
        print(f'Usage: {sys.argv[0]} OUTPUT.zarr (INPUT.nc | INPUT.dir)')
        sys.exit(2)

    output_dir = args[0]
    input_file = args[1]

    if os.path.isdir(input_file):
        input_dir = input_file
        input_files = list(os.listdir(input_dir))
        # Shuffle files
        for i in range(len(input_files)):
            i1 = random.randint(0, len(input_files) - 1)
            i2 = random.randint(0, len(input_files) - 1)
            t = input_files[i1]
            input_files[i1] = input_files[i2]
            input_files[i2] = t
        for input_file in input_files:
            print(f'processing {input_file}')
            subprocess.run([
                sys.executable, sys.argv[0], output_dir,
                os.path.join(input_dir, input_file)
            ])
        return

    synchronizer = zarr.ProcessSynchronizer(output_dir + '.sync')
    input_ds = xr.open_dataset(input_file, decode_times=False)
    dropped_vars = set(
        input_ds.data_vars.keys()) - {"analysed_sst", "analysis_error"}
    input_ds = input_ds.drop(dropped_vars)

    if not os.path.isdir(output_dir):
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
        encoding = dict()
        for var_name in input_ds.data_vars:
            new_var = input_ds[var_name]
            chunks = new_var.shape
            encoding[var_name] = {'compressor': compressor, 'chunks': chunks}
        input_ds.to_zarr(output_dir,
                         encoding=encoding,
                         synchronizer=synchronizer)
        print(f'written {input_file} to {output_dir}')
    else:
        # cube_ds = xr.open_zarr(output_dir, synchronizer=synchronizer)
        # cube_ds = xr.concat([cube_ds, input_ds], dim='time')
        # cube_ds.close()
        root_group = zarr.open(output_dir, mode='a', synchronizer=synchronizer)
        for var_name, var_array in root_group.arrays():
            if var_name in input_ds:
                var = input_ds[var_name]
                if 'time' in var.dims:
                    if var_name == 'time':
                        print('time:', var, var.values)
                    axis = var.dims.index('time')
                    # Note: all append operations are forced to be sequential!
                    # See https://github.com/zarr-developers/zarr/issues/75
                    var_array.append(var, axis=axis)
        print(f'appended {input_file} to {output_dir}')
Example #7
def encode_variables(
    ds: Dataset,
    chunk_length: int,
    chunk_width: int,
    compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
    probability_dtype: Optional[Any] = "uint8",
) -> Dict[Hashable, Dict[str, Any]]:
    encoding = {}
    for v in ds:
        e = {}
        if compressor is not None:
            e.update({"compressor": compressor})
        if v in GT_DATA_VARS:
            e.update({"chunks": (chunk_length, chunk_width) + ds[v].shape[2:]})
        if probability_dtype is not None and v == "call_genotype_probability":
            dtype = np.dtype(probability_dtype)
            # Xarray will decode into float32 so any int greater than
            # 16 bits will cause overflow/underflow
            # See https://en.wikipedia.org/wiki/Floating-point_arithmetic#Internal_representation
            # *bits precision column for single precision floats
            if dtype not in [np.uint8,
                             np.uint16]:  # type: ignore[comparison-overlap]
                raise ValueError("Probability integer dtype invalid, must "
                                 f"be uint8 or uint16 not {probability_dtype}")
            divisor = np.iinfo(dtype).max - 1
            e.update({
                "dtype": probability_dtype,
                "add_offset": -1.0 / divisor,
                "scale_factor": 1.0 / divisor,
                "_FillValue": 0,
            })
        if e:
            encoding[v] = e
    return encoding
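A usage sketch for the encode_variables helper above; the dataset, chunk sizes, and output path are placeholders, and GT_DATA_VARS is assumed to be defined alongside the function as in its home project.

# Build per-variable Zarr encodings and hand them to xarray (ds is a placeholder Dataset).
encoding = encode_variables(ds, chunk_length=10_000, chunk_width=1_000)
ds.to_zarr("genotypes.zarr", mode="w", encoding=encoding)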
Example #8
def test_rechunk_dataset(tmp_path, shape, source_chunks, target_chunks,
                         max_mem, pass_temp, executor):
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    a = numpy.arange(numpy.prod(shape)).reshape(shape).astype("f4")
    a[-1] = numpy.nan
    ds = xarray.Dataset(
        dict(
            a=xarray.DataArray(a,
                               dims=["x", "y"],
                               attrs={
                                   "a1": 1,
                                   "a2": [1, 2, 3],
                                   "a3": "x"
                               }),
            b=xarray.DataArray(numpy.ones(shape[0]), dims=["x"]),
            c=xarray.DataArray(numpy.ones(shape[1]), dims=["y"]),
        ),
        attrs={
            "a1": 1,
            "a2": [1, 2, 3],
            "a3": "x"
        },
    )
    ds = ds.chunk(chunks=dict(zip(["x", "y"], source_chunks)))
    encoding = dict(
        a=dict(
            chunks=target_chunks,
            compressor=zarr.Blosc(cname="zstd"),
            dtype="int32",
            scale_factor=0.1,
            _FillValue=-9999,
        ),
        b=dict(chunks=target_chunks[:1]),
    )
    rechunked = api.rechunk_dataset(
        ds,
        encoding=encoding,
        max_mem=max_mem,
        target_store=target_store,
        temp_store=temp_store if pass_temp else None,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)
    rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == encoding["a"]["dtype"]
    assert all(dst.a.values[-1] == encoding["a"]["_FillValue"])

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    assert dst.a.data.chunksize == target_chunks
    assert dst.b.data.chunksize == target_chunks[:1]
    assert dst.c.data.chunksize == source_chunks[1:]
    xarray.testing.assert_equal(ds.compute(), dst.compute())
Example #9
    def write(self, dataset: xr.Dataset, output_path: str, **kwargs):
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
        encoding = dict()
        for var_name in dataset.data_vars:
            new_var = dataset[var_name]
            # TODO: get chunks from configuration
            chunks = new_var.shape
            encoding[var_name] = {'compressor': compressor, 'chunks': chunks}
        dataset.to_zarr(output_path, encoding=encoding)
Example #10
def test_rechunk_dataset(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
    executor,
    target_store,
    temp_store,
):
    if target_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    ds = example_dataset(shape).chunk(
        chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(a=dict(
        compressor=zarr.Blosc(cname="zstd"),
        dtype="int32",
        scale_factor=0.1,
        _FillValue=-9999,
    ))
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)
    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate encoded variables
    dst = xarray.open_zarr(target_store, decode_cf=False)
    assert dst.a.dtype == options["a"]["dtype"]
    assert all(dst.a.values[-1] == options["a"]["_FillValue"])
    assert dst.a.encoding["compressor"] is not None

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = (target_chunks["a"] if isinstance(
        target_chunks["a"], tuple) else (target_chunks["a"]["x"],
                                         target_chunks["a"]["y"]))
    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == source_chunks[1:]
    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
Example #11
def create_dataset(save_name, save_dir, data_dir, proportions):
    save_name += ".zarr"
    save_path = os.path.join(save_dir, save_name)

    zgroup = zarr.open_group(store=save_path, mode='w', path="/")
    zarr_kwargs = {
        'chunks': (1, 512, 512),
        'compressor': zarr.Blosc(cname='lz4', clevel=9, shuffle=1)
    }

    for i in range(130):
        print("Processing volume {}".format(i))
        volume = sitk.ReadImage(
            os.path.join(data_dir, "volume-" + str(i) + ".nii"))
        volume_np = sitk.GetArrayFromImage(volume)
        seg = sitk.ReadImage(
            os.path.join(data_dir, "segmentation-" + str(i) + ".nii"))
        seg_np = sitk.GetArrayFromImage(seg)

        slices = []
        if proportions[0] > 0:
            slices.extend(
                get_slices(seg_np,
                           target_class=0,
                           exclude_class=[1, 2],
                           proportion=proportions[0]))
        if proportions[1] > 0:
            slices.extend(
                get_slices(seg_np,
                           target_class=1,
                           exclude_class=2,
                           proportion=proportions[1]))
        if proportions[2] > 0:
            slices.extend(
                get_slices(seg_np, target_class=2, proportion=proportions[2]))

        volume_np = volume_np[slices]
        seg_np = seg_np[slices]

        if len(volume_np) == 0:
            print("WARNING! Skipping empty volume #{}".format(i))
            continue

        print("Saving {} slices".format(volume_np.shape[0]))
        subgroup = zgroup.create_group(str(i))
        subgroup.create_dataset("volume",
                                shape=volume_np.shape,
                                data=volume_np,
                                dtype=np.float32,
                                **zarr_kwargs)
        subgroup.create_dataset("segmentation",
                                shape=seg_np.shape,
                                data=seg_np,
                                dtype=np.int16,
                                **zarr_kwargs)
Example #12
    def __init__(self,
                 data_element_shape,
                 dtype,
                 batch_size,
                 filename,
                 array_name,
                 length=None,
                 append=False,
                 kwargs=None):
        import zarr
        super(zarr_array_writer, self).__init__(None, data_element_shape,
                                                dtype, batch_size, length)
        self.filename = filename
        self.array_name = array_name
        self.kwargs = kwargs

        # Set up array kwargs
        self.arr_kwargs = {
            'name': array_name,
            'chunks': (batch_size, ) + data_element_shape,
            'compressor': zarr.Blosc(cname='lz4', clevel=5, shuffle=1),
            'dtype': dtype
        }
        if self.length is None:
            self.arr_kwargs['shape'] = (1, ) + self.data_element_shape
        else:
            self.arr_kwargs['shape'] = (
                self.length, ) + self.data_element_shape
        if kwargs is not None:
            self.arr_kwargs.update(kwargs)

        # Open the file for writing.
        self.group = None
        if append:
            self.write_mode = 'a'
        else:
            self.write_mode = 'w'
        try:
            self.group = zarr.open_group(filename, self.write_mode)
        except:
            print("Error: failed to open file %s" % filename)
            raise

        # Open an array interface (check if the array exists; if not, create it)
        if self.length is None:
            ds_args = (self.array_name, (1, ) + self.data_element_shape)
        else:
            ds_args = (self.array_name,
                       (self.length, ) + self.data_element_shape)
        try:
            self.storage_array = self.group[self.array_name]
            self.storage_array_ptr = len(self.storage_array)
        except KeyError:
            self.storage_array = self.group.create_dataset(**self.arr_kwargs)
            self.storage_array_ptr = 0
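A hedged construction sketch for the writer above; the buffered-array-writer base class is not shown in the snippet, so only the constructor arguments are taken from it and all values are hypothetical.

# One 256x256 float32 element per sample, buffered in batches of 8,
# with a known total length of 1000 samples.
writer = zarr_array_writer(data_element_shape=(256, 256),
                           dtype='float32',
                           batch_size=8,
                           filename='features.zarr',
                           array_name='images',
                           length=1000)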
Example #13
def test_rechunk_option_compression(rechunk_args):
    def rechunk(compressor):
        options = _wrap_options(rechunk_args["source"],
                                dict(overwrite=True, compressor=compressor))
        rechunked = api.rechunk(**rechunk_args, target_options=options)
        rechunked.execute()
        return sum(file.stat().st_size
                   for file in Path(rechunked._target.store.path).rglob("*"))

    size_uncompressed = rechunk(None)
    size_compressed = rechunk(
        zarr.Blosc(cname="zstd", clevel=9, shuffle=zarr.Blosc.SHUFFLE))
    assert size_compressed < size_uncompressed
Example #14
def zarrify(x, dest, chunk=512, compression=DEFAULT_COMPRESSION):
    compressor = None
    if compression:
        compressor = zarr.Blosc(**compression)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    z = zarr.open(dest,
                  mode="w",
                  shape=x.shape,
                  chunks=(chunk, chunk, None),
                  dtype="<u2",
                  compressor=compressor)
    z[:] = x
    return z
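DEFAULT_COMPRESSION is not shown in the snippet; since it is unpacked into zarr.Blosc, it is presumably a dict of Blosc keyword arguments. A hedged usage sketch with made-up values:

import numpy as np

# Assumed shape of the default compression settings (Blosc kwargs).
DEFAULT_COMPRESSION = {"cname": "zstd", "clevel": 5, "shuffle": 1}

# Random 16-bit data matching the "<u2" dtype used by zarrify.
x = np.random.randint(0, 2**16, size=(1024, 1024, 3), dtype="uint16")
z = zarrify(x, "/tmp/example/stack.zarr", chunk=512, compression=DEFAULT_COMPRESSION)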
Example #15
def test_rechunk_option_compression(rechunked_fn):
    def rechunk(compressor):
        rechunked = rechunked_fn(
            temp_options=dict(overwrite=True, compressor=compressor),
            target_options=dict(overwrite=True, compressor=compressor),
        )
        rechunked.execute()
        return sum(file.stat().st_size
                   for file in Path(rechunked._target.store.path).rglob("*"))

    size_uncompressed = rechunk(None)
    size_compressed = rechunk(
        zarr.Blosc(cname="zstd", clevel=9, shuffle=zarr.Blosc.SHUFFLE))
    assert size_compressed < size_uncompressed
Example #16
def test_rechunk_dataset_dimchunks(
    tmp_path,
    shape,
    source_chunks,
    target_chunks,
    max_mem,
):
    temp_store = "temp.zarr"
    target_store = "target.zarr"
    target_store = str(tmp_path / target_store)
    temp_store = str(tmp_path / temp_store)

    ds = example_dataset(shape).chunk(
        chunks=dict(zip(["x", "y"], source_chunks)))
    options = dict(a=dict(
        compressor=zarr.Blosc(cname="zstd"),
        dtype="int32",
        scale_factor=0.1,
        _FillValue=-9999,
    ))
    rechunked = api.rechunk(
        ds,
        target_chunks=target_chunks,
        max_mem=max_mem,
        target_store=target_store,
        target_options=options,
        temp_store=temp_store,
    )
    assert isinstance(rechunked, api.Rechunked)
    with dask.config.set(scheduler="single-threaded"):
        rechunked.execute()

    # Validate decoded variables
    dst = xarray.open_zarr(target_store, decode_cf=True)
    target_chunks_expected = [
        target_chunks.get("x", source_chunks[0]),
        target_chunks.get("y", source_chunks[1]),
    ]
    if target_chunks_expected[1] < 0 or target_chunks_expected[1] > len(ds.y):
        target_chunks_expected[1] = len(ds.y)

    target_chunks_expected = tuple(target_chunks_expected)

    assert dst.a.data.chunksize == target_chunks_expected
    assert dst.b.data.chunksize == target_chunks_expected[:1]
    assert dst.c.data.chunksize == target_chunks_expected[1:]

    xarray.testing.assert_equal(ds.compute(), dst.compute())
    assert ds.attrs == dst.attrs
Example #17
def save_results(conn, image, data, dataset, path):
    filename, file_extension = os.path.splitext(image.getName())
    # Save the probabilities file as an image
    print("Saving Probabilities as zarr file attached to the original Image")
    name = filename + "_Probabilities_zarr.zip"
    desc = "ilastik probabilities from Image:%s" % image.getId()
    # Re-organise array from tzyxc to zctyx order expected by OMERO
    # data = data.swapaxes(0, 1).swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2)
    namespace = "ilastik.zarr.demo"
    fp = os.path.join(path, name)
    with zarr.ZipStore(fp, mode='w') as store:
        zarr.array(data, store=store, dtype='int16',
                   compressor=zarr.Blosc(cname='zstd'))
    ann = conn.createFileAnnfromLocalFile(fp, mimetype="application/zip",
                                          ns=namespace, desc=desc)
    image.linkAnnotation(ann)
Example #18
def encode_variables(
    ds: Dataset,
    compressor: Any = zarr.Blosc(cname="zstd", clevel=7, shuffle=2)
) -> Dict[Hashable, Dict[str, Any]]:
    # Set compressor, chunking and floating point encoding
    encoding = {}
    for v in ds:
        e = {"compressor": compressor}
        if v == "call_genotype_probability":
            e.update({
                "dtype": "uint8",
                "add_offset": -1.0 / 254.0,
                "scale_factor": 1.0 / 254.0,
                "_FillValue": 0,
            })
        encoding[v] = e
    return encoding
Example #19
def _save_zarr(
        image: Any,
        uri: str,
        compress: bool = True,
        partition: Optional[str] = None,

        # Format-specific kwargs
        compression_type: str = "zstd",
        compression_level: int = 4):
    # image = image.chunk({"i": 1})
    if compress:
        compressor = zarr.Blosc(cname=compression_type,
                                clevel=compression_level)
        encoding = {k: {"compressor": compressor} for k in image.keys()}
    else:
        encoding = {}
    image.to_zarr(uri, consolidated=True, encoding=encoding)
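A call sketch for _save_zarr with a tiny placeholder dataset; the variable name, shape, and output URI are assumptions, not part of the original code.

import numpy as np
import xarray as xr

# A minimal stand-in for the real image dataset.
image = xr.Dataset({"band_1": (("y", "x"), np.zeros((4, 4), dtype="float32"))})
_save_zarr(image, "output/image.zarr", compress=True,
           compression_type="zstd", compression_level=4)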
Example #20
def read_covar_matrix(fname):
    dname_zarr = fname.replace('.mat', '.zarr')
    if not exists(dname_zarr):
        covMat = sio.loadmat(fname)
        names = [
            'fl063_c', 'fl063_l', 'fl064_c', 'fl064_l', 'fl065_c', 'fl065_l',
            'fl068_c', 'fl068_l'
        ]
        covMats = zip(*(names, covMat['CovMat'][0]))

        def to_darray(cmatrix):
            cmatrix = cmatrix.reshape(3, 3, 1024, 1024)
            return xr.DataArray(cmatrix, dims=('i', 'j', 'x', 'y'))

        ds = xr.Dataset({name: to_darray(cm) for name, cm in covMats})

        compressor = zarr.Blosc(cname='zstd', clevel=9, shuffle=2)
        encoding = {v: {'compressor': compressor} for v in list(ds.variables)}
        ds.to_zarr(dname_zarr, encoding=encoding)
    else:
        ds = xr.open_zarr(dname_zarr)
    return ds
Example #21
    def convert_netcdf_zarr(self, outname19="zarr19", outname37="zarr37"):
        """
        Convert netCDF files into zarr directories for storage in S3

        Parameters
        ----------
        outname19: string (optional)
            name of the directory to store 19H file
        outname37: string (optional)
            name of the directory to store 37H file

        Returns
        -------
        dict: {outname19:zarr_obj, outname37:zarr_obj}
            dictionary with filename as key and the zarr object generated as the value
        """
        ds19 = xarray.open_dataset(self.outfile_19)
        ds37 = xarray.open_dataset(self.outfile_37)
        compressor = zarr.Blosc(cname="zstd", clevel=3)
        encoding19 = {
            vname: {
                "compressor": compressor
            }
            for vname in ds19.data_vars
        }
        encoding37 = {
            vname: {
                "compressor": compressor
            }
            for vname in ds37.data_vars
        }
        self.zarr19 = ds19.to_zarr(store=outname19,
                                   encoding=encoding19,
                                   consolidated=True)
        self.zarr37 = ds37.to_zarr(store=outname37,
                                   encoding=encoding37,
                                   consolidated=True)
        return {outname19: self.zarr19, outname37: self.zarr37}
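A usage sketch matching the docstring above; the converter instance and output directory names are hypothetical.

# Assuming `converter` is an instance of the class this method belongs to,
# with self.outfile_19 / self.outfile_37 already pointing at NetCDF files.
stores = converter.convert_netcdf_zarr(outname19="tb19h.zarr", outname37="tb37h.zarr")
# stores maps each output name to the zarr object returned by to_zarr.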
Example #22
    def _get_write_encodings(cls, dataset, compressor, chunksizes, packing):
        encoding = None
        if chunksizes:
            encoding = {}
            for var_name in dataset.data_vars:
                var = dataset[var_name]
                chunks: List[int] = []
                for i in range(len(var.dims)):
                    dim_name = var.dims[i]
                    if dim_name in chunksizes:
                        chunks.append(chunksizes[dim_name])
                    else:
                        chunks.append(var.shape[i])
                encoding[var_name] = dict(chunks=chunks)
        if packing:
            if encoding:
                for var_name in packing.keys():
                    if var_name in encoding.keys():
                        encoding[var_name].update(dict(packing[var_name]))
                    else:
                        encoding[var_name] = dict(packing[var_name])
            else:
                encoding = {}
                for var_name in packing.keys():
                    encoding[var_name] = dict(packing[var_name])

        if compressor:
            compressor = zarr.Blosc(**compressor)

            if encoding:
                for var_name in encoding.keys():
                    encoding[var_name].update(compressor=compressor)
            else:
                encoding = {
                    var_name: dict(compressor=compressor)
                    for var_name in dataset.data_vars
                }
        return encoding
Example #23
def compress_and_save_to_zarr(ds, mode='a'):
    # Chunk the array thinking about how we access spatial slices
    # Here we have assumed we don't use forecast time slices or many step slices
    chunk_dict = {'time': 1, 'step': 10, 'y': -1, 'x': -1}
    if 'isobaricInhPa' in ds.dims:
        chunk_dict['isobaricInhPa'] = 5
    ds = ds.chunk(chunk_dict)

    if mode == 'w':  # if this is the first addition to the zarr file use this
        # This encoding gives good compression and was as fast to load as any
        # other compression level. It also took a reasonably short time to
        # encode compared to level 9, for only a couple of percent more stored data.
        encoding = {
            var_name: {
                'filters': [zarr.Delta(dtype='float32')],
                'compressor':
                zarr.Blosc(cname='zstd',
                           clevel=4,
                           shuffle=zarr.Blosc.AUTOSHUFFLE)
            }
            for var_name in ds.keys()
        }
        gcsmap = gcsfs.mapping.GCSMap(gcssavepath,
                                      gcs=fs,
                                      check=False,
                                      create=True)
        ds.to_zarr(store=gcsmap, consolidated=True, encoding=encoding)

    elif mode == 'a':  # if we are appending to an existing zarr file use this
        gcsmap = gcsfs.mapping.GCSMap(gcssavepath,
                                      gcs=fs,
                                      check=True,
                                      create=False)
        ds.to_zarr(store=gcsmap, append_dim='time', consolidated=True)
    else:
        raise ValueError
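A call-order sketch for the function above; gcssavepath and fs are module-level globals in the original snippet, and the forecast datasets here are placeholders.

# First write creates the store with the Delta + Blosc(zstd, clevel=4) encoding...
compress_and_save_to_zarr(ds_first_forecast, mode='w')
# ...later forecasts are appended along the time dimension without re-encoding.
compress_and_save_to_zarr(ds_next_forecast, mode='a')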
Example #24
def bgen_to_zarr(
    input_path_bgen: str,
    input_path_variants: str,
    input_path_samples: str,
    output_path: str,
    contig_name: str,
    contig_index: int,
    max_mem: str = "500MB",  # per-worker
    remote: bool = True,
    region: Optional[Tuple[int, int]] = None,
):
    """Convert UKB BGEN to Zarr"""
    paths = BGENPaths(
        bgen_path=input_path_bgen,
        variants_path=input_path_variants,
        samples_path=input_path_samples,
    )
    contig = Contig(name=contig_name, index=contig_index)
    ds = load_bgen(paths, contig, region=region)

    # Chosen with expected shape across all chroms (~128MB chunks):
    # normalize_chunks('auto', shape=(97059328, 487409), dtype='float32')
    chunks = (5216, 5792)
    ds = rechunk_dataset(
        ds,
        output=output_path,
        contig=contig,
        fn=rechunk_bgen,
        chunks=chunks,
        max_mem=max_mem,
        remote=remote,
        compressor=zarr.Blosc(cname="zstd", clevel=7, shuffle=2, blocksize=0),
        probability_dtype="uint8",
        pack=True,
    )
    logger.info("Done")
Example #25
File: api.py  Project: Badboy-16/echopype
MODELS = {
    "AZFP": {
        "ext": ".01A",
        "xml": True,
        "parser": ParseAZFP,
        "set_groups": SetGroupsAZFP,
    },
    "EK60": {"ext": ".raw", "xml": False, "parser": ParseEK60, "set_groups": SetGroupsEK60},
    "EK80": {"ext": ".raw", "xml": False, "parser": ParseEK80, "set_groups": SetGroupsEK80},
    "EA640": {"ext": ".raw", "xml": False, "parser": ParseEK80, "set_groups": SetGroupsEK80},
}

COMPRESSION_SETTINGS = {
    'netcdf4': {'zlib': True, 'complevel': 4},
    'zarr': {'compressor': zarr.Blosc(cname='zstd', clevel=3, shuffle=2)},
}

DEFAULT_CHUNK_SIZE = {'range_bin': 25000, 'ping_time': 2500}

NMEA_SENTENCE_DEFAULT = ["GGA", "GLL", "RMC"]


def _normalize_path(out_f, convert_type, output_storage_options):
    if convert_type == "zarr":
        return fsspec.get_mapper(out_f, **output_storage_options)
    elif convert_type == "netcdf4":
        return out_f


def _validate_path(
Example #26
    def set_nmea(self, nmea_dict):
        """Set the Platform/NMEA group in the nc file.

        Parameters
        ----------
        nmea_dict
            dictionary containing platform parameters
        """
        # Only save platform group if file_path exists
        save_path = nmea_dict['path'] if 'path' in nmea_dict else self.file_path
        if not os.path.exists(save_path):
            print(
                'netCDF file does not exist, exiting without saving Platform group...'
            )
        else:
            # Convert np.datetime64 numbers to seconds since 1900-01-01
            # due to xarray.to_netcdf() error on encoding np.datetime64 objects directly
            time = (nmea_dict['nmea_time'] - np.datetime64('1900-01-01T00:00:00')) \
                   / np.timedelta64(1, 's')
            ds = xr.Dataset(
                {
                    'NMEA_datagram': (['time'], nmea_dict['nmea_datagram'], {
                        'long_name': 'NMEA datagram'
                    })
                },
                coords={
                    'time': (['time'], time, {
                        'axis': 'T',
                        'calendar': 'gregorian',
                        'long_name': 'Timestamps for NMEA datagrams',
                        'standard_name': 'time',
                        'units': 'seconds since 1900-01-01'
                    })
                },
                attrs={'description': 'All NMEA sensor datagrams'})

            # Splits up the time dimension. Used for when range bin length varies with time
            if 'ping_slice' in nmea_dict:
                # Slice using ping_time which does not map perfectly with nmea_time.
                # Rounds ping_time slice values to nmea_time
                lower = (nmea_dict['ping_slice'][0] - np.datetime64('1900-01-01T00:00:00')) \
                            / np.timedelta64(1, 's')
                lower = time[(np.abs(time - lower)).argmin()]
                upper = (nmea_dict['ping_slice'][-1] - np.datetime64('1900-01-01T00:00:00')) \
                            / np.timedelta64(1, 's')
                upper = time[(np.abs(time - upper)).argmin()]
                ds = ds.sel(time=slice(lower, upper))

            # Configure compression settings
            nc_encoding = {}
            zarr_encoding = {}
            if self.compress:
                nc_settings = dict(zlib=True, complevel=4)
                nc_encoding = {var: nc_settings for var in ds.data_vars}
                zarr_settings = dict(
                    compressor=zarr.Blosc(cname='zstd', clevel=3, shuffle=2))
                zarr_encoding = {var: zarr_settings for var in ds.data_vars}

            # save to file
            if self.format == '.nc':
                ds.to_netcdf(path=save_path,
                             mode='a',
                             group='Platform/NMEA',
                             encoding=nc_encoding)
            elif self.format == '.zarr':
                if not self.append_zarr:
                    ds.to_zarr(store=save_path,
                               mode='a',
                               group='Platform/NMEA',
                               encoding=zarr_encoding)
                else:
                    ds.to_zarr(store=save_path,
                               mode='a',
                               group='Platform/NMEA',
                               append_dim='time')
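A sketch of the nmea_dict argument expected by set_nmea, reconstructed from the keys the method reads; all values below are placeholders.

import numpy as np

nmea_dict = {
    # Timestamps of the NMEA datagrams as np.datetime64 values.
    'nmea_time': np.array(['2020-01-01T00:00:00', '2020-01-01T00:00:01'],
                          dtype='datetime64[s]'),
    # Raw NMEA sentences, one per timestamp.
    'nmea_datagram': np.array(['$GPGGA,...', '$GPGLL,...']),
    # Optional: 'path' overrides self.file_path; 'ping_slice' trims by ping_time.
}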
Example #27
FORMAT = 'netcdf'
PRODUCT = 'reanalysis-era5-single-levels'
VARIABLE = ['total_precipitation']
TYPE = 'reanalysis'
# TYPE = 'ensemble_members'
MONTH = [str(i + 1).zfill(2) for i in range(12)]
DAY = [str(i + 1).zfill(2) for i in range(31)]
TIME = ['{}:00'.format(i).zfill(5) for i in range(24)]
START_YEAR = 2018
END_YEAR = 2018

DTYPE = 'float32'
CHUNKS = {'time': -1, 'latitude': 16, 'longitude': 16}
GEN_FLOAT_ENCODING = {
    'dtype': DTYPE,
    'compressor': zarr.Blosc(cname='lz4', clevel=9)
}
ENCODING = {'precipitation': GEN_FLOAT_ENCODING}


def get_url(year, month):
    cds_client = cdsapi.Client()
    query = {
        'variable': VARIABLE,
        'product_type': TYPE,
        'year': year,
        'month': month,
        'day': DAY,
        'time': TIME,
        'format': FORMAT
    }
Example #28
import logging

logging_format = '%(asctime)s - %(name)s - %(message)s'
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO,
                    format=logging_format,
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("ProcessData")

# radarBase = "/wave/mlp/cwb-ci/Radar/raw/"
# satBase = "/wave/mlp/cwb-ci/Satellite/raw/"
radarBase = "../../data/cwb-ci/Radar/raw/"
satBase = "../../data/cwb-ci/Satellite/raw/"

global compressor
compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
synchronizer = zarr.ProcessSynchronizer('example.sync')


def bilinear_resize(image, height, width):
    """
  `image` is a 2-D numpy array
  `height` and `width` are the desired spatial dimension of the new 2-D array.
  """
    img_height, img_width = image.shape

    image = image.ravel()

    x_ratio = float(img_width - 1) / (width - 1) if width > 1 else 0
    y_ratio = float(img_height - 1) / (height - 1) if height > 1 else 0
Example #29
import zarr
from datatree import DataTree

# fmt: off
# black and isort have conflicting ideas about how this should be formatted
from ..core import SONAR_MODELS

if TYPE_CHECKING:
    from ..core import EngineHint, PathHint, SonarModelsHint
# fmt: on
from ..echodata.echodata import XARRAY_ENGINE_MAP, EchoData
from ..utils import io

COMPRESSION_SETTINGS = {
    "netcdf4": {"zlib": True, "complevel": 4},
    "zarr": {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)},
}

DEFAULT_CHUNK_SIZE = {"range_sample": 25000, "ping_time": 2500}

NMEA_SENTENCE_DEFAULT = ["GGA", "GLL", "RMC"]

BEAM_SUBGROUP_DEFAULT = "Beam_group1"


def to_file(
    echodata: EchoData,
    engine: "EngineHint",
    save_path: Optional["PathHint"] = None,
    compress: bool = True,
    overwrite: bool = False,
Example #30
    def set_beam(self, beam_dict):
        """Set the Beam group in the AZFP nc file.

        Parameters
        ----------
        beam_dict
            dictionary containing general beam parameters
        """

        ds = xr.Dataset(
            {
                'backscatter_r': (['frequency', 'ping_time', 'range_bin'
                                   ], beam_dict['backscatter_r']),
                'equivalent_beam_angle': (['frequency'], beam_dict['EBA']),
                'gain_correction':
                (['frequency'], beam_dict['gain_correction']),
                'sample_interval':
                (['frequency'], beam_dict['sample_interval'], {
                    'units': 's'
                }),
                'transmit_duration_nominal':
                (['frequency'], beam_dict['transmit_duration_nominal'], {
                    'long_name': 'Nominal bandwidth of transmitted pulse',
                    'units': 's',
                    'valid_min': 0.0
                }),
                'temperature_counts':
                (['ping_time'], beam_dict['temperature_counts']),
                'tilt_x_count': (['ping_time'], beam_dict['tilt_x_count']),
                'tilt_y_count': (['ping_time'], beam_dict['tilt_y_count']),
                'tilt_x': (['ping_time'], beam_dict['tilt_x']),
                'tilt_y': (['ping_time'], beam_dict['tilt_y']),
                'cos_tilt_mag': (['ping_time'], beam_dict['cos_tilt_mag']),
                'DS': (['frequency'], beam_dict['DS']),
                'EL': (['frequency'], beam_dict['EL']),
                'TVR': (['frequency'], beam_dict['TVR']),
                'VTX': (['frequency'], beam_dict['VTX']),
                'Sv_offset': (['frequency'], beam_dict['Sv_offset']),
                'number_of_samples_digitized_per_pings':
                (['frequency'], beam_dict['range_samples']),
                'number_of_digitized_samples_averaged_per_pings':
                (['frequency'], beam_dict['range_averaging_samples'])
            },
            coords={
                'frequency': (['frequency'], beam_dict['frequency'], {
                    'units': 'Hz',
                    'valid_min': 0.0
                }),
                'ping_time': (['ping_time'], beam_dict['ping_time'], {
                    'axis': 'T',
                    'calendar': 'gregorian',
                    'long_name': 'Timestamp of each ping',
                    'standard_name': 'time',
                    'units': 'seconds since 1970-01-01'
                }),
                'range_bin': (['range_bin'], beam_dict['range_bin'])
            },
            attrs={
                'beam_mode': '',
                'conversion_equation_t': 'type_4',
                'number_of_frequency': beam_dict['number_of_frequency'],
                'number_of_pings_per_burst':
                beam_dict['number_of_pings_per_burst'],
                'average_burst_pings_flag':
                beam_dict['average_burst_pings_flag'],
                # Temperature coefficients
                'temperature_ka': beam_dict['temperature_ka'],
                'temperature_kb': beam_dict['temperature_kb'],
                'temperature_kc': beam_dict['temperature_kc'],
                'temperature_A': beam_dict['temperature_A'],
                'temperature_B': beam_dict['temperature_B'],
                'temperature_C': beam_dict['temperature_C'],
                # Tilt coefficients
                'tilt_X_a': beam_dict['tilt_X_a'],
                'tilt_X_b': beam_dict['tilt_X_b'],
                'tilt_X_c': beam_dict['tilt_X_c'],
                'tilt_X_d': beam_dict['tilt_X_d'],
                'tilt_Y_a': beam_dict['tilt_Y_a'],
                'tilt_Y_b': beam_dict['tilt_Y_b'],
                'tilt_Y_c': beam_dict['tilt_Y_c'],
                'tilt_Y_d': beam_dict['tilt_Y_d']
            })
        n_settings = {}
        z_settings = {}
        if self.compress:
            n_settings = {'backscatter_r': {'zlib': True, 'complevel': 4}}
            z_settings = {
                'backscatter_r': {
                    'compressor': zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
                }
            }

        if self.format == '.nc':
            ds.to_netcdf(path=self.file_path,
                         mode='a',
                         group='Beam',
                         encoding=n_settings)
        elif self.format == '.zarr':
            if not self.append_zarr:
                ds.to_zarr(store=self.file_path,
                           mode='a',
                           group='Beam',
                           encoding=z_settings)
            else:
                ds.to_zarr(store=self.file_path,
                           mode='a',
                           group='Beam',
                           append_dim='ping_time')