Example #1
def test_open():
    path = tempfile.mktemp()
    atexit.register(
        lambda: shutil.rmtree(path) if os.path.exists(path) else None
    )
    z = open(path, mode='w', shape=100, chunks=10, dtype='i4')
    z[:] = 42
    eq((100,), z.shape)
    eq((10,), z.chunks)
    assert_array_equal(np.full(100, fill_value=42, dtype='i4'), z[:])
    z2 = open(path, mode='r')
    eq((100,), z2.shape)
    eq((10,), z2.chunks)
    assert_array_equal(z[:], z2[:])
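The test above exercises zarr's create-then-reopen round trip. A minimal stand-alone sketch of the same pattern, assuming the `open` under test is `zarr.open` and using an illustrative path:

import numpy as np
import zarr

# Create a new persistent array, fill it, then reopen it read-only.
z = zarr.open('example.zarr', mode='w', shape=100, chunks=10, dtype='i4')
z[:] = 42

z2 = zarr.open('example.zarr', mode='r')
assert z2.shape == (100,) and z2.chunks == (10,)
np.testing.assert_array_equal(z2[:], np.full(100, fill_value=42, dtype='i4'))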
Example #2
    def _create_array(self, data, **kwargs):

        # determine chunks
        chunks = default_chunks(data)
        kwargs.setdefault('chunks', chunks)

        # create array
        if 'path' in kwargs:
            kwargs['mode'] = 'w'
            kwargs['shape'] = data.shape
            # ensure dtype is specified
            dtype = kwargs.get('dtype', None)
            if not dtype:
                kwargs['dtype'] = data.dtype
            z = zarr.open(**kwargs)
            z[:] = data
        else:
            z = zarr.array(data, **kwargs)

        return z
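For context, a rough sketch of the two code paths this helper chooses between (`default_chunks` is a project helper not shown here, so explicit chunks are used instead):

import numpy as np
import zarr

data = np.arange(10000, dtype='i4').reshape(100, 100)

# Path given: open a writable on-disk array with matching shape/dtype, then fill it.
z_disk = zarr.open('array.zarr', mode='w', shape=data.shape,
                   chunks=(10, 100), dtype=data.dtype)
z_disk[:] = data

# No path: zarr.array copies the data into a new in-memory zarr array.
z_mem = zarr.array(data, chunks=(10, 100))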
Example #3
def test_index_image(Image, image_search, tmp_path):
    """Test that indexing images is working correctly"""
    tmp_storage = zarr.open(
        str(tmp_path / 'tmp.zarr'),
        mode='a',
        shape=image_search.storage['/image_features'].shape,
        chunks=image_search.storage['/image_features'].chunks,
        dtype=np.float32)

    tmp_storage[:] = image_search.storage["/image_features"][:]

    # choose a random image to delete from the image index
    image_to_be_deleted = Image.query.order_by(func.random()).first()
    image_search.delete_index(image_to_be_deleted)

    assert np.sum(np.any(tmp_storage[:] != 0, axis=1)) - 1 == np.sum(
        np.any(image_search.storage['/image_features'][:] != 0, axis=1))

    image_search.index_model(Image, threaded=False)  # index all missing images

    assert np.sum(np.any(tmp_storage[:] != 0, axis=1)) == np.sum(
        np.any(image_search.storage['/image_features'][:] != 0, axis=1))
Example #4
def execute(
    array,
    write_path=None,
    cluster_kwargs={},
    worker_buffer=4,
):
    """
    """

    # start the cluster
    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # print dashboard url
        print("cluster dashboard link: ", cluster.get_dashboard())
        sys.stdout.flush()

        # scale cluster based on array chunks and buffer
        nchunks = np.prod(array.numblocks)
        cluster.scale_cluster(nchunks + worker_buffer)

        # if the user wants to write result to disk
        if write_path:
            compressor = Blosc(cname='zstd',
                               clevel=9,
                               shuffle=Blosc.BITSHUFFLE)
            array_disk = zarr.open(
                write_path,
                'w',
                shape=array.shape,
                chunks=array.chunksize,
                dtype=array.dtype,
                compressor=compressor,
            )
            da.to_zarr(array, array_disk)
            return array_disk

        # if the user wants the result back in memory
        else:
            return array.compute()
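A minimal sketch of just the write-to-disk branch, without the project-specific ClusterWrap cluster management (the input array here is a stand-in):

import dask.array as da
import zarr
from numcodecs import Blosc

arr = da.random.random((1000, 1000), chunks=(100, 100))

compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
array_disk = zarr.open(
    'result.zarr',
    'w',
    shape=arr.shape,
    chunks=arr.chunksize,
    dtype=arr.dtype,
    compressor=compressor,
)
# Chunk-aligned parallel write of the dask array into the zarr store.
da.to_zarr(arr, array_disk)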
Example #5
 def getMetaData(self, key=None):
     iZTable = zarr.open(self._FactorDB.MainDir + os.sep + self.Name,
                         mode="r")
     with self._FactorDB._DataLock:
         if key is not None:
             if key not in iZTable.attrs: return None
             MetaData = iZTable.attrs[key]
             if isinstance(MetaData, dict):
                 Type = MetaData.get("_Type")
                 if Type == "Array": return np.array(MetaData["List"])
                 elif Type == "Series":
                     return pd.read_json(MetaData["Json"], typ="series")
                 elif Type == "DataFrame":
                     return pd.read_json(MetaData["Json"], typ="frame")
                 else:
                     return MetaData
             else:
                 return MetaData
     MetaData = {}
     for iKey in iZTable.attrs:
         MetaData[iKey] = self.getMetaData(key=iKey)
     return MetaData
Example #6
def merge_and_store(i, src, dst):
    """
    Merge a dimension of the Dask Array and store the results.

    Parameters
    ----------
    i : int
        The block number of the final array.
    src, dst: pathlike
        The files to read from and write to.
    """
    mapper = get_mapper(src)
    A = zarr.Array(mapper, read_only=True)
    zchunks = (A.shape[0], ) + A.chunks[1:]
    store = zarr.open(dst,
                      mode="a",
                      shape=A.shape,
                      chunks=zchunks,
                      dtype=A.dtype)
    ochunks = da.core.normalize_chunks(zchunks, A.shape)
    slices = da.core.slices_from_chunks(ochunks)
    store[slices[i]] = A[slices[i]]
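One possible way to drive this helper over every output block; the paths and the dask.delayed scheduling are assumptions, not part of the original:

import dask
import dask.array as da
import zarr
from fsspec import get_mapper

src, dst = 'input.zarr', 'merged.zarr'  # hypothetical stores

# Recompute the output chunk layout the same way merge_and_store does.
A = zarr.Array(get_mapper(src), read_only=True)
zchunks = (A.shape[0],) + A.chunks[1:]
slices = da.core.slices_from_chunks(da.core.normalize_chunks(zchunks, A.shape))

# Each output block is independent, so the copies can run in parallel.
tasks = [dask.delayed(merge_and_store)(i, src, dst) for i in range(len(slices))]
dask.compute(*tasks)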
Example #7
def batchwise_to_zarr(arr: dask.array.core.Array,
                      zarr_dir_name: str,
                      rm: bool = False,
                      batch_size: int = 1):
    dir_p = Path(zarr_dir_name)
    if dir_p.exists():
        if rm:
            print("##########################################")
            print("removing " + str(dir_p))
            shutil.rmtree(dir_p)
        else:
            print("##########################################")
            print(str(dir_p) + " already exists")
            return

    if False:  # arr.nbytes < 8 * 1024 ** 3:
        # If the array fits into memory, a direct call of the to_zarr
        # method is possible (although it seems to imply a compute()
        # for the whole array, or at least for a part that is too big
        # to handle for larger arrays).
        arr.to_zarr(zarr_dir_name)
    else:
        # if the array is bigger than memory we compute explicitly
        # a part of it and write it to the zarr array.
        # This takes longer but gives us control over the
        # memory usage
        z = zr.open(zarr_dir_name,
                    mode="w",
                    shape=arr.shape,
                    chunks=arr.chunksize)
        #ncores = 32
        slices = batchSlices(arr.shape[-1], batch_size)
        print("result shape:", arr.shape)
        #        print(629, slices)
        #        for s in slices:

        for s in tqdm(slices):  # Holger
            z[..., s] = arr[..., s].compute()
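A hedged usage sketch; the dask array, its chunking, and the batch size are illustrative only:

import dask.array as da

# Chunked along the last axis, which is the axis batchwise_to_zarr slices over.
arr = da.random.random((180, 360, 365), chunks=(180, 360, 1))
batchwise_to_zarr(arr, 'result.zarr', rm=True, batch_size=12)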
Example #8
 def __init__(self,
              cr: CrReader,
              zarr_fn: str,
              chunk_size=(1000, 1000),
              dtype: str = 'uint32'):
     """
     Args:
         cr: A CrReader object, containing the Cellranger data.
         zarr_fn: The file name for the Zarr hierarchy.
         chunk_size: The requested size of chunks to load into memory and process.
         dtype: the dtype of the data.
     """
     self.cr = cr
     self.fn = zarr_fn
     self.chunkSizes = chunk_size
     self.z = zarr.open(self.fn, mode='w')
     self._ini_cell_data()
     for assay_name in set(self.cr.assayFeats.columns):
         create_zarr_count_assay(self.z, assay_name, chunk_size,
                                 self.cr.nCells,
                                 self.cr.feature_ids(assay_name),
                                 self.cr.feature_names(assay_name), dtype)
Example #9
def read_series(dataset: zarr.Array) -> Union[np.ndarray, pd.Categorical]:
    if "categories" in dataset.attrs:
        categories = dataset.attrs["categories"]
        if isinstance(categories, str):
            categories_key = categories
            parent_name = dataset.name.rstrip(dataset.basename)
            parent = zarr.open(dataset.store)[parent_name]
            categories_dset = parent[categories_key]
            categories = categories_dset[...]
            ordered = categories_dset.attrs.get("ordered", False)
        else:
            # TODO: remove this code at some point post 0.7
            # TODO: Add tests for this
            warn(
                f"Your file {str(dataset.file.name)!r} has invalid categorical "
                "encodings due to being written from a development version of "
                "AnnData. Rewrite the file to ensure you can read it in the future.",
                FutureWarning,
            )
            # Legacy files carry no explicit flag, so fall back to the same
            # default used in the branch above.
            ordered = False
        return pd.Categorical.from_codes(dataset[...], categories, ordered=ordered)
    else:
        return dataset[...]
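For orientation, a sketch of the on-disk layout the string-attribute branch expects, inferred from the code above rather than from any AnnData specification:

import numpy as np
import zarr

root = zarr.open('example.zarr', mode='w')
codes = root.array('col', data=np.array([0, 1, 1, 0], dtype='i8'))
cats = root.array('col_categories', data=np.array(['a', 'b']))
cats.attrs['ordered'] = False
# The codes array points at its sibling categories array by name.
codes.attrs['categories'] = 'col_categories'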
Example #10
    def __enter__(self):

        if self.separate:

            if self.out_block_type.lower() == 'zarr':
                self.root = zarr.open(self.zarr_file, mode='w')

        else:

            if 'compress' in self.kwargs:
                logger.warning('\nCannot write concurrently to a compressed raster when using a combination of processes and threads.\nTherefore, compression will be applied after the initial write.')
                del self.kwargs['compress']

            # An alternative here is to leave the writeable object open as self.
            # However, this does not seem to work when used within a Dask
            #   client environment because the `self.dst_` object cannot be pickled.

            # Create the output file
            with rio.open(self.filename, mode='w', **self.kwargs) as dst_:
                pass

        return self
Example #11
    def generate_mel_cache(self, audio_params):
        record_ids_list = list(
            sorted(set(self.data_all[SampleDataset.k_recording_id])))
        file_path = self.get_file_path(record_ids_list[0])
        single_item = read_as_melspectrogram(audio_params, file_path, None)
        dataset_shape = (len(record_ids_list), ) + single_item.shape
        chunks = (1, ) + single_item.shape

        zarr_group_name = self.gen_group_name(audio_params)
        zarr_root = zarr.open(self.folder_path + '_cache.zarr', mode='a')
        zarr_mel = zarr.convenience.open(str(
            Path(zarr_root.store.path).joinpath(zarr_group_name)),
                                         mode='a')
        stored_records = zarr_mel.attrs.get('record_ids_list', [])

        if stored_records != record_ids_list:
            shutil.rmtree(Path(zarr_root.store.path).joinpath(zarr_group_name))
            zarr_mel = zarr_root.create_dataset(zarr_group_name,
                                                shape=dataset_shape,
                                                dtype=np.float16,
                                                chunks=chunks)

            record_path_list = list(map(self.get_file_path, record_ids_list))
            map_iterator = zip(record_path_list,
                               (audio_params, ) * len(record_path_list))
            with Pool(multiprocessing.cpu_count() // 2) as pool:
                with tqdm(desc=f"Preparing mel cache [{zarr_group_name}]",
                          total=len(record_ids_list)) as t:
                    for record_id, record_mel in pool.imap(
                            get_mel, map_iterator):  # type: str, np.ndarray
                        idx = record_ids_list.index(record_id)
                        zarr_mel[idx, ...] = record_mel.astype(np.float16)
                        t.update()

            zarr_mel.attrs['record_ids_list'] = record_ids_list

        return zarr.convenience.open(str(
            Path(zarr_root.store.path).joinpath(zarr_group_name)),
                                     mode='r')
Example #12
 def getFactorMetaData(self, factor_names=None, key=None):
     if factor_names is None: factor_names = self.FactorNames
     elif set(factor_names).isdisjoint(self.FactorNames):
         return super().getFactorMetaData(factor_names=factor_names,
                                          key=key)
     if key == "DataType": return self._DataType.loc[factor_names]
     with self._FactorDB._DataLock:
         MetaData = {}
         ZTable = zarr.open(self._FactorDB.MainDir + os.sep + self.Name,
                            mode="r")
         for iFactorName in factor_names:
             if iFactorName in self.FactorNames:
                 iZFactor = ZTable[iFactorName]
                 if key is None:
                     MetaData[iFactorName] = pd.Series(iZFactor.attrs)
                 elif key in iZFactor.attrs:
                     MetaData[iFactorName] = iZFactor.attrs[key]
     if not MetaData:
         return super().getFactorMetaData(factor_names=factor_names,
                                          key=key)
     if key is None: return pd.DataFrame(MetaData).loc[:, factor_names]
     else: return pd.Series(MetaData).loc[factor_names]
Example #13
    def _write_segment_info(self):
        """ This function creates the info file needed to segment the image """
        if self.image_type != 'segmentation':
            raise TypeError(
                'The NeuroglancerWriter object must have image_type = "segmentation" to use write_segment_info.'
            )

        op = pathlib.Path(self.base_path).joinpath("infodir")
        op.mkdir(exist_ok=True)
        op = op.joinpath("info")

        # Get the labels
        root = zarr.open(str(self.base_path.joinpath("labels.zarr")))
        labels = set()
        for d in root.array_keys():
            labels = labels.union(set(root[d][:].squeeze().tolist()))

        inlineinfo = {
            "ids": [str(item) for item in labels],
            "properties": [{
                "id": "label",
                "type": "label",
                "values": [str(item) for item in labels]
            }, {
                "id": "description",
                "type": "label",
                "values": [str(item) for item in labels]
            }]
        }

        info = {
            "@type": "neuroglancer_segment_properties",
            "inline": inlineinfo
        }

        # writing all the information into the file
        with open(op, 'w') as writer:
            json.dump(info, writer, indent=2)
Example #14
def test_rechunk_array(tmp_path, shape, source_chunks, dtype, dims,
                       target_chunks, max_mem):

    ### Create source array ###
    store_source = str(tmp_path / "source.zarr")
    source_array = zarr.ones(shape,
                             chunks=source_chunks,
                             dtype=dtype,
                             store=store_source)
    # add some attributes
    source_array.attrs["foo"] = "bar"
    if dims:
        source_array.attrs[_DIMENSION_KEY] = dims

    ### Create targets ###
    target_store = str(tmp_path / "target.zarr")
    temp_store = str(tmp_path / "temp.zarr")

    delayed = api.rechunk(source_array,
                          target_chunks,
                          max_mem,
                          target_store,
                          temp_store=temp_store)
    assert isinstance(delayed, api.Rechunked)

    target_array = zarr.open(target_store)

    if isinstance(target_chunks, dict):
        target_chunks_list = [target_chunks[d] for d in dims]
    else:
        target_chunks_list = target_chunks
    assert target_array.chunks == tuple(target_chunks_list)
    assert dict(source_array.attrs) == dict(target_array.attrs)

    result = delayed.execute()
    assert isinstance(result, zarr.Array)
    a_tar = dsa.from_zarr(target_array)
    assert dsa.equal(a_tar, 1).all().compute()
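Assuming the `api` module in this test is rechunker's public API, the equivalent stand-alone usage looks roughly like this:

import zarr
from rechunker import rechunk

source = zarr.ones((1000, 1000), chunks=(100, 1000),
                   dtype='f4', store='source.zarr')
plan = rechunk(source, target_chunks=(1000, 100), max_mem='100MB',
               target_store='target.zarr', temp_store='temp.zarr')
plan.execute()  # copies the data; returns the target as a zarr.Array

assert zarr.open('target.zarr').chunks == (1000, 100)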
Example #15
    def validate_gp(self, pattern):
        min_supp = self.d_set.thd_supp
        n = self.d_set.attr_size
        gen_pattern = GP()

        z_root = zarr.open(self.d_set.z_file, 'r')
        grp_name = 'dataset/' + self.d_set.step_name + '/rank_matrix'
        ranks = z_root[grp_name][:]  # [:] TO BE REMOVED

        main_bin = ranks[:, pattern.gradual_items[0].attribute_col]
        for i in range(len(pattern.gradual_items)):
            gi = pattern.gradual_items[i]
            if i == 0:
                if gi.is_decrement():
                    main_bin = np.where(main_bin == 0.5, 1,
                                        np.where(main_bin == 1, 0.5, 0))
                gen_pattern.add_gradual_item(gi)
                continue
            else:
                bin_2 = ranks[:, gi.attribute_col].copy()
                if gi.is_decrement():
                    bin_2 = np.where(bin_2 == 0.5, 1,
                                     np.where(bin_2 == 1, 0.5, 0))

                # Rank multiplication
                temp_bin = np.where(main_bin == bin_2, main_bin, 0)
                # print(str(main_bin) + ' + ' + str(bin_2) + ' = ' + str(temp_bin))
                supp = float(np.count_nonzero(temp_bin)) / float(
                    n * (n - 1.0) / 2.0)
                if supp >= min_supp:
                    main_bin = temp_bin.copy()
                    gen_pattern.add_gradual_item(gi)
                    gen_pattern.set_support(supp)

        if len(gen_pattern.gradual_items) <= 1:
            return pattern
        else:
            return gen_pattern
Example #16
    def __init__(self, filename, ds_name):

        self.filename = filename
        self.ds_name = ds_name

        ds = zarr.open(filename)[ds_name]
        self.voxel_size = gp.Coordinate(ds.attrs['resolution'])
        self.spatial_dims = len(self.voxel_size)
        if 'offset' in ds.attrs:
            self.offset = gp.Coordinate(ds.attrs['offset'])
        else:
            self.offset = gp.Coordinate((0, ) * self.spatial_dims)
        self.shape = gp.Coordinate(ds.shape)
        self.spatial_shape = gp.Coordinate(self.shape[-self.spatial_dims:])
        self.roi = gp.Roi(self.offset, self.spatial_shape * self.voxel_size)

        self.axes = {d: a for d, a in enumerate(ds.attrs['axes'])}

        if 'c' in self.axes:
            self.num_channels = self.shape[self.axes['c']]
        else:
            self.num_channels = 0

        if 's' in self.axes:
            self.num_samples = self.shape[self.axes['s']]
        else:
            self.num_samples = 0

        # gt specific

        if 'num_classes' in ds.attrs:
            self.num_classes = ds.attrs['num_classes']
        else:
            self.num_classes = 0
        if 'background_label' in ds.attrs:
            self.background_label = ds.attrs['background_label']
        else:
            self.background_label = None
Example #17
 def get_validation_errors(
     self,
     schema_version: Optional[str] = None,
     devel_debug: bool = False,
 ) -> List[str]:
     try:
         data = zarr.open(self.filepath)
     except Exception as e:
         if devel_debug:
             raise
         lgr.warning(
             "Error opening %s: %s: %s",
             self.filepath,
             type(e).__name__,
             e,
             extra={"validating": True},
         )
         return [str(e)]
     if isinstance(data, zarr.Group) and not data:
         msg = "Zarr group is empty"
         if devel_debug:
             raise ValueError(msg)
         lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True})
         return [msg]
     try:
         next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH)))
     except StopIteration:
         pass
     else:
         msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep"
         if devel_debug:
             raise ValueError(msg)
         lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True})
         return [msg]
     # TODO: Should this be appended to the above errors?
     return super().get_validation_errors(
         schema_version=schema_version, devel_debug=devel_debug
     )
Example #18
    def _open_zarr_root(self, path):

        #TODO: Use case where user opens an already HCS-store?
        """
        Change the current zarr to an existing store.
        If the store doesn't exist, a FileNotFoundError is raised.

        Parameters
        ----------
        path:       (str) path to store. Must end in .zarr

        Returns
        -------

        """

        if os.path.exists(path):
            self.store = zarr.open(path)
            self.__root_store_path = path
        else:
            raise FileNotFoundError(
                f'No store found at {path}, check spelling or create new store with create_zarr'
            )
Example #19
def napari_get_reader(path):
    """Implementation of the napari_get_reader hook specification.
    
    Parameters
    ----------
    path : str or list of str
        Path to file, or list of paths.
    
    Returns
    -------
    function or None
        If the path is a recognized format, return a function that accepts the
        same path or list of paths, and returns a list of layer data tuples.
    """
    # Inspect dataset
    path = Path(path)
    dataset = zarr.open(path.as_posix(), mode='r')

    # If dataset is a full dataset return reader
    if dataset.attrs['waver'] and dataset.attrs['dataset']:
        return load_simulation_dataset
    else:
        return None
Example #20
def encode_dset(model, hparams, dset_path, emb_path):
    dataset = TripleEmbeddingDataset(dset_path, emb_path)
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
    )
    # model props
    model_props = get_model_properties(hparams)
    print(model_props)
    # encode
    z = zarr.open(str(dset_path), "r+")
    out_shape = (len(z.id), int(hparams["model_n"]["n"][-1]))
    z_result = {}
    for k in ["query", "pos", "neg"]:
        z_result[k] = z.zeros(
            f"{k}/result/{model_props}",
            shape=out_shape,
            chunks=(args.batch_size, None),
            overwrite=True,
            compressor=Zstd(),
        )
Example #21
    def __init__(self,
                 img_directory,
                 whole_dataset=False,
                 training=True,
                 transform=None):
        super(IDR0017_FullImgs_ZarrDataset, self).__init__()
        self.img_directory = img_directory
        self.training = training
        self.transform = transform
        img_dataset = zarr.open(img_directory, mode="r")
        img_array = img_dataset["images"]
        assert len(img_array) > 0, "No images found in path"

        self.data = img_array
        if whole_dataset:
            self.idxs = list(range(len(img_array)))
        else:
            train_idxs, test_idxs = train_test_split(list(range(
                len(img_array))),
                                                     test_size=0.1,
                                                     random_state=42,
                                                     shuffle=True)
            self.idxs = train_idxs if training else test_idxs
Example #22
def load_sim_data(load_path, use_zarr=False):
    """Loads 

    Parameters
    ----------
        load_path : str
            Path of the simulation data folder.
        use_zarr : bool, optional
            If True, the simulation data will be given as a zarr array,
            rather than as a numpy array. The former is useful if the
            data is very large.

    Returns
    -------
    tuple
        A tuple `(node_mappings, cnode_mappings, ts, X_states)`, containing
        all simulation data. `X_states` is either an `np.ndarray` or a `zarr.core.Array`.
        If `use_zarr=True`, the latter will be given.
    """
    node_mappings_path = 'node_mappings.pkl'
    cnode_mappings_path = 'cnode_mappings.pkl'
    ts_path = 'ts.npy'
    X_states_path = 'X_states.zarr'

    node_mappings = pickle.load(
        open("%s/%s" % (load_path, node_mappings_path), "rb"))
    cnode_mappings = pickle.load(
        open("%s/%s" % (load_path, cnode_mappings_path), "rb"))
    ts = np.load("%s/%s" % (load_path, ts_path))
    X_states = zarr.open("%s/%s" % (load_path, X_states_path),
                         chunks=(len(ts), 1))

    if not use_zarr:
        X_states = X_states[:]

    sim_data = (node_mappings, cnode_mappings, ts, X_states)
    return sim_data
Example #23
    def run(self):
        progress = 0.0
        self.set_progress_percentage(progress)
        if "unaligned" in self.de:
            aligned = False
        else:
            aligned = True
        for s in self.samples:
            filename = os.path.join(
                os.path.dirname(self.input()[0].fn), self.de, s + ".n5"
            )
            datasets_src = ["clefts", "pre_dist", "post_dist"]
            datasets_tgt = ["clefts_cropped", "pre_dist_cropped", "post_dist_cropped"]
            off = offsets[s][aligned]
            sh = shapes[s][aligned]
            f = zarr.open(filename, mode="a")
            for dss, dst in zip(datasets_src, datasets_tgt):
                chunk_size = tuple(min(c, shi) for c, shi in zip(f[dss].chunks, sh))
                f.create_dataset(
                    name=dst,
                    shape=sh,
                    compressor=numcodecs.GZip(6),
                    dtype=f[dss].dtype,
                    chunks=chunk_size,
                )
                bb = tuple(slice(o, o + shi, None) for o, shi in zip(off, sh))
                f[dst][:] = f[dss][bb]
                f[dst].attrs["offset"] = off[::-1]

                progress += 100.0 / (len(self.samples) * len(datasets_src))
                try:
                    self.set_progress_percentage(progress)
                except:
                    pass

        done = self.output().open("w")
        done.close()
Example #24
def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData:
    """Read from a hierarchical Zarr array store.

    Parameters
    ----------
    store
        The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
    """
    if isinstance(store, Path):
        store = str(store)

    f = zarr.open(store, mode="r")
    d = {}
    for k in f.keys():
        # Backwards compat
        if k.startswith("raw."):
            continue
        if k in {"obs", "var"}:
            d[k] = read_dataframe(f[k])
        else:  # Base case
            d[k] = read_attribute(f[k])

    # Backwards compat
    raw = {}
    if "raw.var" in f:
        raw["var"] = read_dataframe(f["raw.var"])  # Backwards compat
    if "raw.varm" in f:
        raw["varm"] = read_attribute(f["raw.varm"])
    if "raw.X" in f:
        raw["X"] = read_attribute(f["raw.X"])
    if len(raw) > 0:
        assert "raw" not in d
        d["raw"] = raw

    _clean_uns(d)

    return AnnData(**d)
Example #25
def main():
    parser = argparse.ArgumentParser(
        description="Update a Zarr's start_date and "
        "stop_date attributes to match its data.")
    parser.add_argument("zarr",
                        metavar="PATH_OR_URL",
                        help="Path or URL of the Zarr store to update")
    parser.add_argument("--dry-run",
                        "-d",
                        action="store_true",
                        help="Don't actually write metadata")
    parser.add_argument("--verbose",
                        "-v",
                        action="store_true",
                        help="Report progress to standard output")
    args = parser.parse_args()
    ds = xr.open_zarr(args.zarr)
    z = zarr.open(args.zarr)
    t0 = ds.time[0].values
    t1 = ds.time[-1].values
    if args.verbose:
        print("First/last times:", t0, t1)
    new_attrs = dict(start_date=pd.to_datetime(t0).strftime("%Y-%m-%d"),
                     stop_date=pd.to_datetime(t1).strftime("%Y-%m-%d"))
    if args.verbose:
        for title, dic in ("Old", z.attrs), ("New", new_attrs):
            print(f"{title} attributes:")
            for key in "start_date", "stop_date":
                print(f'    {key}: ' +
                      (dic[key] if key in dic else "not present"))
    if args.dry_run:
        if args.verbose:
            print("Dry run -- not updating.")
    else:
        z.attrs.update(new_attrs)
        zarr.consolidate_metadata(args.zarr)
        if args.verbose:
            print("Attributes updated.")
Example #26
def fromzarr(path, group=None, dataset=None, chunk_size=None):
    import zarr

    if isinstance(path, zarr.Array):
        arr = path
        if isinstance(arr.store, FSMap):
            root = arr.store.root
            path, dataset = root.rsplit('/', 1)
        else:
            path = arr.store.path
            if '/' in arr.path and group is None:
                group = arr.path.rsplit('/', 1)[0]
            dataset = arr.basename
            if not dataset:
                path, dataset = path.rsplit('/', 1)
        shape = arr.shape
    elif isinstance(path, str):
        fs = get_fs(path, None)
        fs_map = FSMap(path, fs)

        if group is None and dataset is None:
            arr = zarr.open(fs_map)
            if isinstance(arr, zarr.Array):
                return fromzarr(arr, chunk_size=chunk_size)

        g = zarr.group(store=fs_map)
        arr = g[TensorFromZarr.get_path(group, dataset)]
        shape = arr.shape
    else:
        raise TypeError('`path` passed has wrong type, '
                        'expected str or zarr.Array, '
                        f'got {type(path)}')

    chunk_size = chunk_size if chunk_size is not None else arr.chunks
    op = TensorFromZarr(filename=path, group=group, dataset=dataset,
                        dtype=arr.dtype)
    return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order))
Example #27
 def setFactorMetaData(self,
                       table_name,
                       ifactor_name,
                       key=None,
                       value=None,
                       meta_data=None):
     with self._DataLock:
         iZTable = zarr.open(self.MainDir + os.sep + table_name, mode="a")
         iZFactor = iZTable[ifactor_name]
         if key is not None:
             if key in iZFactor.attrs:
                 del iZFactor.attrs[key]
             if isinstance(value, np.ndarray):
                 iZFactor.attrs[key] = {
                     "_Type": "Array",
                     "List": value.tolist()
                 }
             elif isinstance(value, pd.Series):
                 iZFactor.attrs[key] = {
                     "_Type": "Series",
                     "Json": value.to_json(index=True)
                 }
             elif isinstance(value, pd.DataFrame):
                 iZFactor.attrs[key] = {
                     "_Type": "DataFrame",
                     "Json": value.to_json(index=True)
                 }
             elif value is not None:
                 iZFactor.attrs[key] = value
     if meta_data is not None:
         for iKey in meta_data:
             self.setFactorMetaData(table_name,
                                    ifactor_name=ifactor_name,
                                    key=iKey,
                                    value=meta_data[iKey],
                                    meta_data=None)
     return 0
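The `_Type`/`Json` wrappers exist because zarr attributes must be JSON-serializable. A small sketch of the round trip (store name is illustrative):

import pandas as pd
import zarr

z = zarr.open('table.zarr', mode='a')
s = pd.Series([1.0, 2.0], index=['a', 'b'])

# Pandas objects are stored as plain JSON text inside the attribute dict...
z.attrs['my_series'] = {'_Type': 'Series', 'Json': s.to_json(index=True)}

# ...and rebuilt on read, mirroring getMetaData/setFactorMetaData above.
restored = pd.read_json(z.attrs['my_series']['Json'], typ='series')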
Example #28
    def __init__(
        self,
        file: str,
        band: int = 0,
        as_crs: Optional[int] = 4326,
        crs_code: Optional[int] = None,
    ):
        """For representing a geotiff

        Args:
            file (str): Location of the geotiff file
            band (int): The band of the tiff file to use. Defaults to 0.
            as_crs (Optional[int]): The epsg crs code to read the data as.  Defaults to 4326 (WGS84).
            crs_code (Optional[int]): The epsg crs code of the tiff file. Include this if the crs code can't be detected.

        """
        self.file = file
        self._as_crs = crs_code if as_crs is None else as_crs
        tif = TiffFile(self.file)

        if not tif.is_geotiff:
            raise Exception("Not a geotiff file")

        store = tif.aszarr(key=band)
        self._z = zarr.open(store, mode="r")
        store.close()
        if isinstance(crs_code, int):
            self._crs_code: int = crs_code
        else:
            self._crs_code = self._get_crs_code(tif.geotiff_metadata)
        self._tif_shape: List[int] = self._z.shape
        scale: Tuple[float, float,
                     float] = tif.geotiff_metadata["ModelPixelScale"]
        tilePoint: List[float] = tif.geotiff_metadata["ModelTiepoint"]
        self._tifTrans: TifTransformer = TifTransformer(
            self._tif_shape[0], self._tif_shape[1], scale, tilePoint)
        tif.close()
Example #29
def main(input_paths, out_path):
    in_files = [h5py.File(in_path) for in_path in input_paths]
    in_file_sizes = [0] * len(in_files)
    out_file = zarr.open(out_path, mode='a')

    datasets = list(in_files[0].keys())
    datasets_per_file = defaultdict(dict)
    for in_path, in_file in enumerate(in_files):
        in_file_sizes[in_path] = len(in_file[datasets[0]])
        for dataset in datasets:
            datasets_per_file[in_path][dataset] = in_file[dataset]

    for dataset in datasets:
        current_infile = datasets_per_file[0][dataset]
        args = dict(
            name=dataset,
            shape=current_infile.shape,
            chunks=CHUNK_SHAPES.get(dataset,
                                    (CHUNK_SIZE, ) + current_infile.shape[1:]),
            dtype=current_infile.dtype,
        )
        args.update(ENCODERS.get(dataset, DEFAULT_ENCODER))
        print("Dataset %s: %s" % (dataset, args))
        out_set = out_file.create_dataset(**args)

        for in_path in range(len(input_paths)):
            print("In-file %s..." % input_paths[in_path])
            current_infile = datasets_per_file[in_path][dataset]

            num_batches = int(np.ceil(len(current_infile) / BATCH_SIZE))
            for index_batch in tqdm.tqdm(range(num_batches)):
                start = index_batch * BATCH_SIZE
                end = min(len(current_infile), (index_batch + 1) * BATCH_SIZE)

                out_set[start:end] = current_infile[start:end]
    out_file.store.close()
    return
Example #30
def labels_to_zarr(path, data, meta):
    """Write a 2D+ labels layer to zarr, chunked along the
    last two dimensions, presumed shape (..., y, x)

    Parameters
    ----------
    path : str
        Path to save to disk. Must end with .zarr
    data : array
        Labels data to be written
    meta : dict
        Labels metadata

    Returns
    -------
    str or None
        path if any labels were written, otherwise None
    """
    if not path.endswith('.zarr'):
        return None

    zarr_shape = data.shape
    zarr_dtype = data.dtype

    # we assume x,y are the final two dimensions and chunk accordingly
    zarr_chunks = tuple([1 for i in range(len(zarr_shape) - 2)] + [1024, 1024])

    # TODO: compression type? Get from user?
    out_zarr = zarr.open(
        path, 
        mode='w',
        shape=zarr_shape,
        dtype=zarr_dtype,
        chunks=zarr_chunks
    )
    out_zarr[:] = data[:]
    return path
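A hedged usage sketch; the label volume and path are illustrative:

import numpy as np
import zarr

labels = np.zeros((3, 4096, 4096), dtype=np.uint32)  # presumed (plane, y, x)
labels[:, 1000:1200, 1000:1200] = 7
labels_to_zarr('labels.zarr', labels, meta={})

# Chunks follow the "1 along leading dims, 1024 x 1024 in y/x" rule above.
assert zarr.open('labels.zarr', mode='r').chunks == (1, 1024, 1024)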
Example #31
    def set_provenance(self, src_file_names, prov_dict):
        """Set the Provenance group in the nc file.

        Parameters
        ----------
        src_file_names
            list of source filenames
        prov_dict
            dictionary containing file conversion parameters:
            prov_dict['conversion_software_name'],
            prov_dict['conversion_software_version'],
            prov_dict['conversion_time']
        """
        # create group
        files = ", ".join([os.path.basename(file) for file in src_file_names])
        if self.format == '.nc':
            file = netCDF4.Dataset(self.file_path, "a", format="NETCDF4")
            pr = file.createGroup("Provenance")
            # dimensions
            pr.createDimension("filenames", None)
            # variables
            pr_src_fnames = pr.createVariable(files, str, "filenames")
            pr_src_fnames.long_name = "Source filenames"

            # set group attributes
            for k, v in prov_dict.items():
                pr.setncattr(k, v)
            # close nc file
            file.close()

        elif self.format == '.zarr':
            file = zarr.open(self.file_path, 'a')
            pr = file.create_group('Provenance')
            pr_src_fnames = pr.create_dataset('filenames', data=files)
            pr_src_fnames.attrs['long_name'] = "Source filenames"
            for k, v in prov_dict.items():
                pr[k] = v
Example #32
def subset_assay_zarr(
    zarr_fn: str,
    in_grp: str,
    out_grp: str,
    cells_idx: np.ndarray,
    feat_idx: np.ndarray,
    chunk_size: tuple,
):
    """
    Selects a subset of the data in an assay in the specified Zarr hierarchy.

    For the arguments `cells_idx` and `feat_idx`, refer to the documentation for numpy.split:
    https://numpy.org/doc/stable/reference/generated/numpy.split.html

    Args:
        zarr_fn: The file name for the Zarr hierarchy.
        in_grp: Group in Zarr hierarchy to subset.
        out_grp: Group name in Zarr hierarchy to write subsetted assay to.
        cells_idx: A list of cell indices to keep in the subsetted assay.
        feat_idx: A list of feature indices to keep in the subsetted assay.
        chunk_size: The requested size of chunks to load into memory and process.

    Returns:
        None
    """
    z = zarr.open(zarr_fn, "r+")
    ig = z[in_grp]
    og = create_zarr_dataset(z, out_grp, chunk_size, "uint32",
                             (len(cells_idx), len(feat_idx)))
    pos_start, pos_end = 0, 0
    for i in tqdmbar(
            np.array_split(cells_idx,
                           len(cells_idx) // chunk_size[0] + 1)):
        pos_end += len(i)
        og[pos_start:pos_end, :] = ig.get_orthogonal_selection((i, feat_idx))
        pos_start = pos_end
    return None
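The batched copy above relies on zarr's orthogonal (outer) indexing. A stand-alone sketch of that primitive:

import numpy as np
import zarr

z = zarr.open('m.zarr', mode='w', shape=(6, 4), chunks=(3, 4), dtype='i4')
z[:] = np.arange(24).reshape(6, 4)

rows = np.array([0, 2, 5])
cols = np.array([1, 3])
# Outer product of the two index sets: the result has shape (3, 2).
block = z.get_orthogonal_selection((rows, cols))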
Example #33
File: core.py Project: elaeon/ML
 def open(self):
     if self.conn is None:
         self.conn = zarr.open(self.url, mode=self.mode)
         self.attrs = self.conn.attrs
     return self