Example #1
def test_info():

    # setup
    g = zarr.group(store=dict(), chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=10, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type', 'Chunk store type',
        'No. members', 'No. arrays', 'No. groups', 'Arrays', 'Groups', 'Name'
    ])
    assert_list_equal(expected_keys, keys)

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only', 'Filter [0]',
        'Compressor', 'Synchronizer type', 'Store type', 'Chunk store type', 'No. bytes',
        'No. bytes stored', 'Storage ratio', 'Chunks initialized', 'Name'
    ])
    assert_list_equal(expected_keys, keys)
Example #2
def main():
    # Command line args are in sys.argv[1], sys.argv[2] ..
    # sys.argv[0] is the script name itself and can be ignored
    # parse arguments
    parser = argparse.ArgumentParser(
        description="Runs Conway's Game of Life simulation.")
    # add arguments
    parser.add_argument("--grid-size", dest="N", required=False)
    parser.add_argument("--mov-file", dest="movfile", required=False)
    parser.add_argument("--interval", dest="interval", required=False)
    parser.add_argument("--glider", action="store_true", required=False)
    parser.add_argument("--gosper", action="store_true", required=False)
    parser.add_argument("--port")
    args = parser.parse_args()

    # set grid size
    N = 100
    if args.N and int(args.N) > 8:
        N = int(args.N)

    # set animation update interval
    update_interval = 1
    if args.interval:
        update_interval = int(args.interval)

    # declare grid
    grid = np.array([])
    # check if "glider" demo flag is specified
    if args.glider:
        grid = np.zeros(N * N).reshape(N, N)
        add_glider(1, 1, grid)
    elif args.gosper:
        grid = np.zeros(N * N).reshape(N, N)
        add_gosper_glider_gun(10, 10, grid)
    else:
        # populate grid with random on/off - more off than on
        grid = random_grid(N)

    store = zarr.RedisStore(port=args.port)
    root = zarr.group(store=store, overwrite=True)
    t = 0
    while True:
        arr = root.zeros(f"{t}", shape=grid.shape, chunks=(25, 25))
        arr[...] = grid

        t += 1
        time.sleep(update_interval)
        grid = update(grid, N)
        print(t, grid)
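
A minimal consumer-side sketch for the example above, assuming the same Redis instance: it polls the group for the next frame key ("0", "1", ...) written by the producer loop. The helper name follow_frames and the polling interval are illustrative, not part of the original project.

import time
import zarr

def follow_frames(port, poll=1.0):
    # Hypothetical reader for the Redis-backed grid written above.
    store = zarr.RedisStore(port=port)
    root = zarr.group(store=store)
    t = 0
    while True:
        key = f"{t}"
        if key in root:
            frame = root[key][...]  # load frame t as a NumPy array
            print(t, int((frame != 0).sum()), "cells alive")
            t += 1
        else:
            time.sleep(poll)  # wait for the producer to publish the next frame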
Example #3
def fromzarr(path, group=None, dataset=None, chunk_size=None):
    import zarr

    try:
        # since v2.11.0, zarr converts mutable mappings to KVStore
        from zarr.storage import KVStore as zarr_kvstore
    except ImportError:  # pragma: no cover
        zarr_kvstore = None

    if isinstance(path, zarr.Array):
        arr = path
        if zarr_kvstore is None and isinstance(arr.store,
                                               FSMap):  # pragma: no cover
            root = arr.store.root
            path, dataset = root.rsplit("/", 1)
        elif zarr_kvstore and isinstance(arr.store, zarr_kvstore):
            root = arr.store._mutable_mapping.root
            path, dataset = root.rsplit("/", 1)
        else:
            path = arr.store.path
            if "/" in arr.path and group is None:
                group = arr.path.rsplit("/", 1)[0]
            dataset = arr.basename
            if not dataset:
                path, dataset = path.rsplit("/", 1)
        shape = arr.shape
    elif isinstance(path, str):
        fs = get_fs(path, None)
        fs_map = FSMap(path, fs)

        if group is None and dataset is None:
            arr = zarr.open(fs_map)
            if isinstance(arr, zarr.Array):
                return fromzarr(arr, chunk_size=chunk_size)

        g = zarr.group(store=fs_map)
        arr = g[TensorFromZarr.get_path(group, dataset)]
        shape = arr.shape
    else:
        raise TypeError("`path` passed has wrong type, "
                        "expected str or zarr.Array, "
                        f"got {type(path)}")

    chunk_size = chunk_size if chunk_size is not None else arr.chunks
    op = TensorFromZarr(filename=path,
                        group=group,
                        dataset=dataset,
                        dtype=arr.dtype)
    return op(shape, chunk_size=chunk_size, order=TensorOrder(arr.order))
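
A usage sketch under assumptions: fromzarr accepts either a path string (resolved through fsspec) or an existing zarr.Array, and chunk_size falls back to the array's own chunks when omitted. The paths and the group/dataset names below are placeholders.

import zarr

# Hypothetical calls; '/data/store.zarr', 'calldata' and 'GT' are placeholders.
t1 = fromzarr('/data/store.zarr', group='calldata', dataset='GT')                 # path form
t2 = fromzarr(zarr.open('/data/store.zarr/calldata/GT'), chunk_size=(1000, 10))   # array form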
Example #4
File: data.py Project: jw4hv/lagomorph
def write_dataset_zarr(dataset, path, key='images'):
    """
    Given a PyTorch Dataset or array_like, write a Zarr dataset.

    We assume that the dataset returns either a single image, or a tuple whose
    first entry is an image. For example, in order to return both an image and a
    set of labels, the dataset can return those as a pair of torch Tensors. Note
    that the names used for the members of the tuple can be overridden with the
    argument 'key'.
    """
    try:
        import zarr, lmdb
    except ImportError:
        print(
            'Please install the zarr and lmdb libraries to use write_dataset_zarr.'
        )
        raise
    from .utils import tqdm
    if not isinstance(key, tuple):
        # make key a tuple if it's not already
        key = (key, )
    store = zarr.DirectoryStore(path)
    root = zarr.group(store=store, overwrite=True)
    # determine size needed for the zarr dataset
    ds0 = dataset[0]
    if not isinstance(ds0, tuple):
        ds0 = (ds0, )
    # check that the length of the tuple matches args
    if len(ds0) != len(key):
        raise Exception(f"Dataset returns tuple with {len(ds0)} entries, "
                        f"but only {len(key)} keys given")
    ds = []
    for d, k in zip(ds0, key):
        dtype = d.dtype
        if isinstance(d, torch.Tensor):  # need a numpy dtype for zarr
            dtype = d.view(-1)[0].cpu().numpy().dtype
        sh = d.shape
        ds.append(
            root.zeros('/' + k,
                       shape=(len(dataset), *sh),
                       chunks=(1, *sh),
                       dtype=dtype))
    for i, di in enumerate(tqdm(dataset)):
        if not isinstance(di, (tuple, list)):
            di = [di]
        for I, dsi in zip(di, ds):
            if isinstance(I, torch.Tensor):
                I = I.cpu().numpy()
            dsi[i, ...] = I
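
A usage sketch, assuming a toy in-memory dataset of (image, label) pairs and that torch, zarr and lmdb are importable (the function's import guard requires lmdb even though the output is a DirectoryStore). The shapes, the output path and the 'labels' key are illustrative.

import torch
import zarr

# Hypothetical toy dataset: four (image, label) pairs.
toy_dataset = [(torch.rand(1, 32, 32), torch.tensor([i])) for i in range(4)]
write_dataset_zarr(toy_dataset, '/tmp/toy.zarr', key=('images', 'labels'))

root = zarr.open_group('/tmp/toy.zarr', mode='r')
print(root['images'].shape, root['labels'].shape)  # (4, 1, 32, 32) (4, 1)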
Example #5
    def __create_group(
        self, store: MutableMapping, base: np.ndarray, pyramid: List[np.ndarray]
    ) -> zarr.hierarchy.Group:
        """Create group and datasets."""
        grp = zarr.group(store)
        grp.create_dataset("base", data=base)
        series = []
        for i, dataset in enumerate(pyramid):
            if i == 0:
                path = "base"
            else:
                path = "%s" % i
                grp.create_dataset(path, data=pyramid[i])
            series.append({"path": path})
        return grp
Example #6
def test_copy_all():
    """
    https://github.com/zarr-developers/zarr-python/issues/269

    copy_all used to not copy attributes as `.keys()` does not return hidden `.zattrs`.

    """
    original_group = zarr.group(store=MemoryStore(), overwrite=True)
    original_group.attrs["info"] = "group attrs"
    original_subgroup = original_group.create_group("subgroup")
    original_subgroup.attrs["info"] = "sub attrs"

    destination_group = zarr.group(store=MemoryStore(), overwrite=True)

    # copy attributes and members from one in-memory group to another
    copy_all(
        original_group,
        destination_group,
        dry_run=False,
    )

    assert 'subgroup' in destination_group
    assert destination_group.attrs["info"] == "group attrs"
    assert destination_group.subgroup.attrs["info"] == "sub attrs"
Example #7
def compress_zarr_dataset(data,
                          file_path,
                          compression='lz4',
                          clevel=5,
                          start_idx=0,
                          end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level
    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported to the specified index,
    excluding the sample at end_idx (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
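
A usage sketch under assumptions: re-compress the first 1000 samples of an existing ZipStore-backed dataset with zstd at level 7. The input path and the contents of the source group are placeholders; only the compress_zarr_dataset signature above is relied upon.

import zarr

# Hypothetical input; './export/raw_data.zip' is a placeholder path.
source_store = zarr.ZipStore("./export/raw_data.zip", mode="r")
source_data = zarr.group(store=source_store)
had_nan = compress_zarr_dataset(source_data, "./export/data_zstd.zip",
                                compression="zstd", clevel=7,
                                start_idx=0, end_idx=1000)
source_store.close()
if had_nan:
    print("Warning: NaN values detected in the exported arrays")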
Example #8
def test_select_mask():

    # Setup.
    pos = np.arange(100)
    gt = np.random.randint(low=-1, high=4, size=(100, 10))
    mask = np.zeros(100, dtype=bool)
    mask[1:99:3] = True

    # Numpy array.
    for a in pos, gt:
        expect = a.compress(mask, axis=0)
        actual = select_mask(a, mask, axis=0)
        assert isinstance(actual, np.ndarray)
        assert_array_equal(expect, actual)

    # Dask array.
    for a in pos, gt:
        expect = a.compress(mask, axis=0)
        d = da.from_array(a)
        actual = select_mask(d, mask, axis=0)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())
        # With mask as dask array.
        actual = select_mask(d, da.from_array(mask), axis=0)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())

    # Numpy group.
    g = DictGroup({"variants": {"POS": pos}, "calldata": {"GT": gt}})
    actual = select_mask(g, mask, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], np.ndarray)
    assert isinstance(actual["calldata"]["GT"], np.ndarray)
    assert_array_equal(pos.compress(mask, axis=0), actual["variants"]["POS"])
    assert_array_equal(gt.compress(mask, axis=0), actual["calldata"]["GT"])

    # Zarr group.
    g = zarr.group()
    g.create_dataset("variants/POS", data=pos)
    g.create_dataset("calldata/GT", data=gt)
    actual = select_mask(g, mask, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], da.Array)
    assert isinstance(actual["calldata"]["GT"], da.Array)
    assert_array_equal(pos.compress(mask, axis=0),
                       actual["variants"]["POS"].compute())
    assert_array_equal(gt.compress(mask, axis=0),
                       actual["calldata"]["GT"].compute())
Example #9
def generate_gt_data(sample: str, chromosome_gt: dict, chunks=1):
    root = zarr.group()
    sample_group = root.create_group(sample)
    sample_group.create_groups(*chromosome_gt.keys())
    for i in sample_group:
        gt = chromosome_gt[i]
        sites = len(gt)
        gt_data = [[x] for x in gt]
        calldata = sample_group[i].create_group("calldata")
        calldata.create_dataset('GT',
                                shape=(sites, 1, 2),
                                chunks=(chunks, 1, 2),
                                dtype='int8',
                                data=gt_data)

    return root
Example #10
    def setUp(self):
        self.ndims = 7
        num_datasets = 3

        self.temp_dir_zarr = tempfile.TemporaryDirectory(suffix=".zgroup")
        self.zarr_group = zarr.group(store=self.temp_dir_zarr.name,
                                     overwrite=True)
        self.dset_list = list(
            self.zarr_group.create_dataset(
                name='zarray' + str(i),
                data=np.random.rand(*self.srand.choices(
                    range(1, 90 // self.ndims), k=self.ndims)))
            for i in range(num_datasets))
        self.dsetview_list = list(
            DatasetView(self.dset_list[i]) for i in range(num_datasets))
        print(LazyOpszarrTest)
Example #11
File: from_zarr.py Project: qinxuye/mars
    def execute(cls, ctx, op):
        import zarr

        axis_offsets = op.axis_offsets
        shape = op.outputs[0].shape

        fs = get_fs(op.filename, None)
        fs_map = FSMap(op.filename, fs)

        root = zarr.group(store=fs_map)
        path = cls.get_path(op.group, op.dataset)
        arr = root[path]

        data = arr[tuple(slice(offset, offset + size)
                         for offset, size in zip(axis_offsets, shape))]
        ctx[op.outputs[0].key] = data
Example #12
def setup_input(samples, input_pattern, seqid, field):
    log('Setting up input array ...')
    input_paths = [input_pattern.format(sample=s) for s in samples]
    input_stores = [zarr.ZipStore(ip, mode='r') for ip in input_paths]
    input_roots = [zarr.group(store) for store in input_stores]
    input_arrays = [
        root[s][seqid][field] for root, s in zip(input_roots, samples)
    ]
    input_arrays = [da.from_array(a, chunks=a.chunks) for a in input_arrays]

    # add a dimension so the hstack works; all arrays must share shape (X, 1)
    input_arrays = [a[:, None] if a.ndim == 1 else a for a in input_arrays]

    input_array = da.hstack(input_arrays)
    log('Input array:', input_array)
    return input_array
Example #13
def check_array_setup(samples, input_pattern, seqid, field):
    log('Determining number of variants ...')
    path = input_pattern.format(sample=samples[0])
    callset = zarr.group(zarr.ZipStore(path, mode='r'))
    # expect sample name in hierarchy

    try:
        array = callset[samples[0]][seqid][field]
    except KeyError:
        field = field.replace("calldata/", "variants/")
        array = callset[samples[0]][seqid][field]
        log("{field} found in `variants` not `calldata`".format(field=field))

    n_variants = array.shape[0]
    log('Found {:,} variants.'.format(n_variants))
    return array, field
Example #14
    def __init__(self, h5f: BinaryIO, url: str,
                 xarray: bool = False, spec=1, inline_threshold=0):
        # Open HDF5 file in read mode...
        lggr.debug(f'HDF5 file: {h5f}')
        self.input_file = h5f
        lggr.debug(f'xarray: {xarray}')
        self.spec = spec
        self.inline = inline_threshold
        self._h5f = h5py.File(h5f, mode='r')
        self._xr = xarray

        self.store = {}
        self._zroot = zarr.group(store=self.store, overwrite=True)

        self._uri = url
        lggr.debug(f'HDF5 file URI: {self._uri}')
Example #15
def prepare_zarr_storage(variations, out_path):
    store = zarr.DirectoryStore(str(out_path))
    root = zarr.group(store=store, overwrite=True)
    metadata = variations.metadata
    sources = []
    targets = []

    samples_array = variations.samples
    #samples_array.compute_chunk_sizes()
    sources.append(samples_array)

    object_codec = None
    if samples_array.dtype == object:
        object_codec = numcodecs.VLenUTF8()

    dataset = zarr.create(shape=samples_array.shape, path='samples', store=store,
                          dtype=samples_array.dtype, object_codec=object_codec)
    targets.append(dataset)

    variants = root.create_group(ZARR_VARIANTS_GROUP_NAME, overwrite=True)
    calls = root.create_group(ZARR_CALL_GROUP_NAME, overwrite=True)
    for field, array in variations.items():
        definition = ALLELE_ZARR_DEFINITION_MAPPINGS[field]

        field_metadata = metadata.get(field, None)
        array = variations[field]
        if array is None:
            continue
        array.compute_chunk_sizes()
        sources.append(array)

        group_name = definition['group']
        group = calls if group_name == ZARR_CALL_GROUP_NAME else variants
        path = os.path.sep + os.path.join(group.path, definition['field'])

        object_codec = None
        if array.dtype == object:
            object_codec = numcodecs.VLenUTF8()
        dataset = zarr.create(shape=array.shape, path=path, store=store,
                              object_codec=object_codec, dtype=array.dtype)
        if field_metadata is not None:
            for key, value in field_metadata.items():
                dataset.attrs[key] = value

        targets.append(dataset)

    lock = SerializableLock()
    return da.store(sources, targets, compute=False, lock=lock)
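
Note that the function returns a lazy da.store task (compute=False), so nothing is written until the task is computed. A minimal driving sketch, assuming variations is the project's variations container and the output path is a placeholder:

# prepare_zarr_storage only builds the zarr hierarchy and a delayed store task.
delayed_store = prepare_zarr_storage(variations, '/tmp/variations.zarr')
delayed_store.compute()  # materialize all arrays into the DirectoryStore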
Example #16
    def load(cls, path: PathType):
        """Load existing DirectoryStore state into a MemoryStore object."""
        memory_store = zarr.MemoryStore()
        directory_store = zarr.DirectoryStore(path)
        zarr.convenience.copy_store(source=directory_store, dest=memory_store)

        group = zarr.group(store=memory_store)
        zdim = group[cls._filesystem.pars].shape[1]
        return MemoryStore(params=zdim, zarr_store=memory_store)
Example #17
    def create_tile_directory(self, resolution, width, height):
        tile_directory = os.path.join(self.slide_directory, str(resolution))
        if self.file_type in ("n5", "zarr"):
            tile_directory = os.path.join(self.slide_directory,
                                          "pyramid.%s" % self.file_type)
            self.zarr_store = zarr.DirectoryStore(tile_directory)
            if self.file_type == "n5":
                self.zarr_store = zarr.N5Store(tile_directory)
            self.zarr_group = zarr.group(store=self.zarr_store)
            self.zarr_group.create_dataset(str(resolution),
                                           shape=(3, height, width),
                                           chunks=(None, self.tile_height,
                                                   self.tile_width),
                                           dtype='B')
        else:
            os.mkdir(tile_directory)
        return tile_directory
Example #18
    def table(self, data, names=None, expectedlen=None, **kwargs):

        # setup
        names, columns = _util.check_table_like(data, names=names)
        kwargs = self._set_defaults(kwargs)
        g = zarr.group(**kwargs)

        # create columns
        chunks = kwargs.get('chunks', None)
        for n, c in zip(names, columns):
            if chunks is None:
                chunks = default_chunks(c, expectedlen)
            g.array(name=n, data=c, chunks=chunks)

        # create table
        ztbl = ZarrTable(g, names=names)
        return ztbl
Example #19
    def create_tile_directory(self, series, resolution, width, height):
        tile_directory = os.path.join(self.slide_directory,
                                      "data.%s" % self.file_type)
        self.zarr_store = zarr.DirectoryStore(tile_directory)
        if self.file_type == "n5":
            self.zarr_store = zarr.N5Store(tile_directory)
        self.zarr_group = zarr.group(store=self.zarr_store)
        self.zarr_group.attrs['bioformats2raw.layout'] = LAYOUT_VERSION

        # important to explicitly set the chunk size to 1 for non-XY dims
        # setting to None may cause all planes to be chunked together
        # ordering is TZCYX and hard-coded since Z and T are not present
        self.zarr_group.create_dataset(
            "%s/%s" % (str(series), str(resolution)),
            shape=(1, 1, 3, height, width),
            chunks=(1, 1, 1, self.tile_height, self.tile_width),
            dtype='B')
Example #20
def test_select_values():

    # Setup.
    pos = np.arange(1, 300, 3)
    gt = np.random.randint(low=-1, high=4, size=(100, 10))
    query = [31, 61]

    # Numpy array.
    for a in pos, gt:
        expect = a[[10, 20]]
        actual = select_values(a, pos, query, axis=0)
        assert isinstance(actual, np.ndarray)
        assert_array_equal(expect, actual)

    # Dask array.
    for a in pos, gt:
        expect = a[[10, 20]]
        d = da.from_array(a)
        actual = select_values(d, pos, query, axis=0)
        assert isinstance(actual, da.Array)
        assert_array_equal(expect, actual.compute())

    # Numpy group.
    g = DictGroup({"variants": {"POS": pos}, "calldata": {"GT": gt}})
    actual = select_values(g, "variants/POS", query, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], np.ndarray)
    assert isinstance(actual["calldata"]["GT"], np.ndarray)
    assert_array_equal(pos[[10, 20]], actual["variants"]["POS"])
    assert_array_equal(gt[[10, 20]], actual["calldata"]["GT"])

    # Zarr group.
    g = zarr.group()
    g.create_dataset("variants/POS", data=pos)
    g.create_dataset("calldata/GT", data=gt)
    actual = select_values(g, "variants/POS", query, axis=0)
    assert isinstance(actual, GroupSelection)
    assert isinstance(actual["variants"]["POS"], da.Array)
    assert isinstance(actual["calldata"]["GT"], da.Array)
    assert_array_equal(pos[[10, 20]], actual["variants"]["POS"].compute())
    assert_array_equal(gt[[10, 20]], actual["calldata"]["GT"].compute())

    # Errors.
    with pytest.raises(KeyError):
        select_values(gt, pos, query=[1, 999], axis=0)
Example #21
File: array.py Project: bebatut/zappy
    def __init__(
        self,
        executor,
        dag,
        input,
        shape,
        chunks,
        dtype,
        partition_row_counts=None,
        intermediate_store=None,
    ):
        ZappyArray.__init__(self, shape, chunks, dtype, partition_row_counts)
        self.executor = executor
        self.dag = dag
        self.input = input
        if intermediate_store is None:
            intermediate_store = zarr.group()
        self.intermediate_store = intermediate_store
Example #22
def output_to_zarr(path, seq_id, sample_id, arrays, cname, clevel, shuffle):

    log('Output zarr to {!r} ...'.format(path))

    store = zarr.ZipStore(path, mode='w')
    root = zarr.group(store=store)
    callset = root.create_group(sample_id)
    seq_group = callset.require_group(seq_id)
    calldata_group = seq_group.require_group('calldata')
    variants_group = seq_group.require_group('variants')

    compressor = numcodecs.Blosc(cname=cname, clevel=clevel, shuffle=shuffle)

    for key, value in arrays.items():
        calldata_group.create_dataset(key, data=value, compressor=compressor)
        log('Created output array: ' + repr(key))

    store.close()
Example #23
def load(input_obj):
    """ """
    trx = TrxFile()
    if isinstance(input_obj, str):
        if os.path.isdir(input_obj):
            store = zarr.storage.DirectoryStore(input_obj)
        elif os.path.isfile(input_obj) and \
                os.path.splitext(input_obj)[1] in ['.zip', '.trx']:
            store = zarr.ZipStore(input_obj)
        else:
            raise ValueError('Invalid input path/filename.')
    else:
        store = input_obj

    trx._zcontainer = zarr.group(store=store, overwrite=False)
    trx.storage = store

    return trx
Example #24
    def __init__(
        self,
        params: Union[int, list],
        zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
        simulator=None,
        sync_path: Optional[PathType] = None,
    ):
        """Initialize Store content dimensions.

        Args:
            params (list of strings or int): List of parameter names.  If int use ['z0', 'z1', ...].
            zarr_store: zarr storage.
            sync_path: path to the cache lock files. Must be accessible to all
                processes working on the cache.
        """
        self._zarr_store = zarr_store
        self._simulator = simulator

        if isinstance(params, int):
            params = ["z%i" % i for i in range(params)]
        self.params = params

        synchronizer = zarr.ProcessSynchronizer(sync_path) if sync_path else None
        self._root = zarr.group(store=self.zarr_store, synchronizer=synchronizer)

        logging.debug("  params = %s" % str(params))

        if set(["samples", "metadata"]) == set(self._root.keys()):
            logging.info("Loading existing store.")
            self._update()
        elif len(self._root.keys()) == 0:
            logging.info("Creating new store.")
            self._setup_new_zarr_store(
                len(self.params), simulator.sim_shapes, self._root
            )
            logging.debug("  sim_shapes = %s" % str(simulator.sim_shapes))
        else:
            raise KeyError(
                "The zarr storage is corrupted. It should either be empty or only have the keys ['samples', 'metadata']."
            )

        self._lock = None
        if sync_path is not None:
            self._setup_lock(sync_path)
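
A construction sketch under assumptions: the class name Store and the simulator object are placeholders, but the int-to-name expansion follows the branch above, so params=3 yields ['z0', 'z1', 'z2'].

import zarr

# Hypothetical construction; `my_simulator` must expose `sim_shapes` as used above.
memory_store = zarr.MemoryStore()
store = Store(params=3, zarr_store=memory_store, simulator=my_simulator)
print(store.params)  # ['z0', 'z1', 'z2']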
Example #25
    def read_features(cls, path: FilepathType) -> FeatureMap:
        path = Path(path).resolve()
        lock = FileLock(_lockfile(path))
        with lock, zarr.ZipStore(path, mode="r") as store:
            root = zarr.group(store=store)
            features = root.attrs[_FEATURES_KEY]
        raw_features = FeatureMap.deserialize(features)

        if len(cls.features) == 0:
            return raw_features

        for required_feat_name, required_feat in cls.features.items():
            if (required_feat_name not in raw_features
                    or raw_features[required_feat_name] != required_feat):
                raise RuntimeError(
                    "Dataset stored at %s is incompatible with %s" %
                    (path, cls.__name__))

        return cls.features
Example #26
def test_rechunk_group(tmp_path, executor, source_store, target_store,
                       temp_store):
    if source_store.startswith("mapper"):
        fsspec = pytest.importorskip("fsspec")
        store_source = fsspec.get_mapper(str(tmp_path) + source_store)
        target_store = fsspec.get_mapper(str(tmp_path) + target_store)
        temp_store = fsspec.get_mapper(str(tmp_path) + temp_store)
    else:
        store_source = str(tmp_path / source_store)
        target_store = str(tmp_path / target_store)
        temp_store = str(tmp_path / temp_store)

    group = zarr.group(store_source)
    group.attrs["foo"] = "bar"
    # 800 byte chunks
    a = group.ones("a", shape=(5, 10, 20), chunks=(1, 10, 20), dtype="f4")
    a.attrs["foo"] = "bar"
    b = group.ones("b", shape=(20, ), chunks=(10, ), dtype="f4")
    b.attrs["foo"] = "bar"

    max_mem = 1600  # should force a two-step plan for a
    target_chunks = {"a": (5, 10, 4), "b": (20, )}

    rechunked = api.rechunk(
        group,
        target_chunks,
        max_mem,
        target_store,
        temp_store=temp_store,
        executor=executor,
    )
    assert isinstance(rechunked, api.Rechunked)

    target_group = zarr.open(target_store)
    assert "a" in target_group
    assert "b" in target_group
    assert dict(group.attrs) == dict(target_group.attrs)

    rechunked.execute()
    for aname in target_chunks:
        assert target_group[aname].chunks == target_chunks[aname]
        a_tar = dsa.from_zarr(target_group[aname])
        assert dsa.equal(a_tar, 1).all().compute()
Example #27
    def to_filename(self, filename):
        """
        Stores the greyordinate data to the given filename.

        Type of storage is determined by the extension of the filename:

        - .dscalar/dconn/dlabel.nii: CIFTI file
        - .h5/hdf5/he2/he5: HDF5 file representing CIFTI data
        - .zarr: zarr file representing CIFTI data
        - .gii: GIFTI file (only stores surface data;
            raises an error if more than one surface is represented in the greyordinates)
        - .nii: NIFTI file (only stores the volumetric data)

        :param filename: target filename
        """
        if hasExt(filename, ('.dscalar.nii', '.dconn.nii', '.dlabel.nii')):
            self.to_cifti().to_filename(filename)
        elif hasExt(filename, ('.h5', '.hdf5', '.he2', '.he5')):
            import h5py
            with h5py.File(filename, 'w') as f:
                self.to_hdf5(f)
        elif hasExt(filename, ('.zarr', )):
            import zarr
            f = zarr.group(filename)
            self.to_hdf5(f)
        elif hasExt(filename, ('.gii', )):
            surfaces = np.unique(
                self.brain_model_axis.name[self.brain_model_axis.surface_mask])
            if len(surfaces) > 1:
                raise ValueError(
                    f"Can not write to GIFTI file as more than one surface has been defined: {surfaces}"
                )
            if len(surfaces) == 0:
                raise ValueError(
                    "Can not write to GIFTI file as no surface has been provided"
                )
            write_gifti(filename, [self.surface(surfaces[0])], surfaces[0])
        elif hasExt(filename, ('.nii.gz', '.nii')):
            self.volume().to_filename(filename)
        else:
            raise IOError(
                f"Extension of {filename} not recognized for NIFTI, GIFTI, or CIFTI file"
            )
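
A usage sketch, assuming greyordinates is an instance of the class defining to_filename above; the dispatch is driven entirely by the filename extension.

# Hypothetical calls; the filenames are placeholders.
greyordinates.to_filename('data.dscalar.nii')  # CIFTI file
greyordinates.to_filename('data.zarr')         # zarr group written via to_hdf5
greyordinates.to_filename('data.nii.gz')       # volumetric (NIFTI) data only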
Example #28
    def create_tile_directory(self, series, resolution, width, height):
        dimension_separator = '/'
        if not self.nested:
            dimension_separator = '.'
        self.zarr_store = FSStore(self.slide_directory,
                                  dimension_separator=dimension_separator,
                                  normalize_keys=True,
                                  auto_mkdir=True)
        self.zarr_group = zarr.group(store=self.zarr_store)
        self.zarr_group.attrs['bioformats2raw.layout'] = LAYOUT_VERSION

        # important to explicitly set the chunk size to 1 for non-XY dims
        # setting to None may cause all planes to be chunked together
        # ordering is TZCYX and hard-coded since Z and T are not present
        self.zarr_group.create_dataset(
            "%s/%s" % (str(series), str(resolution)),
            shape=(1, 1, 3, height, width),
            chunks=(1, 1, 1, self.tile_height, self.tile_width),
            dtype='B')
Example #29
    def _parse_optimus_bundle(self, bundle_dir, bundle_manifest_path):
        """
        Parses optimus analysis files into PSV rows for cell and expression Redshift tables.
        """

        keys = self._parse_keys(bundle_dir)

        file_uuid = [f for f in json.load(open(bundle_manifest_path))["files"]
                     if f["name"].endswith(".zattrs")][0]["uuid"]
        file_version = [f for f in json.load(open(bundle_manifest_path))["files"]
                        if f["name"].endswith(".zattrs")][0]["version"]

        emptydrops_result = {}
        with open(os.path.join(bundle_dir, "empty_drops_result.csv")) as emptydrops_file:
            reader = csv.DictReader(emptydrops_file)
            for row in reader:
                emptydrops_result[row["CellId"]] = {"total_umi_count": int(row["Total"]),
                                                    "is_cell": row["IsCell"] == "TRUE"}
        # read expression matrix from zarr
        store = DCPZarrStore(bundle_dir=bundle_dir)
        root = zarr.group(store=store)

        n_cells = root.expression_matrix.cell_id.shape[0]
        chunk_size = root.expression_matrix.cell_id.chunks[0]
        n_chunks = root.expression_matrix.cell_id.nchunks
        cell_lines = set()
        expression_lines = []

        logger.info(f"Optimus bundle has {n_cells} cells and {n_chunks} chunks.")
        for i in range(n_chunks):
            self._parse_optimus_chunk(
                keys=keys,
                file_uuid=file_uuid,
                file_version=file_version,
                root=root,
                start_row=chunk_size * i,
                end_row=(i + 1) * chunk_size if (i + 1) * chunk_size < n_cells else n_cells,
                cell_lines=cell_lines,
                expression_lines=expression_lines,
                emptydrops_result=emptydrops_result
            )

        return cell_lines, expression_lines
Example #30
    def __init__(self, path, transforms=None):
        self.path = path
        self.keys = ('images', 'labels')
        assert os.path.exists(path), 'file `{}` not exists!'.format(path)

        with zarr.LMDBStore(path) as store:
            zarr_db = zarr.group(store=store)
            self.num_examples = zarr_db['labels'].shape[0]
        self.datasets = None

        if transforms is None:
            transforms = {
                'labels':
                lambda v: torch.tensor(v, dtype=torch.long),
                'images':
                lambda v: torch.tensor(
                    (v - 127.5) / 127.5, dtype=torch.float32)
            }
        self.transforms = transforms
Example #31
    def __init__(self, h5f, xarray=False):
        # Open HDF5 file in read mode...
        lggr.debug(f'HDF5 file: {h5f}')
        lggr.debug(f'xarray: {xarray}')
        self._h5f = h5py.File(h5f, mode='r')
        self._xr = xarray

        self.store = {}
        self._zroot = zarr.group(store=self.store, overwrite=True)

        # Figure out HDF5 file's URI...
        if hasattr(h5f, 'name'):
            self._uri = h5f.name
        elif hasattr(h5f, 'url'):
            parts = urlparse(h5f.url())
            self._uri = urlunparse(parts[:3] + ('', ) * 3)
        else:
            self._uri = None
        lggr.debug(f'Source URI: {self._uri}')
Example #32
def convert_data_set(path, data_set, batch_size=1000):
    loader = DataLoader(
        data_set, batch_size=batch_size, shuffle=False, num_workers=4)

    num_examples = len(data_set)

    os.makedirs(path, exist_ok=True)
    with zarr.LMDBStore(path) as store:
        root = zarr.group(store=store, overwrite=True)
        images_set = root.zeros(
            'images',
            shape=(num_examples, 3, 96, 96),
            chunks=(1, None, None, None),
            dtype='u1')
        labels_set = root.zeros(
            'labels', shape=(num_examples, ), chunks=(1, ), dtype='u1')
        current_iter = 0
        for images, labels in tqdm(loader):
            size = images.shape[0]
            images_set[current_iter:current_iter + size] = images
            labels_set[current_iter:current_iter + size] = labels
            current_iter += size
Example #33
    def ImportDataTable(self, tableid):
              
        with self._calculationObject.LogHeader('Importing 2D datatable {0}'.format(tableid)):

            DQXUtils.CheckValidTableIdentifier(tableid)
            self._calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(self._datasetId))
            
            max_line_count = None
            if self._maxLineCount > 0:
                max_line_count = self._maxLineCount
                
            table_settings = self.getSettings(tableid)


            settingsFile, data_file = self._getDataFiles(tableid)
            zarr_file = zarr.DirectoryStore(data_file)
            zarr_file = zarr.group(zarr_file)
            #Check that the referenced tables exist and have the primary key specified.

            if table_settings['columnDataTable']:
                columnTableSettings = SettingsDataTable()
                columnTableSettings.loadFile(
                    os.path.join(self._datasetFolder, 'datatables', table_settings['columnDataTable'], 'settings'))
                columnProperties = [prop['id'] for prop in columnTableSettings['properties']]
#                if table_settings['columnIndexField'] not in columnProperties:
#                    raise Exception(table_settings['columnDataTable'] + ' does not have property ' + table_settings['columnIndexField'])
            if table_settings['rowDataTable']:
                rowTableSettings = SettingsDataTable()
                rowTableSettings.loadFile(
                    os.path.join(self._datasetFolder, 'datatables', table_settings['rowDataTable'], 'settings'))
                rowProperties = [prop['id'] for prop in rowTableSettings['properties']]
                if table_settings['rowIndexField'] not in rowProperties:
                    raise Exception(table_settings['rowDataTable'] + ' does not have property ' + table_settings['rowIndexField'])
    
            if table_settings['showInGenomeBrowser']:
                if not columnTableSettings['isPositionOnGenome']:
                    raise Exception(table_settings['columnDataTable'] + ' is not a genomic position based table (IsPositionOnGenome in config), but you have asked to use this table as a column index on a genome browseable 2D array.')

            if not self._importSettings['ConfigOnly']:
                #Insert an index column into the index tables
                if table_settings['columnDataTable']:
                    # Assume that index field has been created on import in LoadTable - it's much faster
                    # We could just run the command and ignore the error raised if it already exists
                    # sql = "ALTER TABLE `{0}` ADD `{1}_column_index` INT DEFAULT NULL;".format(table_settings['columnDataTable'], tableid)
                    # self._execSql(sql)
                    self._dao.insert2DIndexes(zarr_file, "column", tableid, table_settings,
                                              columnTableSettings['primKey'],
                                              max_line_count)

                if table_settings['rowDataTable']:
                    self._dao.insert2DIndexes(zarr_file, "row", tableid, table_settings,
                                              rowTableSettings['primKey'],
                                              None)

                ImpUtils.mkdir(os.path.join(self._config.getBaseDir(), '2D_data'))
                path_join = os.path.join(self._config.getBaseDir(), '2D_data', self._datasetId + '_' + tableid + '.zarr')
                try:
                    os.remove(path_join)
                except OSError:
                    pass
                print("Symlinking 2D data")
                os.symlink(data_file, path_join)
Example #34
File: core.py Project: elaeon/ML
    def open(self):
        if self.conn is None:
            self.conn = zarr.group()
            self.attrs = self.conn.attrs