Exemplo n.º 1
0
    def _compute(self):
        """Run the pipeline, writing every row into a temporary zarr store,
        then return the stored arrays ordered by their row index.

        Returns a list with one slot per entry of
        ``self.partition_row_counts``; a slot stays ``None`` when no array
        with that index is found in the store.
        """
        # temporary zarr group that the rows are materialized into
        label = gensym("asndarray")
        tmp_store = zarr.TempStore()
        group = zarr.open(tmp_store, mode="w")  # TODO: allow cloud storage

        def write_row(pair):
            # each element is an (index, row) pair; the array is named after the index
            idx, data = pair
            # drop any earlier copy in case we are being materialized again
            zarr.storage.rmdir(tmp_store, "/{}".format(idx))
            zarr.group(tmp_store).array(str(idx), data, chunks=False)

        # attach the labelled write step, then run the pipeline to completion
        self.pcollection | label >> beam.Map(write_row)
        self.pipeline.run().wait_until_finish()

        # read the arrays back, placing each one at its integer index
        rows_out = [None] * len(self.partition_row_counts)
        for key, stored in group.arrays():
            rows_out[int(key)] = stored

        return rows_out
Exemplo n.º 2
0
def load(savepath,
         lazy: bool = False,
         normalize_strings: bool = True,
         use_temp: bool = False):
    """Open a zipped zarr archive as an xarray dataset.

    Args:
        savepath: Location of the zip archive; passed to ``zarr.ZipStore``
            opened in read mode.
        lazy (bool, optional): If False, the dataset is loaded via
            ``dataset.load()`` and the backing store is closed before
            returning. If True, neither happens and the store stays open.
            Defaults to False.
        normalize_strings (bool, optional): Pass the dataset through
            ``_normalize_strings`` before returning it. Defaults to True.
        use_temp (bool, optional): Unpack zip to a temporary zarr store and
            open the dataset from there - potentially speeds up loading and
            allows overwriting existing zarr file. The zip is closed right
            after the copy. Defaults to False.

    Returns:
        The dataset produced by ``xr.open_zarr``, after the optional load
        and string normalization.
    """
    zarr_store = zarr.ZipStore(savepath, mode='r')
    if use_temp:
        try:
            dest = zarr.TempStore()
            zarr.copy_store(zarr_store, dest)
        finally:
            # release the zip handle even when creating or filling the
            # temporary store fails
            zarr_store.close()
        zarr_store = dest
    dataset = xr.open_zarr(zarr_store)
    if not lazy:
        dataset.load()
        zarr_store.close()

    if normalize_strings:
        dataset = _normalize_strings(dataset)

    return dataset
Exemplo n.º 3
0
 def _set_defaults(self, kwargs):
     """Fill in defaults, providing a temporary zarr store when none is given.

     The ``suffix``, ``prefix`` and ``dir`` entries are always removed from
     ``kwargs``; they are only used to build a ``zarr.TempStore`` when the
     caller did not supply a ``store`` entry. Returns the updated kwargs.
     """
     kwargs = super(ZarrTmpStorage, self)._set_defaults(kwargs)
     suffix = kwargs.pop('suffix', '.zarr')
     prefix = kwargs.pop('prefix', 'scikit_allel_')
     # noinspection PyShadowingBuiltins
     dir = kwargs.pop('dir', None)
     # Build the temporary store lazily: constructing it unconditionally
     # would leave behind an unused store whenever 'store' is already set.
     if 'store' not in kwargs:
         kwargs['store'] = zarr.TempStore(
             suffix=suffix, prefix=prefix, dir=dir)
     return kwargs
Exemplo n.º 4
0
 def test_write_zarr(self, adata, adata_dist):
     """Write ``adata_dist.X`` (after ``log1p``) into a temporary zarr store
     and check it matches ``log1p`` applied to the plain ``adata``."""
     log1p(adata_dist)
     store = zarr.TempStore()
     x_chunks = adata_dist.X.chunks
     # the regular anndata writer lays down the metadata
     adata.write_zarr(store, x_chunks)
     # chunks are forwarded to to_zarr only when X is not a dask array
     extra_args = () if isinstance(adata_dist.X, da.Array) else (x_chunks,)
     adata_dist.X.to_zarr(store.dir_path("X"), *extra_args)
     # read back as zarr (without using RDDs) and compare against adata.X
     roundtrip = ad.read_zarr(store)
     log1p(adata)
     npt.assert_allclose(roundtrip.X, adata.X)
Exemplo n.º 5
0
    def xd_and_temp_store(self, sc, x, xz, chunks, request):
        """Yield an ``(array, store)`` pair for the backend named by
        ``request.param``.

        Every backend except ``pywren_ndarray`` is paired with a fresh
        ``zarr.TempStore``. The pywren backend is paired with an S3-backed
        mapping inside a newly created bucket, which is removed once the
        generator is resumed after the yield. An unrecognized param yields
        nothing.
        """
        backend = request.param
        if backend == "direct_ndarray":
            arr = zappy.direct.from_ndarray(x.copy(), chunks)
            yield arr, zarr.TempStore()
        elif backend == "direct_zarr":
            arr = zappy.direct.from_zarr(xz)
            yield arr, zarr.TempStore()
        elif backend == "executor_ndarray":
            # the thread pool stays alive for as long as the consumer holds the pair
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
                arr = zappy.executor.from_ndarray(pool, x.copy(), chunks)
                yield arr, zarr.TempStore()
        elif backend == "executor_zarr":
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
                arr = zappy.executor.from_zarr(pool, xz)
                yield arr, zarr.TempStore()
        elif backend == "spark_ndarray":
            arr = zappy.spark.from_ndarray(sc, x.copy(), chunks)
            yield arr, zarr.TempStore()
        elif backend == "spark_zarr":
            arr = zappy.spark.from_zarr(sc, xz)
            yield arr, zarr.TempStore()
        elif backend == "beam_ndarray":
            beam_pipeline = beam.Pipeline(options=PipelineOptions())
            arr = zappy.beam.from_ndarray(beam_pipeline, x.copy(), chunks)
            yield arr, zarr.TempStore()
        elif backend == "beam_zarr":
            beam_pipeline = beam.Pipeline(options=PipelineOptions())
            arr = zappy.beam.from_zarr(beam_pipeline, xz)
            yield arr, zarr.TempStore()
        elif backend == "pywren_ndarray":
            import uuid

            import s3fs.mapping

            s3 = s3fs.S3FileSystem()
            # bucket name: fixed prefix plus a random hex suffix (uuid4 without dashes)
            bucket = "%s-%s" % ("zappy-test",
                                str(uuid.uuid4()).replace("-", ""))
            s3.mkdir(bucket)
            s3store = s3fs.mapping.S3Map(
                "%s/%s" % (bucket, "test.zarr"), s3=s3)
            pywren_executor = zappy.executor.PywrenExecutor()
            arr = zappy.executor.from_ndarray(pywren_executor, x.copy(),
                                              chunks)
            yield arr, s3store
            # teardown: delete the bucket and everything in it
            s3.rm(bucket, recursive=True)
Exemplo n.º 6
0
    def test_write_zarr(self, adata, adata_dist):
        """Write ``adata_dist.X`` (after ``log1p``) into a temporary zarr
        store and check it matches ``log1p`` applied to the plain ``adata``."""
        import dask.array as da
        import zarr

        log1p(adata_dist)
        store = zarr.TempStore()
        x_chunks = adata_dist.X.chunks
        first_axis = x_chunks[0]
        if isinstance(first_axis, tuple):
            # per-axis tuples: keep the first entry of axis 0, then append
            # the entries given for axis 1
            x_chunks = (first_axis[0], ) + x_chunks[1]
        # the regular anndata writer lays down the metadata
        adata.write_zarr(store, x_chunks)
        if not isinstance(adata_dist.X, da.Array):
            adata_dist.X.to_zarr(store.dir_path("X"), x_chunks)
        else:
            # dask arrays are written with overwrite=True and no chunks argument
            adata_dist.X.to_zarr(store.dir_path("X"), overwrite=True)
        # read back as zarr directly and compare against adata.X
        roundtrip = ad.read_zarr(store)
        log1p(adata)
        npt.assert_allclose(roundtrip.X, adata.X)
Exemplo n.º 7
0
    def test_run_batch_dim(self, dims, data, clock, parallel, scheduler):
        """Run a one-process model with ``batch_dim="batch"`` and check that
        the output variable equals the input data multiplied by two."""

        @xs.process
        class P:
            in_var = xs.variable(dims=[(), "x"])
            out_var = xs.variable(dims=[(), "x"], intent="out")
            idx_var = xs.index(dims="x")

            def initialize(self):
                self.idx_var = [0, 1]

            def run_step(self):
                # the only computation: double the input
                self.out_var = self.in_var * 2

        model = xs.Model({"p": P})

        setup_ds = xs.create_setup(
            model=model,
            clocks={"clock": [0, 1, 2]},
            input_vars={"p__in_var": (dims, data)},
            output_vars={"p__out_var": clock},
        )

        result_ds = setup_ds.xsimlab.run(
            model=model,
            batch_dim="batch",
            parallel=parallel,
            scheduler=scheduler,
            store=zarr.TempStore(),
        )

        # the expected array carries a clock coordinate only when a clock was given
        coords = {} if clock is None else {"clock": setup_ds["clock"]}

        expected = xr.DataArray(data, dims=dims, coords=coords) * 2
        xr.testing.assert_equal(result_ds["p__out_var"], expected)
Exemplo n.º 8
0
def offcore_array(
    shape: Union[Tuple[int, ...], Generator[int, None, None]],
    dtype: numpy.dtype,
    force_memmap: bool = False,
    zarr_allowed: bool = False,
    no_memmap_limit: bool = True,
    max_memory_usage_ratio: float = 0.9,
):
    """
    Instantiates an array of given shape and dtype, possibly in 'off-core'
    fashion, i.e. not in main memory.

    Three strategies are tried in this order:

    1. a regular ``numpy.zeros`` array, when ``force_memmap`` is False and the
       requested size is below ``max_memory_usage_ratio`` times the total
       (physical + swap) memory;
    2. a ``numpy.memmap`` over a temporary file created in
       ``OffCore.memmap_directory``, when ``no_memmap_limit`` is True;
    3. a zarr array backed by a ``zarr.TempStore``, when ``zarr_allowed`` is
       True.

    Parameters
    ----------
    shape
        Shape of the array. Any iterable of ints (including a generator) or a
        single int is accepted; it is materialized into a tuple first.
    dtype
        Element type of the array.
    force_memmap
        Skip the in-memory strategy even if there is enough memory.
    zarr_allowed
        Allow the zarr-backed strategy.
    no_memmap_limit
        Allow the memory-mapped strategy.
    max_memory_usage_ratio
        Fraction of the total memory that an in-memory array may occupy.

    Returns
    -------
    The array created by the first applicable strategy.

    Raises
    ------
    MemoryError
        If none of the three strategies is applicable.
    """

    # Materialize the shape once: a generator would otherwise be exhausted by
    # the size computation before the array itself is created.
    try:
        shape = tuple(shape)
    except TypeError:
        # a bare integer is a valid 1-D shape for numpy; keep accepting it
        shape = (shape, )

    with lsection(f"Array of shape: {shape} and dtype: {dtype} requested"):
        # Use Python integers for the byte count so that very large shapes
        # cannot overflow a fixed-width numpy integer.
        size_in_bytes = numpy.dtype(dtype).itemsize
        for dim in shape:
            size_in_bytes *= int(dim)
        lprint(f'Array requested will be {(size_in_bytes / 1E6)} MB.')

        total_physical_memory_in_bytes = psutil.virtual_memory().total
        total_swap_memory_in_bytes = psutil.swap_memory().total

        total_mem_in_bytes = total_physical_memory_in_bytes + total_swap_memory_in_bytes
        lprint(
            f'There is {int(total_physical_memory_in_bytes / 1E6)} MB of physical memory'
        )
        lprint(
            f'There is {int(total_swap_memory_in_bytes / 1E6)} MB of swap memory'
        )
        lprint(f'There is {int(total_mem_in_bytes / 1E6)} MB of total memory')

        is_enough_total_memory = (size_in_bytes <
                                  max_memory_usage_ratio * total_mem_in_bytes)

        if not force_memmap and is_enough_total_memory:
            lprint(
                'There is enough physical+swap memory -- we do not need to use a mem mapped array or zarr-backed array.'
            )
            array = numpy.zeros(shape, dtype=dtype)

        elif no_memmap_limit:
            lprint(
                'There is not enough physical+swap memory -- we will use a mem mapped array.'
            )
            temp_file = tempfile.NamedTemporaryFile(
                dir=OffCore.memmap_directory)
            lprint(
                f'The temporary memory mapped file is at: {temp_file.name} (but you might not be able to see it!)'
            )
            array = numpy.memmap(temp_file,
                                 dtype=dtype,
                                 mode='w+',
                                 shape=shape)

        elif zarr_allowed:
            lprint(
                'There is not enough physical+swap memory -- we will use a zarr-backed array.'
            )
            import zarr

            # NOTE(review): the positional argument of TempStore is presumably
            # its suffix -- confirm against the zarr version in use.
            array = zarr.create(shape=shape,
                                dtype=dtype,
                                store=zarr.TempStore("output.zarr"))

        else:
            # Previously this fell through to `return array` with `array`
            # unbound; fail with an explicit, descriptive error instead.
            raise MemoryError(
                f'Cannot allocate array of shape {shape} and dtype {dtype}: '
                'the in-memory strategy was not selected and neither memory '
                'mapping nor zarr is allowed.')

        return array
Exemplo n.º 9
0
    def write_image_by_tile(
        self,
        image_name: str,
        output_dir: Union[Path, str] = "",
        write_pyramid: bool = True,
        compression: Optional[str] = "default",
        zarr_temp_dir: Optional[Union[str, Path]] = None,
    ) -> Optional[str]:
        """
        Write images to OME-TIFF from temp zarr store with data.

        The tiles are first written to a temporary zarr store, then streamed
        from it into the TIFF file. The temporary store is always cleared
        before returning, whether or not writing succeeded.

        Parameters
        ----------
        image_name: str
            file path stem of the image to be written
        output_dir: Union[str,Path]
            directory where image is to be written
        write_pyramid: bool
            whether to write a pyramid or single layer
        compression: str
            Use compression. "default" will be lossless "deflate" for non-rgb images
            and "jpeg" for RGB images
        zarr_temp_dir: Path or str
            Directory to store the temporary zarr data
            (mostly used for debugging)

        Returns
        -------
        output_file_name: str or None
            Path to written image file, or None when an exception occurred
            (the exception is printed rather than raised).
        """
        zstr = zarr.TempStore(dir=zarr_temp_dir)
        # Stays None until the tiles have been written, so that the cleanup
        # below never touches an unbound name when that first step fails.
        resample_zarray = None
        try:
            resample_zarray = self.write_tiles_to_zarr_store(zstr)
            output_file_name = str(Path(output_dir) / f"{image_name}.ome.tiff")

            if compression == "default":
                print("using default compression")
                compression = "jpeg" if self.reg_image.is_rgb else "deflate"

            # the third element (output tile shape) is not needed here
            (
                n_pyr_levels,
                subifds,
                _,
                omexml,
            ) = self._prepare_image_info(
                image_name, write_pyramid=write_pyramid
            )

            print(f"saving to {output_file_name}")

            dask_image = da.from_zarr(resample_zarray)
            options = dict(
                tile=self.tile_shape,
                compression=compression,
                photometric="rgb" if self.reg_image.is_rgb else "minisblack",
                metadata=None,
            )
            with TiffWriter(output_file_name, bigtiff=True) as tif:
                if self.reg_image.is_rgb:
                    # RGB: a single base layer followed by its pyramid levels
                    print(
                        f"writing base layer RGB - shape: {dask_image.shape}"
                    )
                    tile_iterator = self._transformed_tile_generator(
                        dask_image, 0
                    )
                    tif.write(
                        tile_iterator,
                        subifds=subifds,
                        description=omexml,
                        shape=dask_image.shape,
                        dtype=dask_image.dtype,
                        **options,
                    )

                    if write_pyramid:
                        for pyr_idx in range(1, n_pyr_levels):
                            sub_res = compute_sub_res(
                                dask_image,
                                pyr_idx,
                                self.tile_shape[0],
                                self.reg_image.is_rgb,
                                self.reg_image.im_dtype,
                            )
                            print(
                                f"pyr {pyr_idx} : RGB-shape: {sub_res.shape}"
                            )

                            sub_res_tile_iterator = (
                                self._transformed_tile_generator(sub_res, 0)
                            )
                            tif.write(
                                sub_res_tile_iterator,
                                shape=sub_res.shape,
                                dtype=self.reg_image.im_dtype,
                                **options,
                                subfiletype=1,
                            )
                else:
                    # non-RGB: one base layer (plus pyramid levels) per channel
                    for channel_idx in range(self.reg_image.n_ch):

                        # the OME-XML description is attached to the first channel only
                        description = omexml if channel_idx == 0 else None
                        print(
                            f"writing channel {channel_idx} - shape: {dask_image.shape[1:]}"
                        )
                        tile_iterator = self._transformed_tile_generator(
                            dask_image, channel_idx
                        )

                        tif.write(
                            tile_iterator,
                            subifds=subifds,
                            description=description,
                            shape=dask_image.shape[1:],
                            dtype=dask_image.dtype,
                            **options,
                        )
                        if write_pyramid:
                            for pyr_idx in range(1, n_pyr_levels):
                                sub_res = compute_sub_res(
                                    dask_image,
                                    pyr_idx,
                                    self.tile_shape[0],
                                    self.reg_image.is_rgb,
                                    self.reg_image.im_dtype,
                                )

                                sub_res_tile_iterator = (
                                    self._transformed_tile_generator(
                                        sub_res, channel_idx
                                    )
                                )

                                tif.write(
                                    sub_res_tile_iterator,
                                    shape=sub_res.shape[1:],
                                    dtype=dask_image.dtype,
                                    **options,
                                    subfiletype=1,
                                )
            return output_file_name

        except Exception as e:
            # Failures are reported on stdout and signalled to the caller by
            # a None return value; they are not propagated.
            print(e)
            return None

        finally:
            # Always clear the temporary storage. If the tiles were never
            # written, fall back to the store created above.
            store = zstr if resample_zarray is None else resample_zarray.store
            try:
                store.clear()
            except FileNotFoundError:
                pass
def generate_coalescent_synthetic_data(num_samples=1000,
                                       num_bases=1e7,
                                       Ne=1e4,
                                       mu=3.5e-9,
                                       rrate=1e-8,
                                       ploidy=2,
                                       seed=57):
    """
        Simulate a tree sequence with msprime and store its genotypes as the
        'calldata/GT' array of the zarr directory store at ZARR_PATH.

        Genotypes are first written variant by variant into an uncompressed
        scratch array held in a temporary store under './', then copied in
        one assignment into the compressed destination array. The scratch
        store is cleared before the function returns or raises.

        Args:
            num_samples: number of samples; the simulation uses
                num_samples * ploidy as its sample size.
            num_bases: passed to msprime as the sequence length.
            Ne: passed to msprime as Ne.
            mu: passed to msprime as the mutation rate.
            rrate: passed to msprime as the recombination rate.
            ploidy: size of the last axis of the genotype array.
            seed: passed to msprime as the random seed.

        Function credits: Nick Harding
        Reference URL: https://hardingnj.github.io/2017/08/23/power-of-correct-tools.html
    """
    tree_sequence = msprime.simulate(sample_size=num_samples * ploidy,
                                     Ne=Ne,
                                     length=num_bases,
                                     recombination_rate=rrate,
                                     mutation_rate=mu,
                                     random_seed=seed,
                                     model="dtwf")

    # Query the mutation count once; it sizes the arrays and the progress bar
    num_mutations = tree_sequence.get_num_mutations()
    print("Simulated ", num_mutations, "mutations")

    print("Creating Zarr data store root")
    store = zarr.DirectoryStore(ZARR_PATH)
    root = zarr.group(store=store, overwrite=True)

    print('Creating Zarr Array')
    compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.AUTOSHUFFLE)
    z_shape = (num_mutations, num_samples, ploidy)
    z_chunks = (VARIANTS_PER_CHUNK, SAMPLES_PER_CHUNK, PLOIDY_PER_CHUNK)
    z = root.empty('calldata/GT',
                   shape=z_shape,
                   chunks=z_chunks,
                   dtype='i1',
                   compressor=compressor)

    print('Creating temporary Zarr Array for holding data')
    temp_chunks = (TEMP_VARIANTS_PER_CHUNK, TEMP_SAMPLES_PER_CHUNK,
                   TEMP_PLOIDY_PER_CHUNK)
    temp_store = zarr.TempStore(dir='./')
    try:
        temp_root = zarr.group(store=temp_store, overwrite=True)
        temp_z = temp_root.empty('calldata/GT',
                                 shape=z_shape,
                                 chunks=temp_chunks,
                                 dtype='i1',
                                 compressor=None)

        num_variants, num_samples, num_ploidy = z.shape

        print('Num Samples: {}'.format(num_samples))
        print('Num Variants: {}'.format(num_variants))
        print('Ploidy: {}'.format(num_ploidy))
        print("Variation rate: {}".format(num_variants / num_bases))

        bar = ProgressBar(num_mutations, max_width=80)
        print("Pulling variant data...")
        for variant_counter, variant in enumerate(tree_sequence.variants()):
            # redraw the progress bar in place on the same terminal line
            bar.numerator = variant_counter
            print(bar, end='\r')
            sys.stdout.flush()

            var = variant.genotypes.reshape((num_samples, ploidy))
            temp_z[variant.index, :, :] = var

        # Store data in final data store
        z[:, :, :] = temp_z
    finally:
        # Remove the scratch data right away, on success and on failure
        temp_store.clear()

    print('Done.\n')
    print(z.info)