Example #1
def execute(array, write_path=None, **kwargs):
    """
    """

    with ClusterWrap.cluster(**kwargs) as cluster:

        # if user wants to write to disk
        if write_path is not None:
            compressor = Blosc(
                cname='zstd',
                clevel=4,
                shuffle=Blosc.BITSHUFFLE,
            )
            zarr_disk = zarr.open(
                write_path,
                'w',
                shape=array.shape,
                chunks=array.chunksize,
                dtype=array.dtype,
                compressor=compressor,
            )
            to_zarr(array, zarr_disk)
            return zarr_disk

        # otherwise user wants result returned to local process
        return array.compute()
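
The example above (and many of those below) follows the same pattern: pre-create a zarr array whose shape, chunks, and dtype match the dask array, then stream the blocks into it with da.to_zarr. A minimal self-contained sketch of that pattern, assuming zarr 2.x with the numcodecs Blosc compressor (the store name is illustrative):

import dask.array as da
import zarr
from numcodecs import Blosc

arr = da.random.random((1024, 1024), chunks=(256, 256))
compressor = Blosc(cname='zstd', clevel=4, shuffle=Blosc.BITSHUFFLE)
store = zarr.open(
    'example.zarr',
    'w',
    shape=arr.shape,
    chunks=arr.chunksize,
    dtype=arr.dtype,
    compressor=compressor,
)
da.to_zarr(arr, store)  # blocks are written to disk in parallel
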
Example #2
File: ndarray.py Project: vt100/intake
    def _persist(source, path, component=None, storage_options=None, **kwargs):
        """Save array to local persistent store

        Makes a zarr dataset out of the data using dask.
        This then becomes a data entry in the persisted datasets catalog.
        Only works locally for the moment.

        Parameters
        ----------
        source: a DataSource instance to save
        path: str
            Location of the zarr store to write to
        component, storage_options: passed on to dask.array.to_zarr
        kwargs: passed on to zarr array creation
        """
        from dask.array import to_zarr, from_array
        from ..source.zarr import ZarrArraySource
        try:
            arr = source.to_dask()
        except NotImplementedError:
            arr = from_array(source.read(), chunks=-1).rechunk('auto')
        to_zarr(arr,
                path,
                component=component,
                storage_options=storage_options,
                **kwargs)

        source = ZarrArraySource(path, storage_options, component)
        return source
Example #3
def compose_position_fields(fields,
                            spacing,
                            output,
                            blocksize=[256,] * 3,
                            displacement=None):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(fields[0].shape[:-1]) /
                             blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays
        fields_da = da.stack(
            [da.from_array(f, chunks=blocksize + [3,]) for f in fields])

        # accumulate
        composed = da.sum(fields_da, axis=0)

        # modify for multiple position fields
        if displacement is not None:
            raise NotImplementedError(
                "composing displacement fields not implemented yet")
        else:
            grid = position_grid_dask(composed.shape[:3],
                                      blocksize) * spacing.astype(np.float32)
            composed = composed - (len(fields) - 1) * grid

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output,
            'w',
            shape=composed.shape,
            chunks=composed.chunksize,
            dtype=composed.dtype,
            compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
Example #4
def _to_zarr(  # type: ignore[no-untyped-def]
    arr,
    url,
    component=None,
    storage_options=None,
    overwrite=False,
    compute=True,
    return_stored=False,
    attrs=None,
    **kwargs,
):
    """Extension of dask.array.core.to_zarr that can set attributes on the resulting Zarr array,
    in the same Dask operation.
    """

    # call Dask version with compute=False just to check preconditions
    da.to_zarr(
        arr,
        url,
        component=component,
        storage_options=storage_options,
        overwrite=overwrite,
        compute=False,
        return_stored=return_stored,
        **kwargs,
    )

    storage_options = storage_options or {}
    if isinstance(url, str):
        mapper = get_mapper(url, **storage_options)
    else:
        # assume the object passed is already a mapper
        mapper = url  # pragma: no cover
    chunks = [c[0] for c in arr.chunks]
    z = dask.delayed(_zarr_create_with_attrs)(
        shape=arr.shape,
        chunks=chunks,
        dtype=arr.dtype,
        store=mapper,
        path=component,
        overwrite=overwrite,
        attrs=attrs,
        **kwargs,
    )
    return arr.store(z,
                     lock=False,
                     compute=compute,
                     return_stored=return_stored)
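
A hypothetical call of the helper above (the store path and attribute names are made up for illustration, and the _zarr_create_with_attrs function it delegates to is assumed to be defined elsewhere in that project):

# hypothetical usage sketch of _to_zarr
arr = da.zeros((64, 64, 64), chunks=(32, 32, 32), dtype='uint16')
stored = _to_zarr(
    arr,
    'volume.zarr',                            # illustrative store path
    component='raw',
    attrs={'voxel_size': [1.0, 1.0, 1.0]},    # attributes written alongside the array
    overwrite=True,
)
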
Example #5
def global_affine_to_position_field(shape,
                                    spacing,
                                    affine,
                                    output,
                                    blocksize=[256,] * 3):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # compute affine transform as position coordinates, lazy dask arrays
        grid = position_grid_dask(shape, blocksize) * spacing.astype(
            np.float32)
        coords = affine_to_grid_dask(affine, grid)
        coords = da.around(coords, decimals=2)

        # write in parallel as 4D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        coords_disk = zarr.open(
            output,
            'w',
            shape=coords.shape,
            chunks=tuple(blocksize + [3,]),
            dtype=coords.dtype,
            compressor=compressor,
        )
        da.to_zarr(coords, coords_disk)

        # return pointer to zarr file
        return coords_disk
Example #6
File: ndarray.py Project: zillow/intake
 def _data_to_source(arr,
                     path,
                     component=None,
                     storage_options=None,
                     **kwargs):
     from dask.utils import is_arraylike
     from dask.array import to_zarr, from_array
     from ..source.zarr import ZarrArraySource
     if not is_arraylike(arr):
         raise NotImplementedError
     if not hasattr(arr, 'npartitions'):
         arr = from_array(arr, chunks='auto')
     to_zarr(arr,
             path,
              component=component,
             storage_options=storage_options,
             **kwargs)
     source = ZarrArraySource(path, storage_options, component)
     return source
Example #7
def merge_arrays(data_path):
    numpyload = delayed(numpy.load, pure=True)

    filelist = os.listdir(data_path)

    def filenum(x):
        return int(x[8:-4])

    filelist = sorted(filelist, key=filenum)

    array_list = []

    for symbol in filelist:
        arr_name = data_path + symbol
        arr_d = numpyload(arr_name)
        arr = da.from_delayed(arr_d, (256, 256, 256), float)
        array_list.append(arr)
        print(arr_name)

    filelist = os.listdir(data_path[:-1] + "_2")
    filelist = sorted(filelist, key=filenum)

    for symbol in filelist:
        arr_name = data_path[:-1] + "_2/" + symbol  # files live in the *_2 directory listed above
        arr_d = numpyload(arr_name)
        arr = da.from_delayed(arr_d, (256, 256, 256), float)
        array_list.append(arr)
        print(arr_name)

    z = da.stack(array_list)

    z = z.rechunk((256, 256, 256, 1))
    # da.to_npy_stack(data_path+'zarr_data', z)
    # m = z[:][:][:][199]
    # client = Client('128.104.222.103:8786')
    # re = client.compute(m)
    da.to_zarr(z, data_path + 'zarr_data_full')
Example #8
def resample_frames(
    frames,
    frames_spacing,
    transforms,
    write_path,
    mask=None,
    time_stride=1,
    compression_level=4,
    cluster_kwargs={},
):
    """
    """

    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # create dask array of all frames
        if csio.testPathExtensionForHDF5(frames['suffix']):
            frames_data = csio.daskArrayBackedByHDF5(
                frames['folder'],
                frames['prefix'],
                frames['suffix'],
                frames['dataset_path'],
                stride=time_stride,
            )
        elif csio.testPathExtensionForSTACK(frames['suffix']):
            frames_data = csio.daskArrayBackedBySTACK(
                frames['folder'],
                frames['prefix'],
                frames['suffix'],
                frames['dtype'],
                frames['shape'],
                stride=time_stride,
            )
        compute_frames = frames_data.shape[0]

        # wrap transforms as dask array
        # extra dimension to match frames_data ndims
        if len(transforms.shape) == 3:
            transforms = transforms[::time_stride, None, :, :]
        elif len(transforms.shape) == 2:
            transforms = transforms[::time_stride, None, None, :]
        transforms_d = da.from_array(transforms,
                                     chunks=(1, ) + transforms[0].shape)

        # wrap mask
        mask_d = None
        if mask is not None:
            mask_sh, frame_sh = mask.shape, frames_data.shape[1:]
            if mask_sh != frame_sh:
                mask = zoom(mask, np.array(frame_sh) / mask_sh, order=0)
            mask_d = cluster.client.scatter(mask, broadcast=True)

        # wrap transform function
        def wrapped_apply_transform(mov, t, mask_d=None):
            mov = mov.squeeze()
            t = t.squeeze()

            # just an affine matrix
            transform_list = [t,]

            # affine plus bspline
            if len(t.shape) == 1:
                transform_list = [t[:16].reshape((4, 4)), t[16:]]

            # apply transform(s)
            aligned = apply_transform(
                mov,
                mov,
                frames_spacing,
                frames_spacing,
                transform_list=transform_list,
            )
            if mask_d is not None:
                aligned = aligned * mask_d
            return aligned[None, ...]

        # apply transform to all frames
        frames_aligned = da.map_blocks(
            wrapped_apply_transform,
            frames_data,
            transforms_d,
            mask_d=mask_d,
            dtype=np.uint16,
            chunks=[1,] + list(frames_data.shape[1:]),
        )

        # write in parallel as 4D array to zarr file
        compressor = Blosc(
            cname='zstd',
            clevel=compression_level,
            shuffle=Blosc.BITSHUFFLE,
        )
        aligned_disk = zarr.open(write_path,
                                 'w',
                                 shape=frames_aligned.shape,
                                 chunks=[1,] + list(frames_data.shape[1:]),
                                 dtype=frames_aligned.dtype,
                                 compressor=compressor)
        da.to_zarr(frames_aligned, aligned_disk)

        # return reference to zarr store
        return aligned_disk
Example #9
def write_zarr(uri, data, path="/"):
    import dask.array as da

    da.to_zarr(data, uri, component=path, overwrite=True)

    return uri
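
Arrays written this way can be re-opened lazily with da.from_zarr using the same component path. A small sketch (store name and component are illustrative):

import dask.array as da

uri = write_zarr('store.zarr', da.arange(1000, chunks=100), path='series/raw')
x = da.from_zarr(uri, component='series/raw')  # lazy view of the stored data
print(x.shape, x.chunks)
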
Example #10
File: base.py Project: yjx520/simpeg
def dask_linear_operator(self):
    self.nC = self.modelMap.shape[0]

    n_data_comp = len(self.survey.components)
    components = np.array(list(self.survey.components.keys()))
    active_components = np.hstack(
        [np.c_[values] for values in self.survey.components.values()]
    ).tolist()

    row = delayed(self.evaluate_integral, pure=True)
    rows = [
        array.from_delayed(
            row(receiver_location, components[component]),
            dtype=np.float32,
            shape=(n_data_comp, self.nC),
        )
        for receiver_location, component in zip(
            self.survey.receiver_locations.tolist(), active_components
        )
    ]
    stack = array.vstack(rows)

    # Chunking options
    if self.chunk_format == "row" or self.store_sensitivities == "forward_only":
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        # Autochunking by rows is faster and more memory efficient for
        # sensitivity and forward calculations on very large problems
        stack = stack.rechunk({0: "auto", 1: -1})

    elif self.chunk_format == "equal":
        # Manual chunks for equal number of blocks along rows and columns.
        # Optimal for Jvec and Jtvec operations
        row_chunk, col_chunk = compute_chunk_sizes(*stack.shape, self.max_chunk_size)
        stack = stack.rechunk((row_chunk, col_chunk))
    else:
        # Auto chunking by columns is faster for Inversions
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        stack = stack.rechunk({0: -1, 1: "auto"})

    if self.store_sensitivities == "disk":
        sens_name = self.sensitivity_path + "sensitivity.zarr"
        if os.path.exists(sens_name):
            kernel = array.from_zarr(sens_name)
            if np.all(
                np.r_[
                    np.any(np.r_[kernel.chunks[0]] == stack.chunks[0]),
                    np.any(np.r_[kernel.chunks[1]] == stack.chunks[1]),
                    np.r_[kernel.shape] == np.r_[stack.shape],
                ]
            ):
                # Check that loaded kernel matches supplied data and mesh
                print("Zarr file detected with same shape and chunksize ... re-loading")
                return kernel
        else:
            print("Writing Zarr file to disk")
            with ProgressBar():
                print("Saving kernel to zarr: " + sens_name)
                kernel = array.to_zarr(
                    stack, sens_name, compute=True, return_stored=True, overwrite=True
                )
    elif self.store_sensitivities == "forward_only":
        with ProgressBar():
            print("Forward calculation: ")
            pred = (stack @ self.model).compute()
        return pred
    else:
        print(stack.chunks)
        with ProgressBar():
            print("Computing sensitivities to local ram")
            kernel = array.asarray(stack.compute())
    return kernel
Example #11
def convert(resized, target_array):
    da.to_zarr(resized, target_array)
Example #12
def motionCorrect(
    folder,
    prefix,
    suffix,
    fixed,
    fixed_vox,
    moving_vox,
    write_path,
    dataset_path=None,
    distributed_state=None,
    sigma=7,
    transforms_dir=None,
    **kwargs,
):
    """
    """

    # set up the distributed environment
    ds = distributed_state
    if distributed_state is None:
        ds = csd.distributedState()
        # writing large compressed chunks locks GIL for a long time
        ds.modifyConfig({
            'distributed.comm.timeouts.connect': '60s',
            'distributed.comm.timeouts.tcp': '180s',
        })
        ds.initializeLSFCluster(job_extra=["-P scicompsoft"])
        ds.initializeClient()

    # create (lazy) dask bag from all frames
    frames = csio.daskBagOfFilePaths(folder, prefix, suffix)
    nframes = frames.npartitions

    # scale cluster carefully
    if 'max_workers' in kwargs.keys():
        max_workers = kwargs['max_workers']
    else:
        max_workers = 1250
    ds.scaleCluster(njobs=min(nframes, max_workers))

    # align all
    dfixed = delayed(fixed)
    dfixed_vox = delayed(fixed_vox)
    dmoving_vox = delayed(moving_vox)
    ddataset_path = delayed(dataset_path)
    params = frames.map(
        lambda b, w, x, y, z: rigidAlign(w, b, x, y, dataset_path=z),
        w=dfixed,
        x=dfixed_vox,
        y=dmoving_vox,
        z=ddataset_path,
    ).compute()
    params = np.array(list(params))

    # (weak) outlier removal and smoothing
    params = percentile_filter(params, 50, footprint=np.ones((3, 1)))
    params = gaussian_filter1d(params, sigma, axis=0)

    # write transforms as matrices
    if transforms_dir is not None:
        paths = list(frames)
        for ind, p in enumerate(params):
            transform = _parametersToRigidMatrix(p)
            basename = os.path.splitext(os.path.basename(paths[ind]))[0]
            path = os.path.join(transforms_dir, basename) + '_rigid.mat'
            np.savetxt(path, transform)

    # apply transforms to all images
    params = db.from_sequence(params, npartitions=nframes)
    transformed = frames.map(
        lambda b, x, y, z: applyTransform(b, x, y, dataset_path=z),
        x=dmoving_vox,
        y=params,
        z=ddataset_path,
    ).to_delayed()

    # convert to a (lazy) 4D dask array
    sh = transformed[0][0].shape.compute()
    dd = transformed[0][0].dtype.compute()
    arrays = [da.from_delayed(t[0], sh, dtype=dd) for t in transformed]
    transformed = da.stack(arrays, axis=0)

    # write in parallel as 4D array to zarr file
    compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
    transformed_disk = zarr.open(write_path,
                                 'w',
                                 shape=transformed.shape,
                                 chunks=(256, 10, 256, 256),
                                 dtype=transformed.dtype,
                                 compressor=compressor)
    da.to_zarr(transformed, transformed_disk)

    # release resources
    if distributed_state is None:
        ds.closeClient()

    # return reference to data on disk
    return transformed_disk
Example #13
def distributed_deltafoverf(
    zarr_path,
    window_size,
    batch_size,
    write_path,
    compression_level=4,
    cluster_kwargs={},
):
    """
    """

    # launch cluster
    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # lazy load zarr to get metadata
        metadata = zarr.open(zarr_path, 'r')

        # get block start indices
        start_indices, start_index = [], 0
        while start_index + window_size < metadata.shape[0]:
            start_indices.append(start_index)
            start_index = start_index + batch_size - window_size

        # convert to dask array
        start_indices_da = da.from_array(start_indices, chunks=(1, ))

        # wrap deltafoverf function
        def wrapped_deltafoverf(index):
            zarr_file = zarr.open(zarr_path, 'r')
            data = zarr_file[index[0]:index[0] + batch_size]
            return deltafoverf(data, window_size)

        # map function to each block
        dff = da.map_blocks(
            wrapped_deltafoverf,
            start_indices_da,
            dtype=np.float16,
            new_axis=list(range(1, metadata.ndim)),
            chunks=(batch_size - window_size, ) + metadata.chunks[1:],
        )

        # ensure the correct shape and rechunk for faster writing
        dff = dff[:metadata.shape[0] - window_size]
        dff = dff.rechunk((1, ) + metadata.chunks[1:])

        # persist dff before writing to zarr, prevents RAM conflicts
        dff = dff.persist()

        # write to output zarr
        compressor = Blosc(
            cname='zstd',
            clevel=compression_level,
            shuffle=Blosc.BITSHUFFLE,
        )
        dff_disk = zarr.open(
            write_path,
            'w',
            shape=dff.shape,
            chunks=metadata.chunks,
            dtype=dff.dtype,
            compressor=compressor,
        )
        da.to_zarr(dff, dff_disk)

        # return reference to zarr store
        return dff_disk
Example #14
def apply_position_field(
        mov,
        mov_spacing,
        fix,
        fix_spacing,
        transform,
        output,
        blocksize=[256,] * 3,
        order=1,
        transform_spacing=None,
        transpose=[False,] * 3,
        depth=(32, 32, 32),
):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(mov.shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # determine mov/fix relative chunking
        m_blocksize = blocksize * fix_spacing / mov_spacing
        m_blocksize = list(np.round(m_blocksize).astype(np.int16))
        m_depth = depth * fix_spacing / mov_spacing
        m_depth = tuple(np.round(m_depth).astype(np.int16))

        # determine trans/fix relative chunking
        if transform_spacing is not None:
            t_blocksize = blocksize * fix_spacing / transform_spacing
            t_blocksize = list(np.round(t_blocksize).astype(np.int16))
            t_depth = depth * fix_spacing / transform_spacing
            t_depth = tuple(np.round(t_depth).astype(np.int16))
        else:
            t_blocksize = blocksize
            t_depth = depth

        # wrap objects as dask arrays
        fix_da = da.from_array(fix)
        if transpose[0]:
            fix_da = fix_da.transpose(2, 1, 0)

        mov_da = da.from_array(mov)
        if transpose[1]:
            mov_da = mov_da.transpose(2, 1, 0)
            block_grid = block_grid[::-1]

        transform_da = da.from_array(transform)
        if transpose[2]:
            transform_da = transform_da.transpose(2, 1, 0, 3)
            transform_da = transform_da[..., ::-1]

        # chunk dask arrays
        fix_da = da.reshape(fix_da, fix_da.shape + (1,)).rechunk(
            tuple(blocksize + [1,]))
        mov_da = da.reshape(mov_da, mov_da.shape + (1,)).rechunk(
            tuple(m_blocksize + [1,]))
        transform_da = transform_da.rechunk(tuple(t_blocksize + [3,]))

        # put transform in voxel units
        transform_da = transform_da / mov_spacing

        # map the interpolate function with overlaps
        # TODO: depth should be computed automatically from transform maximum?
        d = [depth + (0, ), m_depth + (0, ), t_depth + (0, )]
        aligned = da.map_overlap(
            interpolate_image_dask,
            fix_da,
            mov_da,
            transform_da,
            blocksize=m_blocksize,
            margin=m_depth,
            depth=d,
            boundary=0,
            dtype=np.uint16,
            align_arrays=False,
        )

        # remove degenerate dimension
        aligned = da.reshape(aligned, aligned.shape[:-1])

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        aligned_disk = zarr.open(
            output,
            'w',
            shape=aligned.shape,
            chunks=aligned.chunksize,
            dtype=aligned.dtype,
            compressor=compressor,
        )
        da.to_zarr(aligned, aligned_disk)

        # return pointer to zarr file
        return aligned_disk
Example #15
def process(directory,
            threshold=6,
            integrate=False,
            counting=False,
            hdr=None,
            mean_e=256,
            nav_shape=None,
            chunk_shape=None,
            verbose=False):
    if verbose:
        _logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(message)s \n')
        handler.setFormatter(formatter)
        _logger.addHandler(handler)

    _logger.info(msg="\n\n .SEQ Processor Application (and Counting)...\n"
                 "Created by: Carter Francis ([email protected])\n"
                 "Updated 2021-06-18\n"
                 "------------------\n")
    _logger.info(msg="Version:" + __version__)
    tick = time.time()
    file_dict = get_files(folder=directory)
    for key in list(file_dict):
        if len(file_dict[key]) == 0:
            # drop entries with no matching file
            file_dict.pop(key)
        else:
            file_dict[key] = file_dict[key][0]
    if "top" in file_dict and "bottom" in file_dict:
        file_dict.pop("seq")
        data_dict = cel_file_reader(**file_dict,
                                    nav_shape=nav_shape,
                                    chunk_shape=chunk_shape,
                                    lazy=True)
    elif "seq" in file_dict:
        data_dict = file_reader(**file_dict,
                                nav_shape=nav_shape,
                                chunk_shape=chunk_shape,
                                lazy=True)
    if hdr is not None:
        hdr = hs.load(hdr).data
    else:
        hdr = None
    if hdr is None and integrate is False:
        dtype = bool
    else:
        dtype = np.float32

    if counting:
        data_dict["data"] = data_dict["data"].map_blocks(
            _counting_filter_cpu,
            threshold=threshold,
            integrate=integrate,
            hdr_mask=hdr,
            method="maximum",
            mean_electron_val=mean_e,
            dtype=dtype)

    _logger.info(data_dict)
    sig = dict2signal(data_dict, lazy=True)

    _logger.info("Data... :" + str(sig.data))
    _logger.info("Dtype:" + str(sig.data.dtype))
    _logger.info("Saving... ")

    da.to_zarr(sig.data, directory + "_zarr", overwrite=True)

    #sig.save(directory + ".hspy",
    #         compression=False,
    #         overwrite=True)

    tock = time.time()
    _logger.info("Total time elapsed : " + str(tock - tick) + " sec")
    return sig
Example #16
    def _to_zarr(self, data, labels, location):

        data = da.to_zarr(data, location, component='data', compute=False)
        labels = da.to_zarr(labels, location, component='labels', compute=False)

        return data, labels
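
Because both writes are created with compute=False, they come back as dask delayed objects and can be executed together in a single scheduler pass. A self-contained sketch of the same idea using da.to_zarr directly (the store path is illustrative):

import dask
import dask.array as da

data = da.random.random((100, 100), chunks=(50, 50))
labels = da.zeros((100,), chunks=50, dtype='int32')

d1 = da.to_zarr(data, 'pair.zarr', component='data', compute=False)
d2 = da.to_zarr(labels, 'pair.zarr', component='labels', compute=False)
dask.compute(d1, d2)  # both arrays are written in one pass
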
Example #17
def dask_getJ(self, m, f=None):
    """
        Generate Full sensitivity matrix
    """
    if self._Jmatrix is not None:
        return self._Jmatrix

    self.model = m
    if f is None:
        f = self.fields(m)

    if self.verbose:
        print("Calculating J and storing")

    if os.path.exists(self.sensitivity_path):
        shutil.rmtree(self.sensitivity_path, ignore_errors=True)

        # Wait for the system to clear out the directory
        while os.path.exists(self.sensitivity_path):
            pass

    m_size = self.model.size
    count = 0
    for source in self.survey.source_list:
        u_source = f[source, self._solutionType]
        for rx in source.receiver_list:
            PT = rx.getP(self.mesh, rx.projGLoc(f)).toarray().T
            df_duT = PT
            # Find a block of receivers
            n_block_col = int(np.ceil(df_duT.size * 8 * 1e-9 / self.max_ram))

            n_col = int(np.ceil(df_duT.shape[1] / n_block_col))

            nrows = int(
                m_size /
                np.ceil(m_size * n_col * 8 * 1e-6 / self.max_chunk_size))
            ind = 0
            for col in range(n_block_col):
                ATinvdf_duT = da.asarray(self.Ainv *
                                         df_duT[:, ind:ind + n_col]).rechunk(
                                             (nrows, n_col))
                dA_dmT = self.getADeriv(u_source, ATinvdf_duT, adjoint=True)
                # du_dmT = -da.from_delayed(dask.delayed(dA_dmT), shape=(self.model.size, n_col), dtype=float)
                if n_col > 1:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, n_col),
                                             dtype=float)
                else:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, ),
                                             dtype=float)
                blockName = self.sensitivity_path + "J" + str(count) + ".zarr"

                da.to_zarr((du_dmT.T).rechunk("auto"), blockName)
                del ATinvdf_duT
                count += 1
                ind += n_col

    dask_arrays = []
    for ii in range(count):
        blockName = self.sensitivity_path + "J" + str(ii) + ".zarr"
        J = da.from_zarr(blockName)
        # Stack all the source blocks in one big zarr
        dask_arrays.append(J)

    rowChunk, colChunk = compute_chunk_sizes(self.survey.nD, m_size,
                                             self.max_chunk_size)
    self._Jmatrix = da.vstack(dask_arrays).rechunk((rowChunk, colChunk))
    self.Ainv.clean()

    return self._Jmatrix
Example #18
import h5py
from glob import glob
import os
import dask.array as da

filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))
dsets = [h5py.File(filename, mode='r')['/t2m'] for filename in filenames]

arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]

x = da.stack(arrays, axis=0)

result = x[:, ::2, ::2]

da.to_zarr(result, os.path.join('data', 'myfile.zarr'), overwrite=True)
Example #19
def dask_getJ(self, m, f=None):
    """
        Generate Full sensitivity matrix
    """

    if self._Jmatrix is not None:
        return self._Jmatrix
    if f is None:
        f = self.fields(m)

    if self.verbose:
        print("Calculating J and storing")

    if self._mini_survey is not None:
        # Need to use _Jtvec for this operation currently...
        J = self._Jtvec(m=m, v=None, f=f).T
        self._Jmatrix = da.from_array(J)
        return self._Jmatrix

    if os.path.exists(self.sensitivity_path):
        shutil.rmtree(self.sensitivity_path, ignore_errors=True)

        # Wait for the system to clear out the directory
        while os.path.exists(self.sensitivity_path):
            pass

    m_size = self.model.size
    count = 0
    for source in self.survey.source_list:
        u_source = f[source, self._solutionType]
        for rx in source.receiver_list:
            # wrt f, need possibility wrt m
            PTv = rx.evalDeriv(source, self.mesh, f).toarray().T

            df_duTFun = getattr(f, "_{0!s}Deriv".format(rx.projField), None)
            df_duT, df_dmT = df_duTFun(source, None, PTv, adjoint=True)

            # Find a block of receivers
            n_block_col = int(np.ceil(df_duT.size * 8 * 1e-9 / self.max_ram))

            n_col = int(np.ceil(df_duT.shape[1] / n_block_col))

            nrows = int(
                m_size /
                np.ceil(m_size * n_col * 8 * 1e-6 / self.max_chunk_size))
            ind = 0
            for col in range(n_block_col):
                ATinvdf_duT = da.asarray(self.Ainv *
                                         df_duT[:, ind:ind + n_col]).rechunk(
                                             (nrows, n_col))

                dA_dmT = self.getADeriv(u_source, ATinvdf_duT, adjoint=True)

                dRHS_dmT = self.getRHSDeriv(source, ATinvdf_duT, adjoint=True)

                if n_col > 1:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, n_col),
                                             dtype=float)
                else:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, ),
                                             dtype=float)

                if not isinstance(dRHS_dmT, Zero):
                    du_dmT += da.from_delayed(dask.delayed(dRHS_dmT),
                                              shape=(m_size, n_col),
                                              dtype=float)

                if not isinstance(df_dmT, Zero):
                    du_dmT += da.from_delayed(df_dmT,
                                              shape=(m_size, n_col),
                                              dtype=float)

                blockName = self.sensitivity_path + "J" + str(count) + ".zarr"
                da.to_zarr((du_dmT.T).rechunk("auto"), blockName)
                del ATinvdf_duT
                count += 1

                ind += n_col

    dask_arrays = []
    for ii in range(count):
        blockName = self.sensitivity_path + "J" + str(ii) + ".zarr"
        J = da.from_zarr(blockName)
        # Stack all the source blocks in one big zarr
        dask_arrays.append(J)

    rowChunk, colChunk = compute_chunk_sizes(self.survey.nD, m_size,
                                             self.max_chunk_size)
    self._Jmatrix = da.vstack(dask_arrays).rechunk((rowChunk, colChunk))
    self.Ainv.clean()

    return self._Jmatrix
Example #20
def write_zarr(uri, data, internal_path="/"):
    data = data.rechunk("auto")
    da.to_zarr(data, uri, component=internal_path, overwrite=True)
    return uri
Example #21
def local_affine_to_position_field(shape,
                                   spacing,
                                   local_affines,
                                   output,
                                   blocksize=[256,] * 3):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(
            job_extra=["-P multifish"],
            cores=4,
            memory="64GB",
            ncpus=4,
            threads_per_worker=8,
            mem=64000,
        )
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # augment the blocksize by the fixed overlap size
        pads = [2 * int(round(x / 8)) for x in blocksize]
        blocksize_with_overlap = np.array(blocksize) + pads

        # get a grid used for each affine
        grid = position_grid_dask(blocksize_with_overlap,
                                  list(blocksize_with_overlap))
        grid = grid * spacing.astype(np.float32)

        # wrap local_affines as dask array
        local_affines_da = da.from_array(local_affines, chunks=(1, 1, 1, 3, 4))

        # compute affine transforms as position coordinates, lazy dask arrays
        coords = da.map_blocks(
            affine_to_grid_dask,
            local_affines_da,
            grid=grid,
            displacement=True,
            new_axis=[5, 6],
            chunks=(1, 1, 1) + tuple(grid.shape),
            dtype=np.float32,
        )

        # stitch affine position fields
        coords = stitch.stitch_fields(coords, blocksize)

        # crop to original shape
        coords = coords[:shape[0], :shape[1], :shape[2]]

        # convert to position field
        coords = coords + position_grid_dask(
            shape, blocksize) * spacing.astype(np.float32)
        coords = da.around(coords, decimals=2)

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        coords_disk = zarr.open(
            output,
            'w',
            shape=coords.shape,
            chunks=tuple(blocksize + [3,]),
            dtype=coords.dtype,
            compressor=compressor,
        )
        da.to_zarr(coords, coords_disk)

        # return pointer to zarr file
        return coords_disk
Example #22
File: deform.py Project: prete/bigstream
def tiled_deformable_align(
    fixed,
    moving,
    fixed_spacing,
    moving_spacing,
    blocksize,
    transpose=[False] * 2,
    global_affine=None,
    local_affines=None,
    write_path=None,
    lazy=True,
    deform_kwargs={},
    #    cluster_kwargs={},
):
    """
    """

    # get number of blocks required
    block_grid = np.ceil(np.array(fixed.shape) / blocksize).astype(int)
    nblocks = np.prod(block_grid)

    # get true field shape
    original_shape = fixed.shape
    if transpose[0]:
        original_shape = original_shape[::-1]

    # get affine position field
    affine_pf = None
    if global_affine is not None or local_affines is not None:
        if local_affines is None:
            local_affines = np.empty(
                tuple(block_grid) + (3, 4),
                dtype=np.float32,
            )
            local_affines[..., :, :] = np.eye(4)[:3, :]
        affine_pf = transform.local_affines_to_position_field(
            original_shape,
            fixed_spacing,
            blocksize,
            local_affines,
            global_affine=global_affine,
            lazy=True,
            #cluster_kwargs=cluster_kwargs,
        )

    # distributed computations done in cluster context
    #with ClusterWrap.cluster(**cluster_kwargs) as cluster:
    #    if write_path is not None or not lazy:
    #        cluster.scale_cluster(nblocks + WORKER_BUFFER)

    # wrap images as dask arrays
    fixed_da = da.from_array(fixed)
    moving_da = da.from_array(moving)

    # in case xyz convention is flipped for input file
    if transpose[0]:
        fixed_da = fixed_da.transpose(2, 1, 0)
    if transpose[1]:
        moving_da = moving_da.transpose(2, 1, 0)

    # pad the ends to fill in the last blocks
    pads = []
    for x, y in zip(original_shape, blocksize):
        pads += [(0, y - x % y) if x % y > 0 else (0, 0)]
    fixed_da = da.pad(fixed_da, pads)
    moving_da = da.pad(moving_da, pads)

    # chunk to blocksize
    fixed_da = fixed_da.rechunk(tuple(blocksize))
    moving_da = moving_da.rechunk(tuple(blocksize))

    # wrap deformable function
    def wrapped_deformable_align(x, y):
        warp = deformable_align(
            x,
            y,
            fixed_spacing,
            moving_spacing,
            **deform_kwargs,
        )
        return warp.reshape((1, 1, 1) + warp.shape)

    # deform all chunks
    overlaps = tuple([int(round(x / 8)) for x in blocksize])
    out_blocks = [x + 2 * y for x, y in zip(blocksize, overlaps)]
    out_blocks = [1, 1, 1] + out_blocks + [3,]

    warps = da.map_overlap(
        wrapped_deformable_align,
        fixed_da,
        moving_da,
        depth=overlaps,
        boundary=0,
        trim=False,
        align_arrays=False,
        dtype=np.float32,
        new_axis=[3, 4, 5, 6],
        chunks=out_blocks,
    )

    # stitch neighboring displacement fields
    warps = stitch.stitch_fields(warps, blocksize)

    # crop any pads
    warps = warps[:original_shape[0], :original_shape[1], :original_shape[2]]

    # TODO refactor transform.compose_position_fields
    #      replace this approximation
    # compose with affine position field
    if affine_pf is not None:
        final_field = affine_pf + warps
    else:
        final_field = warps + transform.position_grid_dask(
            original_shape,
            blocksize,
        )

    # if user wants to write to disk
    if write_path is not None:
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        final_field_disk = zarr.open(
            write_path,
            'w',
            shape=final_field.shape,
            chunks=tuple(blocksize + [3,]),
            dtype=final_field.dtype,
            compressor=compressor,
        )
        da.to_zarr(final_field, final_field_disk)

    # if user wants to compute and return full field
    if not lazy:
        return final_field.compute()

    # if user wants to return compute graph w/o executing
    if lazy:
        return final_field
Example #23
import numpy as np
import h5py
import dask.array as da

filename = '/Users/pbw/data/sample_A/sample_A_20160501.hdf'
source_data = h5py.File(filename, 'r')
raw = np.asarray(source_data['volumes/raw'])
labels = np.asarray(source_data['volumes/labels/neuron_ids'])

raw_dask = da.from_array(raw, chunks=(1, 1250, 1250))
da.to_zarr(raw_dask, 'raw.zarr')
labels_dask = da.from_array(labels, chunks=(1, 1250, 1250))
da.to_zarr(labels_dask, 'labels.zarr')
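
An alternative to two separate stores, assuming the same raw_dask and labels_dask arrays, is to keep both volumes as components of a single zarr store (the store name below is illustrative):

da.to_zarr(raw_dask, 'sample_A.zarr', component='volumes/raw')
da.to_zarr(labels_dask, 'sample_A.zarr', component='volumes/labels/neuron_ids')
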
Example #24
def deformable_align_distributed(
    fixed, moving,
    fixed_vox, moving_vox,
    write_path,
    cc_radius,
    gradient_smoothing,
    field_smoothing,
    iterations,
    shrink_factors,
    smooth_sigmas,
    step,
    blocksize=[256,]*3,
    cluster_extra=["-P multifish"],
    transpose=False,
):
    """
    """

    # distributed computations done in cluster context
    with distributed.distributedState() as ds:

        # get number of blocks required
        block_grid = np.ceil(np.array(fixed.shape) / blocksize)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(
            job_extra=cluster_extra,
            cores=4, memory="64GB", ncpus=4, threads_per_worker=8, mem=64000,
        )
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap images as dask arrays
        fixed_da = da.from_array(fixed)
        moving_da = da.from_array(moving)

        # in case xyz convention is flipped for input file
        if transpose:
            fixed_da = fixed_da.transpose(2,1,0)

        # pad the ends to fill in the last blocks
        orig_sh = fixed_da.shape
        pads = [(0, y - x % y) if x % y != 0 else (0, 0) for x, y in zip(orig_sh, blocksize)]
        fixed_da = da.pad(fixed_da, pads)
        moving_da = da.pad(moving_da, pads)
        fixed_da = fixed_da.rechunk(tuple(blocksize))
        moving_da = moving_da.rechunk(tuple(blocksize))

        # wrap deformable function to simplify passing parameters
        def my_deformable_align(x, y):
            return deformable_align(
                x, y, fixed_vox, moving_vox,
                cc_radius, gradient_smoothing, field_smoothing,
                iterations, shrink_factors, smooth_sigmas, step,
            )

        # deform all chunks
        overlaps = tuple([int(round(x/8)) for x in blocksize])
        out_blocks = [1,1,1] + [x + 2*y for x, y in zip(blocksize, overlaps)] + [3,]

        warps = da.map_overlap(
            my_deformable_align, fixed_da, moving_da,
            depth=overlaps,
            boundary='reflect',
            trim=False,
            align_arrays=False,
            dtype=np.float32,
            new_axis=[3,4,5,6,],
            chunks=out_blocks,
        )

        # stitch neighboring displacement fields
        warps = stitch.stitch_fields(warps, blocksize)

        # crop any pads
        warps = warps[:orig_sh[0], :orig_sh[1], :orig_sh[2]]

        # convert to position field
        warps = warps + transform.position_grid_dask(orig_sh, blocksize)

        # write result to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        warps_disk = zarr.open(write_path, 'w',
            shape=warps.shape, chunks=tuple(blocksize + [3,]),
            dtype=warps.dtype, compressor=compressor,
        )
        da.to_zarr(warps, warps_disk)

        # return reference to zarr data store
        return warps_disk
Example #25
    def test_zarr_functionament(self):
        # with shape
        np_array = np.random.randint(1, 10, size=1000)
        array = da.from_array(np_array)

        with TemporaryDirectory() as tmpdir:
            delayed = da.to_zarr(array, url=tmpdir,
                                 compute=False, component='/data')
            dask.compute(delayed)

            z_object = zarr.open_group(tmpdir, mode='r')

            assert np.all(np_array == z_object.data[:])

        # def without_shape():
        np_array = np.random.randint(1, 10, size=1000000)
        array = da.from_array(np_array)

        array = array[array > 5]

        with TemporaryDirectory() as tmpdir:
            array.compute_chunk_sizes()
            delayed = da.to_zarr(array, url=tmpdir,
                                 compute=False, component='/data')
            dask.compute(delayed)

            z_object = zarr.open_group(tmpdir, mode='r')

            assert np.all(np_array[np_array > 5] == z_object.data[:])

        # without_shape2
        np_array = np.random.randint(1, 10, size=10000000)
        array = da.from_array(np_array)

        array = array[array > 5]

        with TemporaryDirectory() as tmpdir:
            array.compute_chunk_sizes()
            delayed = da.to_zarr(array, url=tmpdir,
                                 compute=False, component='/data')
            dask.compute(delayed)

            z_object = zarr.open_group(tmpdir, mode='r')

            assert np.all(np_array[np_array > 5] == z_object.data[:])

        # write_chunks
        chunks = []

        sizes = (1, 2, 3)
        # total_size = sum(sizes)

        for i, n in enumerate(sizes):
            chunks.append(np.full(n, (i,)))
        with TemporaryDirectory() as tmpdir:
            store = zarr.DirectoryStore(tmpdir)
            root = zarr.group(store=store, overwrite=True)
            dataset = root.create_dataset('test', shape=(0,),
                                          chunks=chunks[0].shape,
                                          dtype=chunks[0].dtype)

            # offset = 0
            for chunk in chunks:
                dataset.append(chunk)
Example #26
def main(src_dir, dst_dir, remap, flip, host, mip):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    logger = logging.getLogger(__name__)

    src_ds = load_dataset(src_dir, remap, flip)
    desc = tuple(f"{k}={v}"
                 for k, v in zip(("x", "y", "z"), reversed(src_ds.tile_shape)))
    logger.info(f"tiling dimension ({', '.join(desc)})")

    try:
        views = src_ds.index.get_level_values("view").unique().values
        if len(views) > 1:
            view = prompt_options("Please select a view: ", views)
            src_ds.drop(
                src_ds.iloc[
                    src_ds.index.get_level_values("view") != view].index,
                inplace=True,
            )
            logger.debug(f'found multiple views, using "{view}"')
        else:
            logger.debug(f"single-view dataset")
    except KeyError:
        # no need to differentiate different view
        logger.debug("not a multi-view dataset")

    try:
        channels = src_ds.index.get_level_values("channel").unique().values
        if len(channels) > 1:
            channel = prompt_options("Please select a channel: ", channels)
            src_ds.drop(
                src_ds.iloc[
                    src_ds.index.get_level_values("channel") != channel].index,
                inplace=True,
            )
            logger.debug(f'found multiple channels, using "{channel}"')
        else:
            logger.debug(f"single-channel dataset")
    except KeyError:
        # no need to differentiate different channels
        logger.debug("not a multi-channel dataset")

    # preview summary
    print(src_ds.inventory)

    if host == "local":
        client = Client(LocalCluster())  # wrap in a Client so .submit() below works
    else:
        client = Client(host)
    logger.info(client)

    # create directives
    preview = run(src_ds, mip=mip)
    logger.info(f"final preview {preview.shape}, {preview.dtype}")

    # saving the result to zarr format
    zarr_path = f"{dst_dir.rstrip(os.sep)}.zarr"
    chunks = (8, 512, 512)
    logger.info(f'generating "{os.path.basename(zarr_path)}"')
    logger.debug(
        f"shape={preview.shape}, dtype={preview.dtype}, chunks={chunks}")

    try:
        logger.info("dumping to zarr directory store, waiting...")
        preview = preview.rechunk(chunks)
        da.to_zarr(preview, zarr_path, overwrite=False)
    except ValueError:
        logger.warning("found existing zarr store, reusing it")

    logger.info("release dask array")
    del preview

    logger.info(f'saving layered preview to "{dst_dir}"')
    try:
        os.makedirs(dst_dir)
    except FileExistsError:
        logger.warning(f'"{dst_dir}" exists')
        pass

    logger.info(f"reload data from zarr")
    preview = da.from_zarr(zarr_path)

    futures = []
    with tqdm(total=preview.shape[0]) as pbar:
        for i, layer in enumerate(preview):
            fname = f"layer_{i+1:04d}.tif"
            pbar.set_description(fname)
            path = os.path.join(dst_dir, fname)
            future = client.submit(imageio.imwrite, path, layer)
            futures.append(future)
            pbar.update(1)

    # wait for the submitted tasks to complete
    with tqdm(total=len(futures),
              bar_format="{l_bar}{bar:24}{r_bar}{bar:-10b}") as pbar:
        for future in as_completed(futures, with_results=False):
            try:
                future.result()  # ensure we do not have an exception
                pbar.update(1)
            except Exception as error:
                logger.exception(error)
            future.release()

    logger.info("closing scheduler connection")
    client.close()