def execute(array, write_path=None, **kwargs):
    """Evaluate a lazy array inside a ``ClusterWrap.cluster`` context.

    Parameters
    ----------
    array
        Lazy array exposing ``shape``, ``chunksize``, ``dtype`` and
        ``compute()`` (presumably a dask array -- confirm with callers).
    write_path
        When given, the result is written to a zarr array opened at this
        path in ``'w'`` mode; when ``None`` the result is computed and
        returned to the calling process.
    **kwargs
        Forwarded unchanged to ``ClusterWrap.cluster``.

    Returns
    -------
    The zarr array opened at ``write_path``, or ``array.compute()`` when
    no path was supplied.
    """
    with ClusterWrap.cluster(**kwargs) as cluster:
        # No destination: hand the computed values straight back.
        if write_path is None:
            return array.compute()

        # Destination given: mirror the array's geometry on disk and
        # stream the chunks into it.
        codec = Blosc(cname='zstd', clevel=4, shuffle=Blosc.BITSHUFFLE)
        target = zarr.open(
            write_path,
            'w',
            shape=array.shape,
            chunks=array.chunksize,
            dtype=array.dtype,
            compressor=codec,
        )
        to_zarr(array, target)
        return target
def _persist(source, path, component=None, storage_options=None, **kwargs):
    """Save the array behind *source* to a zarr store and return a source for it.

    Parameters
    ----------
    source: a DataSource instance to save
        Must provide ``to_dask()``; if that raises ``NotImplementedError``
        the data is obtained with ``read()`` and wrapped as a dask array.
    path: str
        Location of the zarr store, passed as the target to
        ``dask.array.to_zarr``.
    component: str or None
        Path of the array inside the store. The same value is used for
        writing and for the returned source.
    storage_options: dict or None
        Passed to ``to_zarr`` and to the returned ``ZarrArraySource``.
    kwargs: passed on to ``dask.array.to_zarr``

    Returns
    -------
    ZarrArraySource pointing at the data that was just written.
    """
    from dask.array import to_zarr, from_array
    from ..source.zarr import ZarrArraySource

    try:
        arr = source.to_dask()
    except NotImplementedError:
        arr = from_array(source.read(), chunks=-1).rechunk('auto')

    # Write into the very component the returned source will read from;
    # previously the data always went to the store root.
    to_zarr(arr, path, component=component,
            storage_options=storage_options, **kwargs)
    return ZarrArraySource(path, storage_options, component)
def compose_position_fields(fields, spacing, output,
                            blocksize=(256, 256, 256),
                            displacement=None):
    """Sum several position fields and write the composition to zarr.

    Parameters
    ----------
    fields
        Non-empty sequence of array-likes; ``fields[0].shape[:-1]`` is used
        as the spatial shape (assumes a trailing axis of length 3 -- confirm
        with callers).
    spacing
        Array with an ``astype`` method; multiplies the position grid.
    output
        Path handed to ``zarr.open`` in ``'w'`` mode.
    blocksize
        Spatial chunk size; any sequence of three ints.
    displacement
        Must be ``None``; composing displacement fields is not implemented.

    Returns
    -------
    The zarr array opened at ``output``.

    Raises
    ------
    NotImplementedError
        If ``displacement`` is not ``None``. Raised before any cluster
        resources are requested.
    """
    # Fail fast: do not spin up an LSF cluster just to raise afterwards.
    if displacement is not None:
        raise NotImplementedError(
            "composing displacement fields not implemented yet")

    # Downstream code concatenates with lists, so normalise once here.
    blocksize = list(blocksize)

    with distributed.distributedState() as ds:

        # one job per spatial block
        block_grid = np.ceil(
            np.array(fields[0].shape[:-1]) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap fields as dask arrays and accumulate
        fields_da = da.stack(
            [da.from_array(f, chunks=blocksize + [3]) for f in fields])
        composed = da.sum(fields_da, axis=0)

        # each field carries its own copy of the grid; keep exactly one
        grid = position_grid_dask(
            composed.shape[:3], blocksize) * spacing.astype(np.float32)
        composed = composed - (len(fields) - 1) * grid

        # write in parallel to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        composed_disk = zarr.open(
            output,
            'w',
            shape=composed.shape,
            chunks=composed.chunksize,
            dtype=composed.dtype,
            compressor=compressor,
        )
        da.to_zarr(composed, composed_disk)

        # return pointer to zarr file
        return composed_disk
def _to_zarr(  # type: ignore[no-untyped-def]
    arr,
    url,
    component=None,
    storage_options=None,
    overwrite=False,
    compute=True,
    return_stored=False,
    attrs=None,
    **kwargs,
):
    """Variant of ``dask.array.to_zarr`` that also passes ``attrs`` to the
    array-creation step, as part of the same Dask graph.

    The stock ``da.to_zarr`` is first invoked with ``compute=False`` purely
    so that its precondition checks run; its return value is discarded.
    The target array is then created lazily through
    ``_zarr_create_with_attrs`` and ``arr`` is stored into it.

    Returns whatever ``arr.store`` returns for the given ``compute`` and
    ``return_stored`` flags.
    """
    # Dry run of the Dask implementation: validation only.
    da.to_zarr(
        arr,
        url,
        component=component,
        storage_options=storage_options,
        overwrite=overwrite,
        compute=False,
        return_stored=return_stored,
        **kwargs,
    )

    if isinstance(url, str):
        mapper = get_mapper(url, **(storage_options or {}))
    else:
        # assume the object passed is already a mapper
        mapper = url  # pragma: no cover

    # First chunk length along every axis defines the zarr chunk shape.
    first_chunks = [axis_chunks[0] for axis_chunks in arr.chunks]

    lazy_create = dask.delayed(_zarr_create_with_attrs)
    target = lazy_create(
        shape=arr.shape,
        chunks=first_chunks,
        dtype=arr.dtype,
        store=mapper,
        path=component,
        overwrite=overwrite,
        attrs=attrs,
        **kwargs,
    )
    return arr.store(
        target, lock=False, compute=compute, return_stored=return_stored)
def global_affine_to_position_field(shape, spacing, affine, output,
                                    blocksize=[
                                        256,
                                    ] * 3):
    """Evaluate an affine on a position grid and write the result to zarr.

    A position grid for ``shape`` is built with ``position_grid_dask``,
    scaled by ``spacing`` (cast to float32), passed through
    ``affine_to_grid_dask`` and rounded to two decimals before being
    written to ``output``.

    Returns the zarr array opened at ``output``.
    """
    with distributed.distributedState() as ds:

        # one LSF job per block of the output grid
        blocks_per_axis = np.ceil(np.array(shape) / blocksize).astype(int)
        n_jobs = np.prod(blocks_per_axis)

        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=n_jobs)

        # lazy position coordinates after applying the affine
        positions = position_grid_dask(shape, blocksize) * spacing.astype(
            np.float32)
        field = da.around(affine_to_grid_dask(affine, positions), decimals=2)

        # write in parallel as 4D array to zarr file
        out_chunks = tuple(blocksize + [3])
        field_disk = zarr.open(
            output,
            'w',
            shape=field.shape,
            chunks=out_chunks,
            dtype=field.dtype,
            compressor=Blosc(cname='zstd', clevel=9,
                             shuffle=Blosc.BITSHUFFLE),
        )
        da.to_zarr(field, field_disk)

        # return pointer to zarr file
        return field_disk
def _data_to_source(arr, path, component=None, storage_options=None, **kwargs):
    """Write an array-like to a zarr store and return a source for it.

    Parameters
    ----------
    arr
        Array-like object (as judged by ``dask.utils.is_arraylike``).
        Objects without an ``npartitions`` attribute are wrapped with
        ``dask.array.from_array(..., chunks='auto')``.
    path: str
        Location of the zarr store.
    component: str or None
        Path of the array inside the store; used both for writing and for
        the returned source.
    storage_options: dict or None
        Passed to ``to_zarr`` and to the returned ``ZarrArraySource``.
    kwargs: passed on to ``dask.array.to_zarr``

    Raises
    ------
    NotImplementedError
        If ``arr`` is not array-like.
    """
    from dask.utils import is_arraylike
    from dask.array import to_zarr, from_array
    from ..source.zarr import ZarrArraySource

    if not is_arraylike(arr):
        raise NotImplementedError
    if not hasattr(arr, 'npartitions'):
        arr = from_array(arr, chunks='auto')

    # Write to the component the returned source reads from; previously
    # the data always landed at the store root regardless of `component`.
    to_zarr(arr, path, component=component,
            storage_options=storage_options, **kwargs)
    return ZarrArraySource(path, storage_options, component)
def merge_arrays(data_path):
    """Stack lazily loaded numpy volumes from two directories into one zarr.

    Files are listed from ``data_path`` and then from the sibling directory
    ``data_path[:-1] + "_2"``. Within each directory they are ordered by the
    integer found in characters ``[8:-4]`` of the file name, wrapped as
    delayed ``numpy.load`` calls declared as ``(256, 256, 256)`` float
    blocks, and stacked along a new leading axis. The result is written to
    ``data_path + 'zarr_data_full'``.

    ``data_path`` is expected to end with a path separator (the sibling
    directory name is derived by dropping its last character).
    """
    numpyload = delayed(numpy.load, pure=True)

    def filenum(x):
        return int(x[8:-4])

    def lazy_blocks(directory):
        # One lazy (256, 256, 256) block per file, in numeric file order.
        blocks = []
        for symbol in sorted(os.listdir(directory), key=filenum):
            # Load from the directory that was actually listed; the second
            # directory's files used to be looked up under `data_path`.
            arr_name = os.path.join(directory, symbol)
            blocks.append(
                da.from_delayed(numpyload(arr_name), (256, 256, 256), float))
            print(arr_name)
        return blocks

    array_list = lazy_blocks(data_path) + lazy_blocks(data_path[:-1] + "_2")
    z = da.stack(array_list)

    # NOTE(review): the original called z.rechunk((256, 256, 256, 1)) and
    # discarded the result, so no rechunking ever took effect. The no-op is
    # dropped here; confirm the intended chunking before re-adding it.
    # NOTE(review): the store is created under `data_path`, so a second run
    # will list it alongside the input files -- confirm this is acceptable.
    da.to_zarr(z, data_path + 'zarr_data_full')
def resample_frames(
    frames,
    frames_spacing,
    transforms,
    write_path,
    mask=None,
    time_stride=1,
    compression_level=4,
    cluster_kwargs={},
):
    """Apply one transform per frame and write the aligned frames to zarr.

    Parameters
    ----------
    frames : dict
        Must contain ``'folder'``, ``'prefix'`` and ``'suffix'``; plus
        ``'dataset_path'`` for HDF5 input, or ``'dtype'`` and ``'shape'``
        for STACK input.
    frames_spacing
        Passed to ``apply_transform`` as both fixed and moving spacing.
    transforms
        Array with 2 or 3 dimensions; strided by ``time_stride`` along the
        first axis. A 1D per-frame entry is split into a 4x4 matrix (first
        16 values) and a remainder passed as a second transform.
    write_path
        Destination handed to ``zarr.open`` in ``'w'`` mode.
    mask
        Optional array multiplied into every aligned frame; resampled with
        nearest-neighbour ``zoom`` when its shape differs from a frame.
    time_stride
        Stride applied to both the frames and the transforms.
    compression_level
        Blosc/zstd compression level of the output.
    cluster_kwargs
        Forwarded to ``ClusterWrap.cluster``; never modified here.

    Returns
    -------
    The zarr array opened at ``write_path``.

    Raises
    ------
    ValueError
        If ``frames['suffix']`` is neither an HDF5 nor a STACK extension.
    """
    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # create dask array of all frames
        if csio.testPathExtensionForHDF5(frames['suffix']):
            frames_data = csio.daskArrayBackedByHDF5(
                frames['folder'],
                frames['prefix'],
                frames['suffix'],
                frames['dataset_path'],
                stride=time_stride,
            )
        elif csio.testPathExtensionForSTACK(frames['suffix']):
            frames_data = csio.daskArrayBackedBySTACK(
                frames['folder'],
                frames['prefix'],
                frames['suffix'],
                frames['dtype'],
                frames['shape'],
                stride=time_stride,
            )
        else:
            # Previously this fell through and failed later with an
            # unbound-variable error.
            raise ValueError(
                f"unsupported frames suffix: {frames['suffix']!r}")

        frame_chunks = [1] + list(frames_data.shape[1:])

        # wrap transforms as dask array
        # extra dimension(s) to match frames_data ndims
        if len(transforms.shape) == 3:
            transforms = transforms[::time_stride, None, :, :]
        elif len(transforms.shape) == 2:
            transforms = transforms[::time_stride, None, None, :]
        transforms_d = da.from_array(
            transforms, chunks=(1, ) + transforms[0].shape)

        # wrap mask
        mask_d = None
        if mask is not None:
            mask_sh, frame_sh = mask.shape, frames_data.shape[1:]
            if mask_sh != frame_sh:
                mask = zoom(mask, np.array(frame_sh) / mask_sh, order=0)
            mask_d = cluster.client.scatter(mask, broadcast=True)

        # wrap transform function
        def wrapped_apply_transform(mov, t, mask_d=None):
            mov = mov.squeeze()
            t = t.squeeze()

            # just an affine matrix
            transform_list = [t]

            # affine plus bspline
            if len(t.shape) == 1:
                transform_list = [t[:16].reshape((4, 4)), t[16:]]

            # apply transform(s)
            aligned = apply_transform(
                mov,
                mov,
                frames_spacing,
                frames_spacing,
                transform_list=transform_list,
            )
            if mask_d is not None:
                aligned = aligned * mask_d
            # restore the leading frame axis dropped by squeeze
            return aligned[None, ...]

        # apply transform to all frames
        frames_aligned = da.map_blocks(
            wrapped_apply_transform,
            frames_data,
            transforms_d,
            mask_d=mask_d,
            dtype=np.uint16,
            chunks=frame_chunks,
        )

        # write in parallel as 4D array to zarr file
        compressor = Blosc(
            cname='zstd',
            clevel=compression_level,
            shuffle=Blosc.BITSHUFFLE,
        )
        aligned_disk = zarr.open(write_path,
                                 'w',
                                 shape=frames_aligned.shape,
                                 chunks=frame_chunks,
                                 dtype=frames_aligned.dtype,
                                 compressor=compressor)
        da.to_zarr(frames_aligned, aligned_disk)

        # return reference to zarr store
        return aligned_disk
def write_zarr(uri, data, path="/"):
    """Write ``data`` to the zarr store at ``uri`` and return ``uri``.

    The array is stored under the component ``path``; anything already
    present there is overwritten.
    """
    from dask.array import to_zarr

    to_zarr(data, uri, component=path, overwrite=True)
    return uri
def dask_linear_operator(self):
    """Assemble the sensitivity rows lazily and materialise them.

    One delayed call to ``self.evaluate_integral`` is created per
    (receiver location, active component) pair and the rows are stacked
    into a single dask array, rechunked according to ``self.chunk_format``
    and ``self.max_chunk_size``.

    What is returned depends on ``self.store_sensitivities``:

    * ``"disk"``: a dask array backed by
      ``self.sensitivity_path + "sensitivity.zarr"``. An existing store is
      reused only when its shape and chunking match the freshly built
      stack; otherwise (store missing or mismatched) the stack is computed
      and written, overwriting anything already there.
    * ``"forward_only"``: the computed product ``stack @ self.model``.
    * anything else: the stack computed into local memory.
    """
    self.nC = self.modelMap.shape[0]

    n_data_comp = len(self.survey.components)
    components = np.array(list(self.survey.components.keys()))
    active_components = np.hstack(
        [np.c_[values] for values in self.survey.components.values()]
    ).tolist()

    row = delayed(self.evaluate_integral, pure=True)
    rows = [
        array.from_delayed(
            row(receiver_location, components[component]),
            dtype=np.float32,
            shape=(n_data_comp, self.nC),
        )
        for receiver_location, component in zip(
            self.survey.receiver_locations.tolist(), active_components
        )
    ]
    stack = array.vstack(rows)

    # Chunking options
    if self.chunk_format == "row" or self.store_sensitivities == "forward_only":
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        # Autochunking by rows is faster and more memory efficient for
        # very large problems sensitivity and forward calculations
        stack = stack.rechunk({0: "auto", 1: -1})
    elif self.chunk_format == "equal":
        # Manual chunks for equal number of blocks along rows and columns.
        # Optimal for Jvec and Jtvec operations
        row_chunk, col_chunk = compute_chunk_sizes(*stack.shape, self.max_chunk_size)
        stack = stack.rechunk((row_chunk, col_chunk))
    else:
        # Auto chunking by columns is faster for Inversions
        config.set({"array.chunk-size": f"{self.max_chunk_size}MiB"})
        stack = stack.rechunk({0: -1, 1: "auto"})

    if self.store_sensitivities == "disk":
        sens_name = self.sensitivity_path + "sensitivity.zarr"

        if os.path.exists(sens_name):
            kernel = array.from_zarr(sens_name)
            # Check that loaded kernel matches supplied data and mesh.
            # NOTE(review): the element-wise chunk comparison assumes both
            # arrays have the same number of chunks per axis -- confirm.
            if np.all(
                np.r_[
                    np.any(np.r_[kernel.chunks[0]] == stack.chunks[0]),
                    np.any(np.r_[kernel.chunks[1]] == stack.chunks[1]),
                    np.r_[kernel.shape] == np.r_[stack.shape],
                ]
            ):
                print("Zarr file detected with same shape and chunksize ... re-loading")
                return kernel

        # Reached when there is no store yet, or when the existing one does
        # not match; a stale kernel must never be handed back.
        print("Writing Zarr file to disk")
        with ProgressBar():
            print("Saving kernel to zarr: " + sens_name)
            kernel = array.to_zarr(
                stack, sens_name, compute=True, return_stored=True, overwrite=True
            )

    elif self.store_sensitivities == "forward_only":
        with ProgressBar():
            print("Forward calculation: ")
            pred = (stack @ self.model).compute()
        return pred

    else:
        print(stack.chunks)
        with ProgressBar():
            print("Computing sensitivities to local ram")
            kernel = array.asarray(stack.compute())

    return kernel
def convert(resized, target_array):
    """Store ``resized`` into ``target_array`` via ``da.to_zarr``.

    Nothing is returned; the result of ``da.to_zarr`` is discarded.
    """
    da.to_zarr(resized, target_array)
    return None
def motionCorrect(
    folder,
    prefix,
    suffix,
    fixed,
    fixed_vox,
    moving_vox,
    write_path,
    dataset_path=None,
    distributed_state=None,
    sigma=7,
    transforms_dir=None,
    **kwargs,
):
    """Rigidly align every frame to ``fixed`` and write the result to zarr.

    Parameters
    ----------
    folder, prefix, suffix
        Passed to ``csio.daskBagOfFilePaths`` to locate the frames.
    fixed, fixed_vox, moving_vox, dataset_path
        Forwarded to ``rigidAlign`` / ``applyTransform`` for every frame.
    write_path
        Destination handed to ``zarr.open`` in ``'w'`` mode.
    distributed_state
        Existing distributed state to reuse. When ``None`` a new one is
        created, configured and initialised here, and its client is closed
        again before returning -- also when an error occurs.
    sigma
        Standard deviation of the Gaussian smoothing applied to the
        per-frame parameters along the frame axis.
    transforms_dir
        When given, each frame's transform is saved there as
        ``<frame basename>_rigid.mat``.
    **kwargs
        ``max_workers`` (default 1250) caps the number of cluster jobs.

    Returns
    -------
    The zarr array opened at ``write_path``.
    """
    # set up the distributed environment
    ds = distributed_state
    owns_state = distributed_state is None
    if owns_state:
        ds = csd.distributedState()
        # writing large compressed chunks locks GIL for a long time
        ds.modifyConfig({
            'distributed.comm.timeouts.connect': '60s',
            'distributed.comm.timeouts.tcp': '180s',
        })
        ds.initializeLSFCluster(job_extra=["-P scicompsoft"])
        ds.initializeClient()

    try:
        # create (lazy) dask bag from all frames
        frames = csio.daskBagOfFilePaths(folder, prefix, suffix)
        nframes = frames.npartitions

        # scale cluster carefully
        max_workers = kwargs.get('max_workers', 1250)
        ds.scaleCluster(njobs=min(nframes, max_workers))

        # align all
        dfixed = delayed(fixed)
        dfixed_vox = delayed(fixed_vox)
        dmoving_vox = delayed(moving_vox)
        ddataset_path = delayed(dataset_path)
        params = frames.map(
            lambda b, w, x, y, z: rigidAlign(w, b, x, y, dataset_path=z),
            w=dfixed,
            x=dfixed_vox,
            y=dmoving_vox,
            z=ddataset_path,
        ).compute()
        params = np.array(list(params))

        # (weak) outlier removal and smoothing
        params = percentile_filter(params, 50, footprint=np.ones((3, 1)))
        params = gaussian_filter1d(params, sigma, axis=0)

        # write transforms as matrices
        if transforms_dir is not None:
            paths = list(frames)
            for ind, p in enumerate(params):
                transform = _parametersToRigidMatrix(p)
                basename = os.path.splitext(os.path.basename(paths[ind]))[0]
                path = os.path.join(transforms_dir, basename) + '_rigid.mat'
                np.savetxt(path, transform)

        # apply transforms to all images
        params = db.from_sequence(params, npartitions=nframes)
        transformed = frames.map(
            lambda b, x, y, z: applyTransform(b, x, y, dataset_path=z),
            x=dmoving_vox,
            y=params,
            z=ddataset_path,
        ).to_delayed()

        # convert to a (lazy) 4D dask array; shape and dtype are taken
        # from the first transformed frame
        sh = transformed[0][0].shape.compute()
        dd = transformed[0][0].dtype.compute()
        arrays = [da.from_delayed(t[0], sh, dtype=dd) for t in transformed]
        transformed = da.stack(arrays, axis=0)

        # write in parallel as 4D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        transformed_disk = zarr.open(write_path,
                                     'w',
                                     shape=transformed.shape,
                                     chunks=(256, 10, 256, 256),
                                     dtype=transformed.dtype,
                                     compressor=compressor)
        da.to_zarr(transformed, transformed_disk)
    finally:
        # release resources we created ourselves, even on failure
        if owns_state:
            ds.closeClient()

    # return reference to data on disk
    return transformed_disk
def distributed_deltafoverf(
    zarr_path,
    window_size,
    batch_size,
    write_path,
    compression_level=4,
    cluster_kwargs={},
):
    """Compute ``deltafoverf`` over overlapping batches and write to zarr.

    Parameters
    ----------
    zarr_path
        Input zarr array, opened read-only.
    window_size
        Passed to ``deltafoverf``; consecutive batches overlap by this many
        entries along the first axis.
    batch_size
        Number of entries along the first axis read per batch. Must be
        strictly greater than ``window_size``.
    write_path
        Destination handed to ``zarr.open`` in ``'w'`` mode.
    compression_level
        Blosc/zstd compression level of the output.
    cluster_kwargs
        Forwarded to ``ClusterWrap.cluster``; never modified here.

    Returns
    -------
    The zarr array opened at ``write_path``.

    Raises
    ------
    ValueError
        If ``batch_size <= window_size``.
    """
    # The batch start index advances by (batch_size - window_size); with a
    # non-positive step the index loop below would never terminate.
    if batch_size <= window_size:
        raise ValueError("batch_size must be greater than window_size")

    # launch cluster
    with ClusterWrap.cluster(**cluster_kwargs) as cluster:

        # lazy load zarr to get metadata
        metadata = zarr.open(zarr_path, 'r')

        # get block start indices
        step = batch_size - window_size
        start_indices, start_index = [], 0
        while start_index + window_size < metadata.shape[0]:
            start_indices.append(start_index)
            start_index += step

        # convert to dask array, one start index per block
        start_indices_da = da.from_array(start_indices, chunks=(1, ))

        # wrap deltafoverf function; each task re-opens the store itself
        def wrapped_deltafoverf(index):
            zarr_file = zarr.open(zarr_path, 'r')
            data = zarr_file[index[0]:index[0] + batch_size]
            return deltafoverf(data, window_size)

        # map function to each block
        dff = da.map_blocks(
            wrapped_deltafoverf,
            start_indices_da,
            dtype=np.float16,
            new_axis=list(range(1, metadata.ndim)),
            chunks=(step, ) + metadata.chunks[1:],
        )

        # ensure the correct shape and rechunk for faster writing
        dff = dff[:metadata.shape[0] - window_size]
        dff = dff.rechunk((1, ) + metadata.chunks[1:])

        # persist dff before writing to zarr, prevents RAM conflicts
        dff = dff.persist()

        # write to output zarr
        compressor = Blosc(
            cname='zstd',
            clevel=compression_level,
            shuffle=Blosc.BITSHUFFLE,
        )
        dff_disk = zarr.open(
            write_path,
            'w',
            shape=dff.shape,
            chunks=metadata.chunks,
            dtype=dff.dtype,
            compressor=compressor,
        )
        da.to_zarr(dff, dff_disk)

        # return reference to zarr store
        return dff_disk
def apply_position_field(
    mov,
    mov_spacing,
    fix,
    fix_spacing,
    transform,
    output,
    blocksize=[
        256,
    ] * 3,
    order=1,
    transform_spacing=None,
    transpose=[
        False,
    ] * 3,
    depth=(32, 32, 32),
):
    """Resample ``mov`` through a position field and write it to zarr.

    ``fix``, ``mov`` and ``transform`` are wrapped as dask arrays, chunked
    so that their blocks correspond to one another given the respective
    spacings, and passed block-wise (with overlaps) to
    ``interpolate_image_dask``. The result is written to ``output``.

    ``transpose`` holds one flag each for ``fix``, ``mov`` and
    ``transform``; a set flag reverses the three spatial axes of that
    input (and, for the transform, also the order of its components).
    ``order`` is accepted but not used in this function.

    Returns the zarr array opened at ``output``.
    """

    def in_units_of(values, target_spacing):
        # Convert voxel counts on the fixed grid to the target grid.
        return np.round(values * fix_spacing / target_spacing).astype(np.int16)

    with distributed.distributedState() as ds:

        # one job per block of the moving image
        block_grid = np.ceil(np.array(mov.shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(job_extra=["-P multifish"])
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # mov chunking/overlap relative to fix
        m_blocksize = list(in_units_of(blocksize, mov_spacing))
        m_depth = tuple(in_units_of(depth, mov_spacing))

        # transform chunking/overlap relative to fix
        if transform_spacing is None:
            t_blocksize, t_depth = blocksize, depth
        else:
            t_blocksize = list(in_units_of(blocksize, transform_spacing))
            t_depth = tuple(in_units_of(depth, transform_spacing))

        # wrap objects as dask arrays, honouring the transpose flags
        fix_da = da.from_array(fix)
        mov_da = da.from_array(mov)
        transform_da = da.from_array(transform)
        if transpose[0]:
            fix_da = fix_da.transpose(2, 1, 0)
        if transpose[1]:
            mov_da = mov_da.transpose(2, 1, 0)
        if transpose[2]:
            transform_da = transform_da.transpose(2, 1, 0, 3)[..., ::-1]

        # add a trailing unit axis to the images and chunk everything
        fix_da = da.reshape(fix_da, fix_da.shape + (1, ))
        fix_da = fix_da.rechunk(tuple(blocksize + [1]))
        mov_da = da.reshape(mov_da, mov_da.shape + (1, ))
        mov_da = mov_da.rechunk(tuple(m_blocksize + [1]))
        transform_da = transform_da.rechunk(tuple(t_blocksize + [3]))

        # put transform in voxel units
        transform_da = transform_da / mov_spacing

        # map the interpolate function with overlaps; no overlap on the
        # trailing axis of any input
        # TODO: depth should be computed automatically from transform maximum?
        overlaps = [depth + (0, ), m_depth + (0, ), t_depth + (0, )]
        aligned = da.map_overlap(
            interpolate_image_dask,
            fix_da,
            mov_da,
            transform_da,
            blocksize=m_blocksize,
            margin=m_depth,
            depth=overlaps,
            boundary=0,
            dtype=np.uint16,
            align_arrays=False,
        )

        # remove degenerate dimension
        aligned = da.reshape(aligned, aligned.shape[:-1])

        # write in parallel as 3D array to zarr file
        aligned_disk = zarr.open(
            output,
            'w',
            shape=aligned.shape,
            chunks=aligned.chunksize,
            dtype=aligned.dtype,
            compressor=Blosc(cname='zstd', clevel=9,
                             shuffle=Blosc.BITSHUFFLE),
        )
        da.to_zarr(aligned, aligned_disk)

        # return pointer to zarr file
        return aligned_disk
def process(directory,
            threshold=6,
            integrate=False,
            counting=False,
            hdr=None,
            mean_e=256,
            nav_shape=None,
            chunk_shape=None,
            verbose=False):
    """Read the files found in ``directory`` lazily and save them as zarr.

    Parameters
    ----------
    directory
        Folder searched with ``get_files``; the output store is written to
        ``directory + "_zarr"``.
    threshold, integrate, mean_e
        Forwarded to ``_counting_filter_cpu`` when ``counting`` is set.
    counting
        When true, the data is passed block-wise through
        ``_counting_filter_cpu``.
    hdr
        Optional path loaded with ``hs.load``; its ``.data`` is used as the
        ``hdr_mask`` of the counting filter.
    nav_shape, chunk_shape
        Forwarded to the file reader.
    verbose
        When true, debug logging to stdout is enabled on the module logger.

    Returns
    -------
    The lazy signal built by ``dict2signal``.

    Raises
    ------
    FileNotFoundError
        If neither a top/bottom pair nor a seq file was found.
    """
    if verbose:
        _logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(logging.Formatter('%(message)s \n'))
        _logger.addHandler(handler)

    _logger.info(msg="\n\n .SEQ Processor Application (and Counting)...\n"
                 "Created by: Carter Francis ([email protected])\n"
                 "Updated 2021-06-18\n"
                 "------------------\n")
    _logger.info(msg="Version:" + __version__)
    tick = time.time()

    # Keep the first file of every kind that was actually found. Kinds with
    # no files are dropped; the original called .pop() on the empty list,
    # which raises IndexError.
    file_dict = {
        key: paths[0]
        for key, paths in get_files(folder=directory).items()
        if len(paths) > 0
    }

    if "top" in file_dict and "bottom" in file_dict:
        # the split-detector reader does not take a "seq" entry
        file_dict.pop("seq", None)
        data_dict = cel_file_reader(**file_dict,
                                    nav_shape=nav_shape,
                                    chunk_shape=chunk_shape,
                                    lazy=True)
    elif "seq" in file_dict:
        data_dict = file_reader(**file_dict,
                                nav_shape=nav_shape,
                                chunk_shape=chunk_shape,
                                lazy=True)
    else:
        raise FileNotFoundError(
            "no top/bottom pair or seq file found in " + str(directory))

    if hdr is not None:
        hdr = hs.load(hdr).data

    # without an HDR mask or integration the filter output is declared bool
    if hdr is None and integrate is False:
        dtype = bool
    else:
        dtype = np.float32

    if counting:
        data_dict["data"] = data_dict["data"].map_blocks(
            _counting_filter_cpu,
            threshold=threshold,
            integrate=integrate,
            hdr_mask=hdr,
            method="maximum",
            mean_electron_val=mean_e,
            dtype=dtype)

    _logger.info(data_dict)
    sig = dict2signal(data_dict, lazy=True)

    _logger.info("Data... :" + str(sig.data))
    _logger.info("Dtype:" + str(sig.data.dtype))
    _logger.info("Saving... ")
    da.to_zarr(sig.data, directory + "_zarr", overwrite=True)

    tock = time.time()
    _logger.info("Total time elapsed : " + str(tock - tick) + " sec")
    return sig
def _to_zarr(self, data, labels, location):
    """Prepare (without executing) zarr writes for ``data`` and ``labels``.

    Both arrays go to the same ``location``, under the components
    ``'data'`` and ``'labels'`` respectively. ``compute=False`` is passed,
    so the two values returned are whatever ``da.to_zarr`` yields for a
    deferred write.
    """
    pending = [
        da.to_zarr(arr, location, component=name, compute=False)
        for arr, name in ((data, 'data'), (labels, 'labels'))
    ]
    return tuple(pending)
def dask_getJ(self, m, f=None):
    """
    Generate Full sensitivity matrix

    The matrix is built block by block, each block being written to its own
    zarr store ``<sensitivity_path>J<k>.zarr``. The stores are then re-opened
    lazily, stacked vertically, rechunked and cached on ``self._Jmatrix``.

    Parameters
    ----------
    m
        Model; assigned to ``self.model``.
    f
        Fields; computed with ``self.fields(m)`` when ``None``.

    Returns
    -------
    The cached ``self._Jmatrix`` if present, otherwise the newly assembled
    dask array.
    """
    # Reuse the cached matrix when one exists.
    if self._Jmatrix is not None:
        return self._Jmatrix

    self.model = m
    if f is None:
        f = self.fields(m)

    if self.verbose:
        print("Calculating J and storing")

    # Start from an empty sensitivity directory.
    if os.path.exists(self.sensitivity_path):
        shutil.rmtree(self.sensitivity_path, ignore_errors=True)

        # Wait for the system to clear out the directory
        # NOTE(review): busy-waits without sleeping or a timeout; because
        # rmtree ignores errors, a failed delete would spin here forever.
        while os.path.exists(self.sensitivity_path):
            pass

    m_size = self.model.size
    count = 0  # number of zarr blocks written so far
    for source in self.survey.source_list:
        u_source = f[source, self._solutionType]
        for rx in source.receiver_list:
            PT = rx.getP(self.mesh, rx.projGLoc(f)).toarray().T
            df_duT = PT

            # Find a block of receivers
            # Number of column blocks so that one block of 8-byte values
            # stays within self.max_ram (the 1e-9 factor suggests GB --
            # confirm the unit of max_ram).
            # NOTE(review): if df_duT.size is 0, n_block_col becomes 0 and
            # the following division raises ZeroDivisionError.
            n_block_col = int(np.ceil(df_duT.size * 8 * 1e-9 / self.max_ram))
            n_col = int(np.ceil(df_duT.shape[1] / n_block_col))
            # Row chunk length derived from self.max_chunk_size (the 1e-6
            # factor suggests MB -- confirm).
            nrows = int(
                m_size /
                np.ceil(m_size * n_col * 8 * 1e-6 / self.max_chunk_size))
            ind = 0  # first column of the current block
            for col in range(n_block_col):
                ATinvdf_duT = da.asarray(self.Ainv *
                                         df_duT[:, ind:ind + n_col]).rechunk(
                                             (nrows, n_col))
                dA_dmT = self.getADeriv(u_source, ATinvdf_duT, adjoint=True)
                # du_dmT = -da.from_delayed(dask.delayed(dA_dmT), shape=(self.model.size, n_col), dtype=float)
                # A single column is declared as a 1D array.
                if n_col > 1:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, n_col),
                                             dtype=float)
                else:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, ),
                                             dtype=float)
                # Each block is stored transposed in its own zarr store.
                blockName = self.sensitivity_path + "J" + str(count) + ".zarr"
                da.to_zarr((du_dmT.T).rechunk("auto"), blockName)
                del ATinvdf_duT
                count += 1
                ind += n_col

    # Re-open every block lazily, in the order it was written.
    dask_arrays = []
    for ii in range(count):
        blockName = self.sensitivity_path + "J" + str(ii) + ".zarr"
        J = da.from_zarr(blockName)
        # Stack all the source blocks in one big zarr
        dask_arrays.append(J)

    rowChunk, colChunk = compute_chunk_sizes(self.survey.nD, m_size,
                                             self.max_chunk_size)
    self._Jmatrix = da.vstack(dask_arrays).rechunk((rowChunk, colChunk))
    self.Ainv.clean()

    return self._Jmatrix
import h5py
from glob import glob
import os
from contextlib import ExitStack

import dask.array as da

# Downsample the '/t2m' dataset of every weather file by two along its last
# two axes and store the stacked result as a single zarr array.
filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))

# The dask graph reads from the HDF5 files lazily, so every file has to stay
# open until the write has finished; ExitStack then closes them all, which
# the original never did.
with ExitStack() as open_files:
    dsets = [
        open_files.enter_context(h5py.File(filename, mode='r'))['/t2m']
        for filename in filenames
    ]
    arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]

    x = da.stack(arrays, axis=0)
    result = x[:, ::2, ::2]

    da.to_zarr(result, os.path.join('data', 'myfile.zarr'), overwrite=True)
def dask_getJ(self, m, f=None):
    """
    Generate Full sensitivity matrix

    When ``self._mini_survey`` is set, the matrix is obtained from
    ``self._Jtvec`` and wrapped as a dask array. Otherwise it is built block
    by block, each block being written to its own zarr store
    ``<sensitivity_path>J<k>.zarr``; the stores are then re-opened lazily,
    stacked vertically and rechunked. The result is cached on
    ``self._Jmatrix``.

    Parameters
    ----------
    m
        Model, passed to ``self.fields`` / ``self._Jtvec``.
    f
        Fields; computed with ``self.fields(m)`` when ``None``.

    Returns
    -------
    The cached ``self._Jmatrix`` if present, otherwise the newly assembled
    dask array.
    """
    # Reuse the cached matrix when one exists.
    if self._Jmatrix is not None:
        return self._Jmatrix

    if f is None:
        f = self.fields(m)

    if self.verbose:
        print("Calculating J and storing")

    if self._mini_survey is not None:
        # Need to use _Jtvec for this operation currently...
        J = self._Jtvec(m=m, v=None, f=f).T
        self._Jmatrix = da.from_array(J)
        return self._Jmatrix

    # Start from an empty sensitivity directory.
    if os.path.exists(self.sensitivity_path):
        shutil.rmtree(self.sensitivity_path, ignore_errors=True)

        # Wait for the system to clear out the directory
        # NOTE(review): busy-waits without sleeping or a timeout; because
        # rmtree ignores errors, a failed delete would spin here forever.
        while os.path.exists(self.sensitivity_path):
            pass

    # NOTE(review): self.model is read here but not assigned from `m` in
    # this method -- presumably set elsewhere (e.g. by self.fields); confirm.
    m_size = self.model.size
    count = 0  # number of zarr blocks written so far
    for source in self.survey.source_list:
        u_source = f[source, self._solutionType]
        for rx in source.receiver_list:
            # wrt f, need possibility wrt m
            PTv = rx.evalDeriv(source, self.mesh, f).toarray().T

            # Derivative function is looked up on the fields object by the
            # receiver's projected field name.
            df_duTFun = getattr(f, "_{0!s}Deriv".format(rx.projField), None)
            df_duT, df_dmT = df_duTFun(source, None, PTv, adjoint=True)

            # Find a block of receivers
            # Number of column blocks so that one block of 8-byte values
            # stays within self.max_ram (the 1e-9 factor suggests GB --
            # confirm the unit of max_ram).
            # NOTE(review): if df_duT.size is 0, n_block_col becomes 0 and
            # the following division raises ZeroDivisionError.
            n_block_col = int(np.ceil(df_duT.size * 8 * 1e-9 / self.max_ram))
            n_col = int(np.ceil(df_duT.shape[1] / n_block_col))
            # Row chunk length derived from self.max_chunk_size (the 1e-6
            # factor suggests MB -- confirm).
            nrows = int(
                m_size /
                np.ceil(m_size * n_col * 8 * 1e-6 / self.max_chunk_size))
            ind = 0  # first column of the current block
            for col in range(n_block_col):
                ATinvdf_duT = da.asarray(self.Ainv *
                                         df_duT[:, ind:ind + n_col]).rechunk(
                                             (nrows, n_col))
                dA_dmT = self.getADeriv(u_source, ATinvdf_duT, adjoint=True)
                dRHS_dmT = self.getRHSDeriv(source, ATinvdf_duT, adjoint=True)
                # A single column is declared as a 1D array.
                if n_col > 1:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, n_col),
                                             dtype=float)
                else:
                    du_dmT = da.from_delayed(dask.delayed(-dA_dmT),
                                             shape=(m_size, ),
                                             dtype=float)
                # Add the optional right-hand-side and field derivative
                # terms unless they are the symbolic Zero.
                # NOTE(review): these terms are always declared 2D
                # (m_size, n_col), while du_dmT is declared 1D when
                # n_col == 1 -- confirm the shapes agree in that case.
                if not isinstance(dRHS_dmT, Zero):
                    du_dmT += da.from_delayed(dask.delayed(dRHS_dmT),
                                              shape=(m_size, n_col),
                                              dtype=float)
                if not isinstance(df_dmT, Zero):
                    du_dmT += da.from_delayed(df_dmT,
                                              shape=(m_size, n_col),
                                              dtype=float)
                # Each block is stored transposed in its own zarr store.
                blockName = self.sensitivity_path + "J" + str(count) + ".zarr"
                da.to_zarr((du_dmT.T).rechunk("auto"), blockName)
                del ATinvdf_duT
                count += 1
                ind += n_col

    # Re-open every block lazily, in the order it was written.
    dask_arrays = []
    for ii in range(count):
        blockName = self.sensitivity_path + "J" + str(ii) + ".zarr"
        J = da.from_zarr(blockName)
        # Stack all the source blocks in one big zarr
        dask_arrays.append(J)

    rowChunk, colChunk = compute_chunk_sizes(self.survey.nD, m_size,
                                             self.max_chunk_size)
    self._Jmatrix = da.vstack(dask_arrays).rechunk((rowChunk, colChunk))
    self.Ainv.clean()

    return self._Jmatrix
def write_zarr(uri, data, internal_path="/"):
    """Rechunk ``data`` automatically, write it to zarr and return ``uri``.

    The array is stored at ``uri`` under the component ``internal_path``;
    anything already present there is overwritten.
    """
    auto_chunked = data.rechunk("auto")
    da.to_zarr(auto_chunked, uri, component=internal_path, overwrite=True)
    return uri
def local_affine_to_position_field(shape, spacing, local_affines, output,
                                   blocksize=[
                                       256,
                                   ] * 3):
    """Turn per-block affines into one position field written to zarr.

    Every entry of ``local_affines`` (chunked as ``(1, 1, 1, 3, 4)``) is
    evaluated as a displacement on a grid the size of one block plus a fixed
    overlap. The per-block fields are stitched, cropped to ``shape``,
    converted to positions by adding the scaled position grid, rounded to
    two decimals and written to ``output``.

    Returns the zarr array opened at ``output``.
    """
    with distributed.distributedState() as ds:

        # one job per block
        blocks_per_axis = np.ceil(np.array(shape) / blocksize).astype(int)
        n_jobs = np.prod(blocks_per_axis)

        # set up the cluster
        ds.initializeLSFCluster(
            job_extra=["-P multifish"],
            cores=4,
            memory="64GB",
            ncpus=4,
            threads_per_worker=8,
            mem=64000,
        )
        ds.initializeClient()
        ds.scaleCluster(njobs=n_jobs)

        # block size grown by an overlap of 1/8 block on either side
        padded_block = np.array(blocksize) + [
            2 * int(round(edge / 8)) for edge in blocksize
        ]

        # the grid every affine is evaluated on
        block_grid = position_grid_dask(padded_block, list(padded_block))
        block_grid = block_grid * spacing.astype(np.float32)

        # one affine per dask block
        affines_da = da.from_array(local_affines, chunks=(1, 1, 1, 3, 4))

        # lazy displacement field for every block
        field = da.map_blocks(
            affine_to_grid_dask,
            affines_da,
            grid=block_grid,
            displacement=True,
            new_axis=[5, 6],
            chunks=(1, 1, 1) + tuple(block_grid.shape),
            dtype=np.float32,
        )

        # stitch the per-block fields and crop to the requested shape
        field = stitch.stitch_fields(field, blocksize)
        field = field[:shape[0], :shape[1], :shape[2]]

        # displacement -> position, then round
        field = field + position_grid_dask(
            shape, blocksize) * spacing.astype(np.float32)
        field = da.around(field, decimals=2)

        # write in parallel to zarr file
        field_disk = zarr.open(
            output,
            'w',
            shape=field.shape,
            chunks=tuple(blocksize + [3]),
            dtype=field.dtype,
            compressor=Blosc(cname='zstd', clevel=9,
                             shuffle=Blosc.BITSHUFFLE),
        )
        da.to_zarr(field, field_disk)

        # return pointer to zarr file
        return field_disk
def tiled_deformable_align(
        fixed,
        moving,
        fixed_spacing,
        moving_spacing,
        blocksize,
        transpose=[False] * 2,
        global_affine=None,
        local_affines=None,
        write_path=None,
        lazy=True,
        deform_kwargs={},
):
    """Deformably align ``moving`` to ``fixed`` block by block.

    Parameters
    ----------
    fixed, moving
        Images; wrapped as dask arrays.
    fixed_spacing, moving_spacing
        Forwarded to ``deformable_align`` (and ``fixed_spacing`` to the
        affine position field).
    blocksize
        Block size per axis; any sequence of three ints.
    transpose
        One flag each for ``fixed`` and ``moving``; a set flag reverses the
        axes of that image.
    global_affine, local_affines
        Optional affines. When either is given, an affine position field is
        built and added to the deformation; when only ``global_affine`` is
        given, identity local affines are created for every block.
    write_path
        When given, the final field is also written to a zarr array there.
    lazy
        ``True`` returns the dask array; ``False`` returns it computed.
    deform_kwargs
        Extra keyword arguments for ``deformable_align``; not modified.

    Returns
    -------
    The final position field, lazy or computed according to ``lazy``.
    """
    # get true field shape
    original_shape = fixed.shape
    if transpose[0]:
        original_shape = original_shape[::-1]

    # get affine position field
    affine_pf = None
    if global_affine is not None or local_affines is not None:
        if local_affines is None:
            # One identity affine per block. The grid has to be an integer
            # tuple to serve as a shape; the original added a float ndarray
            # to a tuple here, which cannot work.
            block_grid = np.ceil(
                np.array(original_shape) / blocksize).astype(int)
            local_affines = np.empty(
                tuple(block_grid) + (3, 4),
                dtype=np.float32,
            )
            local_affines[..., :, :] = np.eye(4)[:3, :]
        affine_pf = transform.local_affines_to_position_field(
            original_shape,
            fixed_spacing,
            blocksize,
            local_affines,
            global_affine=global_affine,
            lazy=True,
        )

    # NOTE(review): no cluster is started here (the cluster handling was
    # commented out in the original); computation presumably runs on
    # whatever dask scheduler is active -- confirm with callers.

    # wrap images as dask arrays
    fixed_da = da.from_array(fixed)
    moving_da = da.from_array(moving)

    # in case xyz convention is flipped for input file
    if transpose[0]:
        fixed_da = fixed_da.transpose(2, 1, 0)
    if transpose[1]:
        moving_da = moving_da.transpose(2, 1, 0)

    # pad the ends to fill in the last blocks
    pads = [(0, y - x % y) if x % y > 0 else (0, 0)
            for x, y in zip(original_shape, blocksize)]
    fixed_da = da.pad(fixed_da, pads)
    moving_da = da.pad(moving_da, pads)

    # chunk to blocksize
    fixed_da = fixed_da.rechunk(tuple(blocksize))
    moving_da = moving_da.rechunk(tuple(blocksize))

    # wrap deformable function
    def wrapped_deformable_align(x, y):
        warp = deformable_align(
            x,
            y,
            fixed_spacing,
            moving_spacing,
            **deform_kwargs,
        )
        # three leading unit axes index the block within the block grid
        return warp.reshape((1, 1, 1) + warp.shape)

    # deform all chunks
    overlaps = tuple(int(round(x / 8)) for x in blocksize)
    out_blocks = [1, 1, 1]
    out_blocks += [x + 2 * y for x, y in zip(blocksize, overlaps)]
    out_blocks += [3]

    warps = da.map_overlap(
        wrapped_deformable_align,
        fixed_da,
        moving_da,
        depth=overlaps,
        boundary=0,
        trim=False,
        align_arrays=False,
        dtype=np.float32,
        new_axis=[3, 4, 5, 6],
        chunks=out_blocks,
    )

    # stitch neighboring displacement fields
    warps = stitch.stitch_fields(warps, blocksize)

    # crop any pads
    warps = warps[:original_shape[0], :original_shape[1], :original_shape[2]]

    # TODO refactor transform.compose_position_fields
    #      replace this approximation
    # compose with affine position field
    if affine_pf is not None:
        final_field = affine_pf + warps
    else:
        final_field = warps + transform.position_grid_dask(
            original_shape,
            blocksize,
        )

    # if user wants to write to disk
    if write_path is not None:
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        final_field_disk = zarr.open(
            write_path,
            'w',
            shape=final_field.shape,
            # tuple concatenation works for list and tuple block sizes alike
            chunks=tuple(blocksize) + (3, ),
            dtype=final_field.dtype,
            compressor=compressor,
        )
        da.to_zarr(final_field, final_field_disk)

    # compute and return the full field, or hand back the lazy graph
    if not lazy:
        return final_field.compute()
    return final_field
import numpy as np
import h5py
import dask.array as da

# Copy the raw volume and the neuron-id labels of one sample from HDF5 into
# two zarr stores, chunked one slice at a time.
filename = '/Users/pbw/data/sample_A/sample_A_20160501.hdf'

# Both volumes are read fully into memory, so the file can be closed again
# right away; the original left it open.
with h5py.File(filename, 'r') as source_data:
    raw = np.asarray(source_data['volumes/raw'])
    labels = np.asarray(source_data['volumes/labels/neuron_ids'])

raw_dask = da.from_array(raw, chunks=(1, 1250, 1250))
da.to_zarr(raw_dask, 'raw.zarr')

labels_dask = da.from_array(labels, chunks=(1, 1250, 1250))
da.to_zarr(labels_dask, 'labels.zarr')
def deformable_align_distributed(
    fixed,
    moving,
    fixed_vox,
    moving_vox,
    write_path,
    cc_radius,
    gradient_smoothing,
    field_smoothing,
    iterations,
    shrink_factors,
    smooth_sigmas,
    step,
    blocksize=[256,]*3,
    cluster_extra=["-P multifish"],
    transpose=False,
):
    """Deformably align ``moving`` to ``fixed`` block by block on a cluster.

    Each pair of overlapping blocks is passed to ``deformable_align``; the
    resulting displacement fields are stitched, cropped to the (possibly
    transposed) shape of ``fixed``, converted to a position field and
    written to ``write_path``.

    Parameters
    ----------
    fixed, moving
        Images; wrapped as dask arrays.
    fixed_vox, moving_vox, cc_radius, gradient_smoothing, field_smoothing,
    iterations, shrink_factors, smooth_sigmas, step
        Forwarded positionally to ``deformable_align``.
    write_path
        Destination handed to ``zarr.open`` in ``'w'`` mode.
    blocksize
        Block size per axis (a list of three ints).
    cluster_extra
        Passed as ``job_extra`` when the LSF cluster is initialised.
    transpose
        When true, the axes of ``fixed`` are reversed.
        NOTE(review): ``moving`` is not transposed here -- confirm that
        this is intended.

    Returns
    -------
    The zarr array opened at ``write_path``.
    """
    # distributed computations done in cluster context
    with distributed.distributedState() as ds:

        # One job per block. Cast to int like the sibling functions do, so
        # the cluster is asked for a whole number of jobs rather than a
        # numpy float.
        block_grid = np.ceil(np.array(fixed.shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(
            job_extra=cluster_extra,
            cores=4,
            memory="64GB",
            ncpus=4,
            threads_per_worker=8,
            mem=64000,
        )
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # wrap images as dask arrays
        fixed_da = da.from_array(fixed)
        moving_da = da.from_array(moving)

        # in case xyz convention is flipped for input file
        if transpose:
            fixed_da = fixed_da.transpose(2, 1, 0)

        # pad the ends to fill in the last blocks
        orig_sh = fixed_da.shape
        pads = [(0, y - x % y) if x % y != 0 else (0, 0)
                for x, y in zip(orig_sh, blocksize)]
        fixed_da = da.pad(fixed_da, pads)
        moving_da = da.pad(moving_da, pads)
        fixed_da = fixed_da.rechunk(tuple(blocksize))
        moving_da = moving_da.rechunk(tuple(blocksize))

        # wrap deformable function to simplify passing parameters
        def my_deformable_align(x, y):
            return deformable_align(
                x, y, fixed_vox, moving_vox,
                cc_radius, gradient_smoothing, field_smoothing,
                iterations, shrink_factors, smooth_sigmas, step,
            )

        # deform all chunks
        overlaps = tuple(int(round(x / 8)) for x in blocksize)
        out_blocks = [1, 1, 1]
        out_blocks += [x + 2 * y for x, y in zip(blocksize, overlaps)]
        out_blocks += [3]

        warps = da.map_overlap(
            my_deformable_align, fixed_da, moving_da,
            depth=overlaps,
            boundary='reflect',
            trim=False,
            align_arrays=False,
            dtype=np.float32,
            new_axis=[3, 4, 5, 6],
            chunks=out_blocks,
        )

        # stitch neighboring displacement fields
        warps = stitch.stitch_fields(warps, blocksize)

        # crop any pads
        warps = warps[:orig_sh[0], :orig_sh[1], :orig_sh[2]]

        # convert to position field
        warps = warps + transform.position_grid_dask(orig_sh, blocksize)

        # write result to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        warps_disk = zarr.open(
            write_path, 'w',
            shape=warps.shape,
            chunks=tuple(blocksize + [3]),
            dtype=warps.dtype,
            compressor=compressor,
        )
        da.to_zarr(warps, warps_disk)

        # return reference to zarr data store
        return warps_disk
def test_zarr_functionament(self):
    """Round-trip dask arrays through zarr and exercise zarr appends.

    Three round trips are checked: a plain array of known shape, and two
    boolean-filtered arrays (of unknown shape until their chunk sizes are
    computed). Finally a zarr dataset is grown by appending chunks of
    different lengths; that part makes no assertion.
    """

    def check_roundtrip(values, keep_above=None):
        lazy = da.from_array(values)
        expected = values
        if keep_above is not None:
            # boolean indexing leaves the dask shape unknown
            lazy = lazy[lazy > keep_above]
            expected = values[values > keep_above]

        with TemporaryDirectory() as tmpdir:
            if keep_above is not None:
                lazy.compute_chunk_sizes()
            write_task = da.to_zarr(lazy, url=tmpdir, compute=False,
                                    component='/data')
            dask.compute(write_task)
            stored = zarr.open_group(tmpdir, mode='r')
            assert np.all(expected == stored.data[:])

    # with shape, then twice without shape (growing input size)
    for size, keep_above in ((1000, None), (1000000, 5), (10000000, 5)):
        check_roundtrip(np.random.randint(1, 10, size=size), keep_above)

    # write_chunks
    pieces = [np.full(n, (i,)) for i, n in enumerate((1, 2, 3))]

    with TemporaryDirectory() as tmpdir:
        store = zarr.DirectoryStore(tmpdir)
        root = zarr.group(store=store, overwrite=True)
        dataset = root.create_dataset('test', shape=(0,),
                                      chunks=pieces[0].shape,
                                      dtype=pieces[0].dtype)
        for piece in pieces:
            dataset.append(piece)
def _keep_single_index_value(src_ds, level, logger):
    """Restrict `src_ds` in place to a single value of index level `level`,
    prompting the user when more than one value is present."""
    try:
        values = src_ds.index.get_level_values(level).unique().values
        if len(values) > 1:
            choice = prompt_options(f"Please select a {level}: ", values)
            src_ds.drop(
                src_ds.iloc[
                    src_ds.index.get_level_values(level) != choice
                ].index,
                inplace=True,
            )
            logger.debug(f'found multiple {level}s, using "{choice}"')
        else:
            logger.debug(f"single-{level} dataset")
    except KeyError:
        # the index has no such level, nothing to differentiate
        logger.debug(f"not a multi-{level} dataset")


def main(src_dir, dst_dir, remap, flip, host, mip):
    """Build a preview of a tiled dataset and save it as per-layer TIFFs.

    The dataset is loaded, narrowed to one view and one channel (asking the
    user when several exist), turned into a preview array by `run`, dumped
    to a zarr store next to `dst_dir`, then reloaded from that store and
    written layer by layer through the dask client.

    Parameters
    ----------
    src_dir, remap, flip :
        Forwarded to `load_dataset`.
    dst_dir : str
        Directory that receives the `layer_XXXX.tif` files. The intermediate
        zarr store is written to `<dst_dir>.zarr`.
    host : str
        `"local"` starts a `LocalCluster` and connects to it; any other
        value is passed to `Client` as the scheduler address.
    mip :
        Forwarded to `run` as the `mip` keyword.
    """
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(
        level="DEBUG",
        fmt="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    logger = logging.getLogger(__name__)

    src_ds = load_dataset(src_dir, remap, flip)
    desc = tuple(
        f"{k}={v}"
        for k, v in zip(("x", "y", "z"), reversed(src_ds.tile_shape))
    )
    logger.info(f"tiling dimension ({', '.join(desc)})")

    _keep_single_index_value(src_ds, "view", logger)
    _keep_single_index_value(src_ds, "channel", logger)

    # preview summary
    print(src_ds.inventory)

    # A LocalCluster is not a client (it has no `submit`), so always wrap it
    # in a Client; keep a handle on the cluster so it can be shut down too.
    cluster = None
    if host == "local":
        cluster = LocalCluster()
        client = Client(cluster)
    else:
        client = Client(host)
    logger.info(client)

    try:
        # create directives
        preview = run(src_ds, mip=mip)
        logger.info(f"final preview {preview.shape}, {preview.dtype}")

        # saving the result to zarr format
        zarr_path = f"{dst_dir.rstrip(os.sep)}.zarr"
        chunks = (8, 512, 512)
        logger.info(f'generating "{os.path.basename(zarr_path)}"')
        logger.debug(
            f"shape={preview.shape}, dtype={preview.dtype}, chunks={chunks}"
        )

        logger.info("dumping to zarr directory store, waiting...")
        # rechunk outside the try so that a bad chunk spec is not mistaken
        # for "store already exists"
        preview = preview.rechunk(chunks)
        try:
            da.to_zarr(preview, zarr_path, overwrite=False)
        except ValueError:
            logger.warning("found existing zarr store, reusing it")

        logger.info("release dask array")
        del preview

        logger.info(f'saving layered preivew to "{dst_dir}"')
        try:
            os.makedirs(dst_dir)
        except FileExistsError:
            logger.warning(f'"{dst_dir}" exists')

        logger.info("reload data from zarr")
        preview = da.from_zarr(zarr_path)

        # submit one write task per layer
        futures = []
        with tqdm(total=preview.shape[0]) as pbar:
            for i, layer in enumerate(preview):
                fname = f"layer_{i+1:04d}.tif"
                pbar.set_description(fname)
                path = os.path.join(dst_dir, fname)
                futures.append(client.submit(imageio.imwrite, path, layer))
                pbar.update(1)

        # wait for the writes, logging (not raising) individual failures
        with tqdm(
            total=len(futures),
            bar_format="{l_bar}{bar:24}{r_bar}{bar:-10b}",
        ) as pbar:
            for future in as_completed(futures, with_results=False):
                try:
                    future.result()  # ensure we do not have an exception
                    pbar.update(1)
                except Exception as error:
                    logger.exception(error)
                future.release()
    finally:
        # always disconnect, even when something above failed
        logger.info("closing scheduler connection")
        client.close()
        if cluster is not None:
            cluster.close()